Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/Kconfig | 9
-rw-r--r--  fs/9p/Makefile | 3
-rw-r--r--  fs/9p/cache.c | 474
-rw-r--r--  fs/9p/cache.h | 176
-rw-r--r--  fs/9p/v9fs.c | 196
-rw-r--r--  fs/9p/v9fs.h | 13
-rw-r--r--  fs/9p/v9fs_vfs.h | 6
-rw-r--r--  fs/9p/vfs_addr.c | 88
-rw-r--r--  fs/9p/vfs_file.c | 25
-rw-r--r--  fs/9p/vfs_inode.c | 61
-rw-r--r--  fs/9p/vfs_super.c | 16
-rw-r--r--  fs/Kconfig | 1
-rw-r--r--  fs/adfs/inode.c | 7
-rw-r--r--  fs/afs/flock.c | 2
-rw-r--r--  fs/afs/proc.c | 8
-rw-r--r--  fs/afs/write.c | 1
-rw-r--r--  fs/aio.c | 57
-rw-r--r--  fs/anon_inodes.c | 68
-rw-r--r--  fs/autofs/dirhash.c | 2
-rw-r--r--  fs/befs/linuxvfs.c | 2
-rw-r--r--  fs/binfmt_elf.c | 96
-rw-r--r--  fs/binfmt_elf_fdpic.c | 73
-rw-r--r--  fs/binfmt_flat.c | 22
-rw-r--r--  fs/block_dev.c | 3
-rw-r--r--  fs/btrfs/disk-io.c | 3
-rw-r--r--  fs/btrfs/inode.c | 28
-rw-r--r--  fs/btrfs/ordered-data.c | 1
-rw-r--r--  fs/btrfs/super.c | 4
-rw-r--r--  fs/btrfs/tree-log.c | 2
-rw-r--r--  fs/buffer.c | 57
-rw-r--r--  fs/char_dev.c | 3
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 4
-rw-r--r--  fs/cifs/cifsfs.c | 4
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/coda/coda_int.h | 1
-rw-r--r--  fs/compat.c | 7
-rw-r--r--  fs/devpts/inode.c | 3
-rw-r--r--  fs/dlm/debug_fs.c | 12
-rw-r--r--  fs/dlm/lowcomms.c | 26
-rw-r--r--  fs/drop_caches.c | 4
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 2
-rw-r--r--  fs/ecryptfs/mmap.c | 2
-rw-r--r--  fs/eventfd.c | 67
-rw-r--r--  fs/exec.c | 125
-rw-r--r--  fs/ext2/namei.c | 2
-rw-r--r--  fs/ext2/xip.c | 2
-rw-r--r--  fs/ext3/fsync.c | 12
-rw-r--r--  fs/ext3/inode.c | 28
-rw-r--r--  fs/ext3/super.c | 4
-rw-r--r--  fs/ext4/Kconfig | 11
-rw-r--r--  fs/ext4/balloc.c | 2
-rw-r--r--  fs/ext4/ext4.h | 91
-rw-r--r--  fs/ext4/ext4_extents.h | 4
-rw-r--r--  fs/ext4/ext4_jbd2.c | 9
-rw-r--r--  fs/ext4/extents.c | 112
-rw-r--r--  fs/ext4/fsync.c | 13
-rw-r--r--  fs/ext4/ialloc.c | 2
-rw-r--r--  fs/ext4/inode.c | 152
-rw-r--r--  fs/ext4/ioctl.c | 7
-rw-r--r--  fs/ext4/mballoc.c | 429
-rw-r--r--  fs/ext4/mballoc.h | 22
-rw-r--r--  fs/ext4/migrate.c | 22
-rw-r--r--  fs/ext4/move_extent.c | 334
-rw-r--r--  fs/ext4/namei.c | 22
-rw-r--r--  fs/ext4/resize.c | 7
-rw-r--r--  fs/ext4/super.c | 159
-rw-r--r--  fs/ext4/xattr.c | 15
-rw-r--r--  fs/fcntl.c | 108
-rw-r--r--  fs/file_table.c | 6
-rw-r--r--  fs/fs-writeback.c | 345
-rw-r--r--  fs/fuse/control.c | 138
-rw-r--r--  fs/fuse/dev.c | 10
-rw-r--r--  fs/fuse/fuse_i.h | 18
-rw-r--r--  fs/fuse/inode.c | 82
-rw-r--r--  fs/gfs2/ops_inode.c | 1
-rw-r--r--  fs/gfs2/rgrp.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 15
-rw-r--r--  fs/inode.c | 39
-rw-r--r--  fs/jbd/checkpoint.c | 6
-rw-r--r--  fs/jbd/commit.c | 2
-rw-r--r--  fs/jbd/journal.c | 30
-rw-r--r--  fs/jbd/recovery.c | 18
-rw-r--r--  fs/jbd/revoke.c | 16
-rw-r--r--  fs/jbd/transaction.c | 9
-rw-r--r--  fs/jbd2/commit.c | 12
-rw-r--r--  fs/jbd2/journal.c | 10
-rw-r--r--  fs/jbd2/transaction.c | 7
-rw-r--r--  fs/jffs2/background.c | 20
-rw-r--r--  fs/jffs2/malloc.c | 4
-rw-r--r--  fs/jffs2/super.c | 2
-rw-r--r--  fs/lockd/clntlock.c | 2
-rw-r--r--  fs/lockd/clntproc.c | 2
-rw-r--r--  fs/lockd/host.c | 4
-rw-r--r--  fs/lockd/mon.c | 2
-rw-r--r--  fs/lockd/svclock.c | 2
-rw-r--r--  fs/lockd/svcsubs.c | 2
-rw-r--r--  fs/lockd/xdr.c | 1
-rw-r--r--  fs/lockd/xdr4.c | 1
-rw-r--r--  fs/locks.c | 2
-rw-r--r--  fs/minix/dir.c | 22
-rw-r--r--  fs/ncpfs/dir.c | 2
-rw-r--r--  fs/ncpfs/ioctl.c | 2
-rw-r--r--  fs/nfs/callback_xdr.c | 2
-rw-r--r--  fs/nfs/client.c | 27
-rw-r--r--  fs/nfs/fscache.c | 25
-rw-r--r--  fs/nfs/fscache.h | 6
-rw-r--r--  fs/nfs/nfs2xdr.c | 1
-rw-r--r--  fs/nfs/nfs3proc.c | 1
-rw-r--r--  fs/nfs/nfs3xdr.c | 1
-rw-r--r--  fs/nfs/nfs4proc.c | 1
-rw-r--r--  fs/nfs/nfs4state.c | 2
-rw-r--r--  fs/nfs/nfs4xdr.c | 1
-rw-r--r--  fs/nfs/proc.c | 1
-rw-r--r--  fs/nfs/super.c | 80
-rw-r--r--  fs/nfs/write.c | 1
-rw-r--r--  fs/nfsd/export.c | 4
-rw-r--r--  fs/nfsd/nfs3xdr.c | 75
-rw-r--r--  fs/nfsd/nfs4acl.c | 4
-rw-r--r--  fs/nfsd/nfs4callback.c | 263
-rw-r--r--  fs/nfsd/nfs4idmap.c | 1
-rw-r--r--  fs/nfsd/nfs4proc.c | 89
-rw-r--r--  fs/nfsd/nfs4state.c | 685
-rw-r--r--  fs/nfsd/nfs4xdr.c | 42
-rw-r--r--  fs/nfsd/nfsctl.c | 8
-rw-r--r--  fs/nfsd/nfsfh.c | 158
-rw-r--r--  fs/nfsd/nfssvc.c | 54
-rw-r--r--  fs/nfsd/vfs.c | 9
-rw-r--r--  fs/nilfs2/btnode.c | 2
-rw-r--r--  fs/nilfs2/file.c | 2
-rw-r--r--  fs/nilfs2/gcinode.c | 2
-rw-r--r--  fs/nilfs2/inode.c | 2
-rw-r--r--  fs/nilfs2/mdt.c | 4
-rw-r--r--  fs/nilfs2/namei.c | 6
-rw-r--r--  fs/nilfs2/nilfs.h | 10
-rw-r--r--  fs/nilfs2/super.c | 4
-rw-r--r--  fs/nilfs2/the_nilfs.c | 4
-rw-r--r--  fs/ntfs/file.c | 42
-rw-r--r--  fs/ntfs/layout.h | 2
-rw-r--r--  fs/ntfs/malloc.h | 2
-rw-r--r--  fs/ocfs2/Makefile | 1
-rw-r--r--  fs/ocfs2/alloc.c | 1342
-rw-r--r--  fs/ocfs2/alloc.h | 101
-rw-r--r--  fs/ocfs2/aops.c | 37
-rw-r--r--  fs/ocfs2/aops.h | 2
-rw-r--r--  fs/ocfs2/buffer_head_io.c | 47
-rw-r--r--  fs/ocfs2/buffer_head_io.h | 8
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 1
-rw-r--r--  fs/ocfs2/cluster/masklog.h | 1
-rw-r--r--  fs/ocfs2/cluster/netdebug.c | 4
-rw-r--r--  fs/ocfs2/dir.c | 107
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 1
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 7
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 1
-rw-r--r--  fs/ocfs2/dlmglue.c | 105
-rw-r--r--  fs/ocfs2/dlmglue.h | 6
-rw-r--r--  fs/ocfs2/extent_map.c | 33
-rw-r--r--  fs/ocfs2/extent_map.h | 8
-rw-r--r--  fs/ocfs2/file.c | 151
-rw-r--r--  fs/ocfs2/file.h | 2
-rw-r--r--  fs/ocfs2/inode.c | 86
-rw-r--r--  fs/ocfs2/inode.h | 20
-rw-r--r--  fs/ocfs2/ioctl.c | 14
-rw-r--r--  fs/ocfs2/journal.c | 82
-rw-r--r--  fs/ocfs2/journal.h | 94
-rw-r--r--  fs/ocfs2/localalloc.c | 12
-rw-r--r--  fs/ocfs2/namei.c | 341
-rw-r--r--  fs/ocfs2/namei.h | 6
-rw-r--r--  fs/ocfs2/ocfs2.h | 52
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 107
-rw-r--r--  fs/ocfs2/ocfs2_lockid.h | 5
-rw-r--r--  fs/ocfs2/quota.h | 2
-rw-r--r--  fs/ocfs2/quota_global.c | 9
-rw-r--r--  fs/ocfs2/quota_local.c | 26
-rw-r--r--  fs/ocfs2/refcounttree.c | 4313
-rw-r--r--  fs/ocfs2/refcounttree.h | 106
-rw-r--r--  fs/ocfs2/resize.c | 16
-rw-r--r--  fs/ocfs2/slot_map.c | 10
-rw-r--r--  fs/ocfs2/suballoc.c | 35
-rw-r--r--  fs/ocfs2/super.c | 16
-rw-r--r--  fs/ocfs2/symlink.c | 1
-rw-r--r--  fs/ocfs2/uptodate.c | 265
-rw-r--r--  fs/ocfs2/uptodate.h | 51
-rw-r--r--  fs/ocfs2/xattr.c | 2056
-rw-r--r--  fs/ocfs2/xattr.h | 15
-rw-r--r--  fs/omfs/dir.c | 2
-rw-r--r--  fs/omfs/file.c | 4
-rw-r--r--  fs/omfs/inode.c | 2
-rw-r--r--  fs/omfs/omfs.h | 6
-rw-r--r--  fs/open.c | 5
-rw-r--r--  fs/partitions/check.c | 4
-rw-r--r--  fs/proc/array.c | 85
-rw-r--r--  fs/proc/base.c | 67
-rw-r--r--  fs/proc/kcore.c | 335
-rw-r--r--  fs/proc/meminfo.c | 4
-rw-r--r--  fs/proc/nommu.c | 2
-rw-r--r--  fs/proc/page.c | 5
-rw-r--r--  fs/proc/proc_sysctl.c | 2
-rw-r--r--  fs/proc/task_mmu.c | 57
-rw-r--r--  fs/qnx4/Kconfig | 11
-rw-r--r--  fs/qnx4/Makefile | 2
-rw-r--r--  fs/qnx4/bitmap.c | 81
-rw-r--r--  fs/qnx4/dir.c | 5
-rw-r--r--  fs/qnx4/file.c | 40
-rw-r--r--  fs/qnx4/inode.c | 84
-rw-r--r--  fs/qnx4/namei.c | 105
-rw-r--r--  fs/qnx4/qnx4.h | 8
-rw-r--r--  fs/qnx4/truncate.c | 34
-rw-r--r--  fs/quota/dquot.c | 4
-rw-r--r--  fs/ramfs/inode.c | 4
-rw-r--r--  fs/reiserfs/super.c | 4
-rw-r--r--  fs/romfs/super.c | 4
-rw-r--r--  fs/select.c | 14
-rw-r--r--  fs/smbfs/proc.c | 2
-rw-r--r--  fs/squashfs/super.c | 4
-rw-r--r--  fs/super.c | 8
-rw-r--r--  fs/sync.c | 10
-rw-r--r--  fs/ubifs/budget.c | 22
-rw-r--r--  fs/ubifs/commit.c | 2
-rw-r--r--  fs/ubifs/debug.c | 112
-rw-r--r--  fs/ubifs/debug.h | 5
-rw-r--r--  fs/ubifs/file.c | 62
-rw-r--r--  fs/ubifs/gc.c | 2
-rw-r--r--  fs/ubifs/io.c | 29
-rw-r--r--  fs/ubifs/journal.c | 13
-rw-r--r--  fs/ubifs/key.h | 35
-rw-r--r--  fs/ubifs/log.c | 17
-rw-r--r--  fs/ubifs/lprops.c | 43
-rw-r--r--  fs/ubifs/master.c | 20
-rw-r--r--  fs/ubifs/orphan.c | 7
-rw-r--r--  fs/ubifs/recovery.c | 4
-rw-r--r--  fs/ubifs/replay.c | 6
-rw-r--r--  fs/ubifs/scan.c | 32
-rw-r--r--  fs/ubifs/super.c | 34
-rw-r--r--  fs/ubifs/tnc.c | 76
-rw-r--r--  fs/ubifs/tnc_commit.c | 2
-rw-r--r--  fs/ubifs/ubifs-media.h | 7
-rw-r--r--  fs/ubifs/ubifs.h | 13
-rw-r--r--  fs/ubifs/xattr.c | 6
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 19
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c | 8
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_stats.c | 51
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 28
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 15
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.h | 1
-rw-r--r--  fs/xfs/linux-2.6/xfs_sysctl.c | 3
-rw-r--r--  fs/xfs/quota/xfs_qm_stats.c | 78
-rw-r--r--  fs/xfs/xfs_ag.h | 9
-rw-r--r--  fs/xfs/xfs_bmap.c | 2
-rw-r--r--  fs/xfs/xfs_bmap.h | 11
-rw-r--r--  fs/xfs/xfs_bmap_btree.c | 20
-rw-r--r--  fs/xfs/xfs_bmap_btree.h | 1
-rw-r--r--  fs/xfs/xfs_btree.c | 42
-rw-r--r--  fs/xfs/xfs_btree.h | 15
-rw-r--r--  fs/xfs/xfs_fs.h | 2
-rw-r--r--  fs/xfs/xfs_ialloc.c | 805
-rw-r--r--  fs/xfs/xfs_ialloc.h | 18
-rw-r--r--  fs/xfs/xfs_iget.c | 27
-rw-r--r--  fs/xfs/xfs_inode.c | 8
-rw-r--r--  fs/xfs/xfs_inode.h | 8
-rw-r--r--  fs/xfs/xfs_inode_item.c | 10
-rw-r--r--  fs/xfs/xfs_inode_item.h | 2
-rw-r--r--  fs/xfs/xfs_inum.h | 1
-rw-r--r--  fs/xfs/xfs_itable.c | 98
-rw-r--r--  fs/xfs/xfs_itable.h | 5
-rw-r--r--  fs/xfs/xfs_log_priv.h | 2
-rw-r--r--  fs/xfs/xfs_log_recover.c | 2
-rw-r--r--  fs/xfs/xfs_mount.c | 2
-rw-r--r--  fs/xfs/xfs_mount.h | 3
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 29
-rw-r--r--  fs/xfs/xfs_mru_cache.h | 1
-rw-r--r--  fs/xfs/xfs_rw.c | 84
-rw-r--r--  fs/xfs/xfs_rw.h | 7
-rw-r--r--  fs/xfs/xfs_trans.h | 2
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 4
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 86
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 17
286 files changed, 13782 insertions, 4810 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 74e0723e90bc..795233702a4e 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -8,3 +8,12 @@ config 9P_FS
 	  See <http://v9fs.sf.net> for more information.
 
 	  If unsure, say N.
+
+config 9P_FSCACHE
+	bool "Enable 9P client caching support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y
+	help
+	  Choose Y here to enable persistent, read-only local
+	  caching support for 9p clients using FS-Cache
+
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index bc7f0d1551e6..1a940ec7af61 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -8,5 +8,6 @@ obj-$(CONFIG_9P_FS) := 9p.o
 	vfs_dir.o \
 	vfs_dentry.o \
 	v9fs.o \
-	fid.o \
+	fid.o
 
+9p-$(CONFIG_9P_FSCACHE) += cache.o
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
new file mode 100644
index 000000000000..51c94e26a346
--- /dev/null
+++ b/fs/9p/cache.c
@@ -0,0 +1,474 @@
+/*
+ * V9FS cache definitions.
+ *
+ * Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#include <linux/jiffies.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <net/9p/9p.h>
+
+#include "v9fs.h"
+#include "cache.h"
+
+#define CACHETAG_LEN  11
+
+struct kmem_cache *vcookie_cache;
+
+struct fscache_netfs v9fs_cache_netfs = {
+	.name		= "9p",
+	.version	= 0,
+};
+
+static void init_once(void *foo)
+{
+	struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo;
+	vcookie->fscache = NULL;
+	vcookie->qid = NULL;
+	inode_init_once(&vcookie->inode);
+}
+
+/**
+ * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain
+ *			    vcookie to inode mapping
+ *
+ * Returns 0 on success.
+ */
+
+static int v9fs_init_vcookiecache(void)
+{
+	vcookie_cache = kmem_cache_create("vcookie_cache",
+					  sizeof(struct v9fs_cookie),
+					  0, (SLAB_RECLAIM_ACCOUNT|
+					      SLAB_MEM_SPREAD),
+					  init_once);
+	if (!vcookie_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * v9fs_destroy_vcookiecache - destroy the cache of vcookies
+ *
+ */
+
+static void v9fs_destroy_vcookiecache(void)
+{
+	kmem_cache_destroy(vcookie_cache);
+}
+
+int __v9fs_cache_register(void)
+{
+	int ret;
+	ret = v9fs_init_vcookiecache();
+	if (ret < 0)
+		return ret;
+
+	return fscache_register_netfs(&v9fs_cache_netfs);
+}
+
+void __v9fs_cache_unregister(void)
+{
+	v9fs_destroy_vcookiecache();
+	fscache_unregister_netfs(&v9fs_cache_netfs);
+}
+
+/**
+ * v9fs_random_cachetag - Generate a random tag to be associated
+ *			  with a new cache session.
+ *
+ * The value of jiffies is used for a fairly randomly cache tag.
+ */
+
+static
+int v9fs_random_cachetag(struct v9fs_session_info *v9ses)
+{
+	v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL);
+	if (!v9ses->cachetag)
+		return -ENOMEM;
+
+	return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies);
+}
+
+static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
+					   void *buffer, uint16_t bufmax)
+{
+	struct v9fs_session_info *v9ses;
+	uint16_t klen = 0;
+
+	v9ses = (struct v9fs_session_info *)cookie_netfs_data;
+	P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses,
+		   buffer, bufmax);
+
+	if (v9ses->cachetag)
+		klen = strlen(v9ses->cachetag);
+
+	if (klen > bufmax)
+		return 0;
+
+	memcpy(buffer, v9ses->cachetag, klen);
+	P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag);
+	return klen;
+}
+
+const struct fscache_cookie_def v9fs_cache_session_index_def = {
+	.name		= "9P.session",
+	.type		= FSCACHE_COOKIE_TYPE_INDEX,
+	.get_key	= v9fs_cache_session_get_key,
+};
+
+void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
+{
+	/* If no cache session tag was specified, we generate a random one. */
+	if (!v9ses->cachetag)
+		v9fs_random_cachetag(v9ses);
+
+	v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
+						&v9fs_cache_session_index_def,
+						v9ses);
+	P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses,
+		   v9ses->fscache);
+}
+
+void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
+{
+	P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses,
+		   v9ses->fscache);
+	fscache_relinquish_cookie(v9ses->fscache, 0);
+	v9ses->fscache = NULL;
+}
+
+
+static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
+					 void *buffer, uint16_t bufmax)
+{
+	const struct v9fs_cookie *vcookie = cookie_netfs_data;
+	memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path));
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode,
+		   vcookie->qid->path);
+	return sizeof(vcookie->qid->path);
+}
+
+static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
+				      uint64_t *size)
+{
+	const struct v9fs_cookie *vcookie = cookie_netfs_data;
+	*size = i_size_read(&vcookie->inode);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode,
+		   *size);
+}
+
+static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
+					 void *buffer, uint16_t buflen)
+{
+	const struct v9fs_cookie *vcookie = cookie_netfs_data;
+	memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version));
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode,
+		   vcookie->qid->version);
+	return sizeof(vcookie->qid->version);
+}
+
+static enum
+fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
+					    const void *buffer,
+					    uint16_t buflen)
+{
+	const struct v9fs_cookie *vcookie = cookie_netfs_data;
+
+	if (buflen != sizeof(vcookie->qid->version))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	if (memcmp(buffer, &vcookie->qid->version,
+		   sizeof(vcookie->qid->version)))
+		return FSCACHE_CHECKAUX_OBSOLETE;
+
+	return FSCACHE_CHECKAUX_OKAY;
+}
+
+static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
+{
+	struct v9fs_cookie *vcookie = cookie_netfs_data;
+	struct pagevec pvec;
+	pgoff_t first;
+	int loop, nr_pages;
+
+	pagevec_init(&pvec, 0);
+	first = 0;
+
+	for (;;) {
+		nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping,
+					  first,
+					  PAGEVEC_SIZE - pagevec_count(&pvec));
+		if (!nr_pages)
+			break;
+
+		for (loop = 0; loop < nr_pages; loop++)
+			ClearPageFsCache(pvec.pages[loop]);
+
+		first = pvec.pages[nr_pages - 1]->index + 1;
+
+		pvec.nr = nr_pages;
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+}
+
+const struct fscache_cookie_def v9fs_cache_inode_index_def = {
+	.name		= "9p.inode",
+	.type		= FSCACHE_COOKIE_TYPE_DATAFILE,
+	.get_key	= v9fs_cache_inode_get_key,
+	.get_attr	= v9fs_cache_inode_get_attr,
+	.get_aux	= v9fs_cache_inode_get_aux,
+	.check_aux	= v9fs_cache_inode_check_aux,
+	.now_uncached	= v9fs_cache_inode_now_uncached,
+};
+
+void v9fs_cache_inode_get_cookie(struct inode *inode)
+{
+	struct v9fs_cookie *vcookie;
+	struct v9fs_session_info *v9ses;
+
+	if (!S_ISREG(inode->i_mode))
+		return;
+
+	vcookie = v9fs_inode2cookie(inode);
+	if (vcookie->fscache)
+		return;
+
+	v9ses = v9fs_inode2v9ses(inode);
+	vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
+						  &v9fs_cache_inode_index_def,
+						  vcookie);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
+		   vcookie->fscache);
+}
+
+void v9fs_cache_inode_put_cookie(struct inode *inode)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	if (!vcookie->fscache)
+		return;
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
+		   vcookie->fscache);
+
+	fscache_relinquish_cookie(vcookie->fscache, 0);
+	vcookie->fscache = NULL;
+}
+
+void v9fs_cache_inode_flush_cookie(struct inode *inode)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	if (!vcookie->fscache)
+		return;
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
+		   vcookie->fscache);
+
+	fscache_relinquish_cookie(vcookie->fscache, 1);
+	vcookie->fscache = NULL;
+}
+
+void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	struct p9_fid *fid;
+
+	if (!vcookie->fscache)
+		return;
+
+	spin_lock(&vcookie->lock);
+	fid = filp->private_data;
+	if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
+		v9fs_cache_inode_flush_cookie(inode);
+	else
+		v9fs_cache_inode_get_cookie(inode);
+
+	spin_unlock(&vcookie->lock);
+}
+
+void v9fs_cache_inode_reset_cookie(struct inode *inode)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	struct v9fs_session_info *v9ses;
+	struct fscache_cookie *old;
+
+	if (!vcookie->fscache)
+		return;
+
+	old = vcookie->fscache;
+
+	spin_lock(&vcookie->lock);
+	fscache_relinquish_cookie(vcookie->fscache, 1);
+
+	v9ses = v9fs_inode2v9ses(inode);
+	vcookie->fscache = fscache_acquire_cookie(v9ses->fscache,
+						  &v9fs_cache_inode_index_def,
+						  vcookie);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
+		   inode, old, vcookie->fscache);
+
+	spin_unlock(&vcookie->lock);
+}
+
+int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+	struct inode *inode = page->mapping->host;
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	BUG_ON(!vcookie->fscache);
+
+	if (PageFsCache(page)) {
+		if (fscache_check_page_write(vcookie->fscache, page)) {
+			if (!(gfp & __GFP_WAIT))
+				return 0;
+			fscache_wait_on_page_write(vcookie->fscache, page);
+		}
+
+		fscache_uncache_page(vcookie->fscache, page);
+		ClearPageFsCache(page);
+	}
+
+	return 1;
+}
+
+void __v9fs_fscache_invalidate_page(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	BUG_ON(!vcookie->fscache);
+
+	if (PageFsCache(page)) {
+		fscache_wait_on_page_write(vcookie->fscache, page);
+		BUG_ON(!PageLocked(page));
+		fscache_uncache_page(vcookie->fscache, page);
+		ClearPageFsCache(page);
+	}
+}
+
+static void v9fs_vfs_readpage_complete(struct page *page, void *data,
+				       int error)
+{
+	if (!error)
+		SetPageUptodate(page);
+
+	unlock_page(page);
+}
+
+/**
+ * __v9fs_readpage_from_fscache - read a page from cache
+ *
+ * Returns 0 if the pages are in cache and a BIO is submitted,
+ * 1 if the pages are not in cache and -error otherwise.
+ */
+
+int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
+{
+	int ret;
+	const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+	if (!vcookie->fscache)
+		return -ENOBUFS;
+
+	ret = fscache_read_or_alloc_page(vcookie->fscache,
+					 page,
+					 v9fs_vfs_readpage_complete,
+					 NULL,
+					 GFP_KERNEL);
+	switch (ret) {
+	case -ENOBUFS:
+	case -ENODATA:
+		P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret);
+		return 1;
+	case 0:
+		P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+		return ret;
+	default:
+		P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+		return ret;
+	}
+}
+
+/**
+ * __v9fs_readpages_from_fscache - read multiple pages from cache
+ *
+ * Returns 0 if the pages are in cache and a BIO is submitted,
+ * 1 if the pages are not in cache and -error otherwise.
+ */
+
+int __v9fs_readpages_from_fscache(struct inode *inode,
+				  struct address_space *mapping,
+				  struct list_head *pages,
+				  unsigned *nr_pages)
+{
+	int ret;
+	const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
+	if (!vcookie->fscache)
+		return -ENOBUFS;
+
+	ret = fscache_read_or_alloc_pages(vcookie->fscache,
+					  mapping, pages, nr_pages,
+					  v9fs_vfs_readpage_complete,
+					  NULL,
+					  mapping_gfp_mask(mapping));
+	switch (ret) {
+	case -ENOBUFS:
+	case -ENODATA:
+		P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret);
+		return 1;
+	case 0:
+		BUG_ON(!list_empty(pages));
+		BUG_ON(*nr_pages != 0);
+		P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+		return ret;
+	default:
+		P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+		return ret;
+	}
+}
+
+/**
+ * __v9fs_readpage_to_fscache - write a page to the cache
+ *
+ */
+
+void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
+{
+	int ret;
+	const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+
+	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+	ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL);
+	P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret);
+	if (ret != 0)
+		v9fs_uncache_page(inode, page);
+}
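
The cookie definitions above map 9P identity onto FS-Cache: qid.path is the lookup key and qid.version the auxiliary coherency datum, so v9fs_cache_inode_check_aux() treats any version mismatch as an obsolete cache object. A minimal standalone C sketch of that comparison follows; the enum, struct, and values are illustrative stand-ins, not the kernel's types.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum checkaux { CHECKAUX_OKAY, CHECKAUX_OBSOLETE };  /* stand-ins for FSCACHE_CHECKAUX_* */

struct qid { uint64_t path; uint32_t version; };     /* models struct p9_qid */

static enum checkaux check_aux(const struct qid *qid, const void *buf, uint16_t len)
{
	/* same shape as v9fs_cache_inode_check_aux(): a wrong length or a
	 * mismatched version means the cached object is stale */
	if (len != sizeof(qid->version) ||
	    memcmp(buf, &qid->version, sizeof(qid->version)))
		return CHECKAUX_OBSOLETE;
	return CHECKAUX_OKAY;
}

int main(void)
{
	struct qid q = { .path = 7, .version = 3 };
	uint32_t stored = 3;
	printf("%d\n", check_aux(&q, &stored, sizeof(stored)));  /* 0: still valid */
	stored = 4;  /* a server-side change bumped the version */
	printf("%d\n", check_aux(&q, &stored, sizeof(stored)));  /* 1: obsolete */
	return 0;
}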
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
new file mode 100644
index 000000000000..a94192bfaee8
--- /dev/null
+++ b/fs/9p/cache.h
@@ -0,0 +1,176 @@
+/*
+ * V9FS cache definitions.
+ *
+ * Copyright (C) 2009 by Abhishek Kulkarni <adkulkar@umail.iu.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to:
+ *  Free Software Foundation
+ *  51 Franklin Street, Fifth Floor
+ *  Boston, MA  02111-1301  USA
+ *
+ */
+
+#ifndef _9P_CACHE_H
+#ifdef CONFIG_9P_FSCACHE
+#include <linux/fscache.h>
+#include <linux/spinlock.h>
+
+extern struct kmem_cache *vcookie_cache;
+
+struct v9fs_cookie {
+	spinlock_t lock;
+	struct inode inode;
+	struct fscache_cookie *fscache;
+	struct p9_qid *qid;
+};
+
+static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode)
+{
+	return container_of(inode, struct v9fs_cookie, inode);
+}
+
+extern struct fscache_netfs v9fs_cache_netfs;
+extern const struct fscache_cookie_def v9fs_cache_session_index_def;
+extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
+
+extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses);
+extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses);
+
+extern void v9fs_cache_inode_get_cookie(struct inode *inode);
+extern void v9fs_cache_inode_put_cookie(struct inode *inode);
+extern void v9fs_cache_inode_flush_cookie(struct inode *inode);
+extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp);
+extern void v9fs_cache_inode_reset_cookie(struct inode *inode);
+
+extern int __v9fs_cache_register(void);
+extern void __v9fs_cache_unregister(void);
+
+extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp);
+extern void __v9fs_fscache_invalidate_page(struct page *page);
+extern int __v9fs_readpage_from_fscache(struct inode *inode,
+					struct page *page);
+extern int __v9fs_readpages_from_fscache(struct inode *inode,
+					 struct address_space *mapping,
+					 struct list_head *pages,
+					 unsigned *nr_pages);
+extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
+
+
+/**
+ * v9fs_cache_register - Register v9fs file system with the cache
+ */
+static inline int v9fs_cache_register(void)
+{
+	return __v9fs_cache_register();
+}
+
+/**
+ * v9fs_cache_unregister - Unregister v9fs from the cache
+ */
+static inline void v9fs_cache_unregister(void)
+{
+	__v9fs_cache_unregister();
+}
+
+static inline int v9fs_fscache_release_page(struct page *page,
+					    gfp_t gfp)
+{
+	return __v9fs_fscache_release_page(page, gfp);
+}
+
+static inline void v9fs_fscache_invalidate_page(struct page *page)
+{
+	__v9fs_fscache_invalidate_page(page);
+}
+
+static inline int v9fs_readpage_from_fscache(struct inode *inode,
+					     struct page *page)
+{
+	return __v9fs_readpage_from_fscache(inode, page);
+}
+
+static inline int v9fs_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	return __v9fs_readpages_from_fscache(inode, mapping, pages,
+					     nr_pages);
+}
+
+static inline void v9fs_readpage_to_fscache(struct inode *inode,
+					    struct page *page)
+{
+	if (PageFsCache(page))
+		__v9fs_readpage_to_fscache(inode, page);
+}
+
+static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	fscache_uncache_page(vcookie->fscache, page);
+	BUG_ON(PageFsCache(page));
+}
+
+static inline void v9fs_vcookie_set_qid(struct inode *inode,
+					struct p9_qid *qid)
+{
+	struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode);
+	spin_lock(&vcookie->lock);
+	vcookie->qid = qid;
+	spin_unlock(&vcookie->lock);
+}
+
+#else /* CONFIG_9P_FSCACHE */
+
+static inline int v9fs_cache_register(void)
+{
+	return 1;
+}
+
+static inline void v9fs_cache_unregister(void) {}
+
+static inline int v9fs_fscache_release_page(struct page *page,
+					    gfp_t gfp) {
+	return 1;
+}
+
+static inline void v9fs_fscache_invalidate_page(struct page *page) {}
+
+static inline int v9fs_readpage_from_fscache(struct inode *inode,
+					     struct page *page)
+{
+	return -ENOBUFS;
+}
+
+static inline int v9fs_readpages_from_fscache(struct inode *inode,
+					      struct address_space *mapping,
+					      struct list_head *pages,
+					      unsigned *nr_pages)
+{
+	return -ENOBUFS;
+}
+
+static inline void v9fs_readpage_to_fscache(struct inode *inode,
+					    struct page *page)
+{}
+
+static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
+{}
+
+static inline void v9fs_vcookie_set_qid(struct inode *inode,
+					struct p9_qid *qid)
+{}
+
+#endif /* CONFIG_9P_FSCACHE */
+#endif /* _9P_CACHE_H */
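
cache.h embeds the VFS inode inside struct v9fs_cookie and recovers the wrapper with container_of(), which is why v9fs_alloc_inode() (added in vfs_inode.c below) can hand the VFS a plain struct inode * while the cache code still reaches its private state. A userspace sketch of the same pattern; the types and names here are illustrative, not the kernel's.

#include <stddef.h>
#include <stdio.h>

struct inode { long i_ino; };             /* stand-in for the kernel's struct inode */

struct v9fs_cookie {
	struct inode inode;               /* embedded member */
	void *fscache;
};

/* userspace re-statement of the kernel's container_of() */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct v9fs_cookie *v9fs_inode2cookie(struct inode *inode)
{
	return container_of(inode, struct v9fs_cookie, inode);
}

int main(void)
{
	struct v9fs_cookie c = { .inode = { .i_ino = 42 }, .fscache = NULL };
	struct inode *i = &c.inode;       /* what the VFS would see */
	printf("%ld\n", v9fs_inode2cookie(i)->inode.i_ino);  /* prints 42 */
	return 0;
}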
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index f7003cfac63d..cf62b05e296a 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -34,21 +34,25 @@
 #include <net/9p/transport.h>
 #include "v9fs.h"
 #include "v9fs_vfs.h"
+#include "cache.h"
+
+static DEFINE_SPINLOCK(v9fs_sessionlist_lock);
+static LIST_HEAD(v9fs_sessionlist);
 
 /*
  * Option Parsing (code inspired by NFS code)
  *  NOTE: each transport will parse its own options
  */
 
 enum {
 	/* Options that take integer arguments */
 	Opt_debug, Opt_dfltuid, Opt_dfltgid, Opt_afid,
 	/* String options */
-	Opt_uname, Opt_remotename, Opt_trans,
+	Opt_uname, Opt_remotename, Opt_trans, Opt_cache, Opt_cachetag,
 	/* Options that take no arguments */
 	Opt_nodevmap,
 	/* Cache options */
-	Opt_cache_loose,
+	Opt_cache_loose, Opt_fscache,
 	/* Access options */
 	Opt_access,
 	/* Error token */
@@ -63,8 +67,10 @@ static const match_table_t tokens = {
 	{Opt_uname, "uname=%s"},
 	{Opt_remotename, "aname=%s"},
 	{Opt_nodevmap, "nodevmap"},
-	{Opt_cache_loose, "cache=loose"},
+	{Opt_cache, "cache=%s"},
 	{Opt_cache_loose, "loose"},
+	{Opt_fscache, "fscache"},
+	{Opt_cachetag, "cachetag=%s"},
 	{Opt_access, "access=%s"},
 	{Opt_err, NULL}
 };
@@ -89,16 +95,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 	v9ses->afid = ~0;
 	v9ses->debug = 0;
 	v9ses->cache = 0;
+#ifdef CONFIG_9P_FSCACHE
+	v9ses->cachetag = NULL;
+#endif
 
 	if (!opts)
 		return 0;
 
 	options = kstrdup(opts, GFP_KERNEL);
-	if (!options) {
-		P9_DPRINTK(P9_DEBUG_ERROR,
-			   "failed to allocate copy of option string\n");
-		return -ENOMEM;
-	}
+	if (!options)
+		goto fail_option_alloc;
 
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
@@ -143,16 +149,33 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 		case Opt_cache_loose:
 			v9ses->cache = CACHE_LOOSE;
 			break;
+		case Opt_fscache:
+			v9ses->cache = CACHE_FSCACHE;
+			break;
+		case Opt_cachetag:
+#ifdef CONFIG_9P_FSCACHE
+			v9ses->cachetag = match_strdup(&args[0]);
+#endif
+			break;
+		case Opt_cache:
+			s = match_strdup(&args[0]);
+			if (!s)
+				goto fail_option_alloc;
+
+			if (strcmp(s, "loose") == 0)
+				v9ses->cache = CACHE_LOOSE;
+			else if (strcmp(s, "fscache") == 0)
+				v9ses->cache = CACHE_FSCACHE;
+			else
+				v9ses->cache = CACHE_NONE;
+			kfree(s);
+			break;
 
 		case Opt_access:
 			s = match_strdup(&args[0]);
-			if (!s) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "failed to allocate copy"
-					   " of option argument\n");
-				ret = -ENOMEM;
-				break;
-			}
+			if (!s)
+				goto fail_option_alloc;
+
 			v9ses->flags &= ~V9FS_ACCESS_MASK;
 			if (strcmp(s, "user") == 0)
 				v9ses->flags |= V9FS_ACCESS_USER;
@@ -173,6 +196,11 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 	}
 	kfree(options);
 	return ret;
+
+fail_option_alloc:
+	P9_DPRINTK(P9_DEBUG_ERROR,
+		   "failed to allocate copy of option argument\n");
+	return -ENOMEM;
 }
 
 /**
@@ -200,6 +228,10 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	spin_lock(&v9fs_sessionlist_lock);
+	list_add(&v9ses->slist, &v9fs_sessionlist);
+	spin_unlock(&v9fs_sessionlist_lock);
+
 	v9ses->flags = V9FS_EXTENDED | V9FS_ACCESS_USER;
 	strcpy(v9ses->uname, V9FS_DEFUSER);
 	strcpy(v9ses->aname, V9FS_DEFANAME);
@@ -249,6 +281,11 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	else
 		fid->uid = ~0;
 
+#ifdef CONFIG_9P_FSCACHE
+	/* register the session for caching */
+	v9fs_cache_session_get_cookie(v9ses);
+#endif
+
 	return fid;
 
 error:
@@ -268,8 +305,18 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 		v9ses->clnt = NULL;
 	}
 
+#ifdef CONFIG_9P_FSCACHE
+	if (v9ses->fscache) {
+		v9fs_cache_session_put_cookie(v9ses);
+		kfree(v9ses->cachetag);
+	}
+#endif
 	__putname(v9ses->uname);
 	__putname(v9ses->aname);
+
+	spin_lock(&v9fs_sessionlist_lock);
+	list_del(&v9ses->slist);
+	spin_unlock(&v9fs_sessionlist_lock);
 }
 
 /**
@@ -286,25 +333,132 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
 
 extern int v9fs_error_init(void);
 
+static struct kobject *v9fs_kobj;
+
+#ifdef CONFIG_9P_FSCACHE
 /**
- * v9fs_init - Initialize module
+ * caches_show - list caches associated with a session
+ *
+ * Returns the size of buffer written.
+ */
+
+static ssize_t caches_show(struct kobject *kobj,
+			   struct kobj_attribute *attr,
+			   char *buf)
+{
+	ssize_t n = 0, count = 0, limit = PAGE_SIZE;
+	struct v9fs_session_info *v9ses;
+
+	spin_lock(&v9fs_sessionlist_lock);
+	list_for_each_entry(v9ses, &v9fs_sessionlist, slist) {
+		if (v9ses->cachetag) {
+			n = snprintf(buf, limit, "%s\n", v9ses->cachetag);
+			if (n < 0) {
+				count = n;
+				break;
+			}
+
+			count += n;
+			limit -= n;
+		}
+	}
+
+	spin_unlock(&v9fs_sessionlist_lock);
+	return count;
+}
+
+static struct kobj_attribute v9fs_attr_cache = __ATTR_RO(caches);
+#endif /* CONFIG_9P_FSCACHE */
+
+static struct attribute *v9fs_attrs[] = {
+#ifdef CONFIG_9P_FSCACHE
+	&v9fs_attr_cache.attr,
+#endif
+	NULL,
+};
+
+static struct attribute_group v9fs_attr_group = {
+	.attrs = v9fs_attrs,
+};
+
+/**
+ * v9fs_sysfs_init - Initialize the v9fs sysfs interface
+ *
+ */
+
+static int v9fs_sysfs_init(void)
+{
+	v9fs_kobj = kobject_create_and_add("9p", fs_kobj);
+	if (!v9fs_kobj)
+		return -ENOMEM;
+
+	if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) {
+		kobject_put(v9fs_kobj);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/**
+ * v9fs_sysfs_cleanup - Unregister the v9fs sysfs interface
+ *
+ */
+
+static void v9fs_sysfs_cleanup(void)
+{
+	sysfs_remove_group(v9fs_kobj, &v9fs_attr_group);
+	kobject_put(v9fs_kobj);
+}
+
+/**
+ * init_v9fs - Initialize module
  *
  */
 
 static int __init init_v9fs(void)
 {
+	int err;
 	printk(KERN_INFO "Installing v9fs 9p2000 file system support\n");
 	/* TODO: Setup list of registered trasnport modules */
-	return register_filesystem(&v9fs_fs_type);
+	err = register_filesystem(&v9fs_fs_type);
+	if (err < 0) {
+		printk(KERN_ERR "Failed to register filesystem\n");
+		return err;
+	}
+
+	err = v9fs_cache_register();
+	if (err < 0) {
+		printk(KERN_ERR "Failed to register v9fs for caching\n");
+		goto out_fs_unreg;
+	}
+
+	err = v9fs_sysfs_init();
+	if (err < 0) {
+		printk(KERN_ERR "Failed to register with sysfs\n");
+		goto out_sysfs_cleanup;
+	}
+
+	return 0;
+
+out_sysfs_cleanup:
+	v9fs_sysfs_cleanup();
+
+out_fs_unreg:
+	unregister_filesystem(&v9fs_fs_type);
+
+	return err;
 }
 
 /**
- * v9fs_init - shutdown module
+ * exit_v9fs - shutdown module
  *
  */
 
 static void __exit exit_v9fs(void)
 {
+	v9fs_sysfs_cleanup();
+	v9fs_cache_unregister();
 	unregister_filesystem(&v9fs_fs_type);
 }
 
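
With the parser changes above, a client can select the new mode either as cache=fscache or with the bare fscache token, and may pin a persistent cache tag with cachetag= (honoured only when CONFIG_9P_FSCACHE is set; otherwise a jiffies-based tag is generated per session). A hedged usage sketch via mount(2); the server address, target directory, and tag are hypothetical, and trans=tcp comes from the 9p transport code rather than this patch.

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* "cache=fscache" selects CACHE_FSCACHE in v9fs_parse_options();
	 * "cachetag=mytag" names the FS-Cache session cache explicitly */
	if (mount("10.0.0.2", "/mnt/9p", "9p", 0,
		  "trans=tcp,cache=fscache,cachetag=mytag") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}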
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 38762bf102a9..019f4ccb70c1 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -51,6 +51,7 @@ enum p9_session_flags {
 enum p9_cache_modes {
 	CACHE_NONE,
 	CACHE_LOOSE,
+	CACHE_FSCACHE,
 };
 
 /**
@@ -60,6 +61,8 @@ enum p9_cache_modes {
  * @debug: debug level
  * @afid: authentication handle
  * @cache: cache mode of type &p9_cache_modes
+ * @cachetag: the tag of the cache associated with this session
+ * @fscache: session cookie associated with FS-Cache
  * @options: copy of options string given by user
  * @uname: string user name to mount hierarchy as
  * @aname: mount specifier for remote hierarchy
@@ -68,7 +71,7 @@ enum p9_cache_modes {
  * @dfltgid: default numeric groupid to mount hierarchy as
  * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy
  * @clnt: reference to 9P network client instantiated for this session
- * @debugfs_dir: reference to debugfs_dir which can be used for add'l debug
+ * @slist: reference to list of registered 9p sessions
  *
  * This structure holds state for each session instance established during
  * a sys_mount() .
@@ -84,6 +87,10 @@ struct v9fs_session_info {
 	unsigned short debug;
 	unsigned int afid;
 	unsigned int cache;
+#ifdef CONFIG_9P_FSCACHE
+	char *cachetag;
+	struct fscache_cookie *fscache;
+#endif
 
 	char *uname;		/* user name to mount as */
 	char *aname;		/* name of remote hierarchy being mounted */
@@ -92,11 +99,9 @@ struct v9fs_session_info {
 	unsigned int dfltgid;	/* default gid for legacy support */
 	u32 uid;		/* if ACCESS_SINGLE, the uid that has access */
 	struct p9_client *clnt;	/* 9p client */
-	struct dentry *debugfs_dir;
+	struct list_head slist;	/* list of sessions registered with v9fs */
 };
 
-extern struct dentry *v9fs_debugfs_root;
-
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
 								char *);
 void v9fs_session_close(struct v9fs_session_info *v9ses);
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index f0c7de78e205..3a7560e35865 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -44,7 +44,13 @@ extern const struct file_operations v9fs_dir_operations;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
 
+#ifdef CONFIG_9P_FSCACHE
+struct inode *v9fs_alloc_inode(struct super_block *sb);
+void v9fs_destroy_inode(struct inode *inode);
+#endif
+
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
+void v9fs_clear_inode(struct inode *inode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
 void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
 int v9fs_dir_release(struct inode *inode, struct file *filp);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 92828281a30b..90e38449f4b3 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -38,6 +38,7 @@
 
 #include "v9fs.h"
 #include "v9fs_vfs.h"
+#include "cache.h"
 
 /**
  * v9fs_vfs_readpage - read an entire page in from 9P
@@ -52,18 +53,31 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
 	int retval;
 	loff_t offset;
 	char *buffer;
+	struct inode *inode;
 
+	inode = page->mapping->host;
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
+
+	BUG_ON(!PageLocked(page));
+
+	retval = v9fs_readpage_from_fscache(inode, page);
+	if (retval == 0)
+		return retval;
+
 	buffer = kmap(page);
 	offset = page_offset(page);
 
 	retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset);
-	if (retval < 0)
+	if (retval < 0) {
+		v9fs_uncache_page(inode, page);
 		goto done;
+	}
 
 	memset(buffer + retval, 0, PAGE_CACHE_SIZE - retval);
 	flush_dcache_page(page);
 	SetPageUptodate(page);
+
+	v9fs_readpage_to_fscache(inode, page);
 	retval = 0;
 
 done:
@@ -72,6 +86,78 @@ done:
 	return retval;
 }
 
+/**
+ * v9fs_vfs_readpages - read a set of pages from 9P
+ *
+ * @filp: file being read
+ * @mapping: the address space
+ * @pages: list of pages to read
+ * @nr_pages: count of pages to read
+ *
+ */
+
+static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
+			      struct list_head *pages, unsigned nr_pages)
+{
+	int ret = 0;
+	struct inode *inode;
+
+	inode = mapping->host;
+	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
+
+	ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
+	if (ret == 0)
+		return ret;
+
+	ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
+	P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret);
+	return ret;
+}
+
+/**
+ * v9fs_release_page - release the private state associated with a page
+ *
+ * Returns 1 if the page can be released, false otherwise.
+ */
+
+static int v9fs_release_page(struct page *page, gfp_t gfp)
+{
+	if (PagePrivate(page))
+		return 0;
+
+	return v9fs_fscache_release_page(page, gfp);
+}
+
+/**
+ * v9fs_invalidate_page - Invalidate a page completely or partially
+ *
+ * @page: structure to page
+ * @offset: offset in the page
+ */
+
+static void v9fs_invalidate_page(struct page *page, unsigned long offset)
+{
+	if (offset == 0)
+		v9fs_fscache_invalidate_page(page);
+}
+
+/**
+ * v9fs_launder_page - Writeback a dirty page
+ * Since the writes go directly to the server, we simply return a 0
+ * here to indicate success.
+ *
+ * Returns 0 on success.
+ */
+
+static int v9fs_launder_page(struct page *page)
+{
+	return 0;
+}
+
 const struct address_space_operations v9fs_addr_operations = {
       .readpage = v9fs_vfs_readpage,
+      .readpages = v9fs_vfs_readpages,
+      .releasepage = v9fs_release_page,
+      .invalidatepage = v9fs_invalidate_page,
+      .launder_page = v9fs_launder_page,
 };
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 68bf2af6c389..3902bf43a088 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/inet.h>
 #include <linux/list.h>
+#include <linux/pagemap.h>
 #include <asm/uaccess.h>
 #include <linux/idr.h>
 #include <net/9p/9p.h>
@@ -40,6 +41,7 @@
 #include "v9fs.h"
 #include "v9fs_vfs.h"
 #include "fid.h"
+#include "cache.h"
 
 static const struct file_operations v9fs_cached_file_operations;
 
@@ -72,7 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 			return err;
 		}
 		if (omode & P9_OTRUNC) {
-			inode->i_size = 0;
+			i_size_write(inode, 0);
 			inode->i_blocks = 0;
 		}
 		if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
@@ -85,6 +87,10 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		/* enable cached file options */
 		if(file->f_op == &v9fs_file_operations)
 			file->f_op = &v9fs_cached_file_operations;
+
+#ifdef CONFIG_9P_FSCACHE
+		v9fs_cache_inode_set_cookie(inode, file);
+#endif
 	}
 
 	return 0;
@@ -210,6 +216,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	struct p9_client *clnt;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int origin = *offset;
+	unsigned long pg_start, pg_end;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
 		   (int)count, (int)*offset);
@@ -225,7 +232,7 @@ v9fs_file_write(struct file *filp, const char __user * data,
 		if (count < rsize)
 			rsize = count;
 
-		n = p9_client_write(fid, NULL, data+total, *offset+total,
+		n = p9_client_write(fid, NULL, data+total, origin+total,
 								rsize);
 		if (n <= 0)
 			break;
@@ -234,14 +241,14 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	} while (count > 0);
 
 	if (total > 0) {
-		invalidate_inode_pages2_range(inode->i_mapping, origin,
-								origin+total);
+		pg_start = origin >> PAGE_CACHE_SHIFT;
+		pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
+		if (inode->i_mapping && inode->i_mapping->nrpages)
+			invalidate_inode_pages2_range(inode->i_mapping,
+						      pg_start, pg_end);
 		*offset += total;
-	}
-
-	if (*offset > inode->i_size) {
-		inode->i_size = *offset;
-		inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
+		i_size_write(inode, i_size_read(inode) + total);
+		inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
 	}
 
 	if (n < 0)
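
The last hunk fixes a unit bug: invalidate_inode_pages2_range() takes page indices, but the old code passed byte offsets. A standalone sketch of the corrected arithmetic, assuming 4 KiB pages (PAGE_CACHE_SHIFT is architecture-defined; 12 is an assumption here):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12   /* assumed: 4 KiB pages */

int main(void)
{
	unsigned long origin = 5000, total = 9000;   /* wrote bytes [5000, 14000) */
	unsigned long pg_start = origin >> PAGE_CACHE_SHIFT;
	unsigned long pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
	/* pages 1..3 cover bytes 4096..16383, so every touched page is hit */
	printf("invalidate pages %lu..%lu\n", pg_start, pg_end);
	return 0;
}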
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 06a223d50a81..5947628aefef 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -40,6 +40,7 @@
 #include "v9fs.h"
 #include "v9fs_vfs.h"
 #include "fid.h"
+#include "cache.h"
 
 static const struct inode_operations v9fs_dir_inode_operations;
 static const struct inode_operations v9fs_dir_inode_operations_ext;
@@ -197,6 +198,39 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
 	wstat->extension = NULL;
 }
 
+#ifdef CONFIG_9P_FSCACHE
+/**
+ * v9fs_alloc_inode - helper function to allocate an inode
+ * This callback is executed before setting up the inode so that we
+ * can associate a vcookie with each inode.
+ *
+ */
+
+struct inode *v9fs_alloc_inode(struct super_block *sb)
+{
+	struct v9fs_cookie *vcookie;
+	vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache,
+							 GFP_KERNEL);
+	if (!vcookie)
+		return NULL;
+
+	vcookie->fscache = NULL;
+	vcookie->qid = NULL;
+	spin_lock_init(&vcookie->lock);
+	return &vcookie->inode;
+}
+
+/**
+ * v9fs_destroy_inode - destroy an inode
+ *
+ */
+
+void v9fs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode));
+}
+#endif
+
 /**
  * v9fs_get_inode - helper function to setup an inode
  * @sb: superblock
@@ -326,6 +360,21 @@ error:
 }
 */
 
+
+/**
+ * v9fs_clear_inode - release an inode
+ * @inode: inode to release
+ *
+ */
+void v9fs_clear_inode(struct inode *inode)
+{
+	filemap_fdatawrite(inode->i_mapping);
+
+#ifdef CONFIG_9P_FSCACHE
+	v9fs_cache_inode_put_cookie(inode);
+#endif
+}
+
 /**
  * v9fs_inode_from_fid - populate an inode by issuing a attribute request
  * @v9ses: session information
@@ -356,8 +405,14 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 
 	v9fs_stat2inode(st, ret, sb);
 	ret->i_ino = v9fs_qid2ino(&st->qid);
+
+#ifdef CONFIG_9P_FSCACHE
+	v9fs_vcookie_set_qid(ret, &st->qid);
+	v9fs_cache_inode_get_cookie(ret);
+#endif
 	p9stat_free(st);
 	kfree(st);
+
 	return ret;
 
 error:
@@ -751,7 +806,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	err = -EPERM;
 	v9ses = v9fs_inode2v9ses(dentry->d_inode);
-	if (v9ses->cache == CACHE_LOOSE)
+	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
 		return simple_getattr(mnt, dentry, stat);
 
 	fid = v9fs_fid_lookup(dentry);
@@ -872,10 +927,10 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 	} else
 		inode->i_rdev = 0;
 
-	inode->i_size = stat->length;
+	i_size_write(inode, stat->length);
 
 	/* not real number of blocks, but 512 byte ones ... */
-	inode->i_blocks = (inode->i_size + 512 - 1) >> 9;
+	inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
 }
 
 /**
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 8961f1a8f668..14a86448572c 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -44,21 +44,9 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
 
-static void v9fs_clear_inode(struct inode *);
 static const struct super_operations v9fs_super_ops;
 
 /**
- * v9fs_clear_inode - release an inode
- * @inode: inode to release
- *
- */
-
-static void v9fs_clear_inode(struct inode *inode)
-{
-	filemap_fdatawrite(inode->i_mapping);
-}
-
-/**
  * v9fs_set_super - set the superblock
  * @s: super block
  * @data: file system specific data
@@ -220,6 +208,10 @@ v9fs_umount_begin(struct super_block *sb)
 }
 
 static const struct super_operations v9fs_super_ops = {
+#ifdef CONFIG_9P_FSCACHE
+	.alloc_inode = v9fs_alloc_inode,
+	.destroy_inode = v9fs_destroy_inode,
+#endif
 	.statfs = simple_statfs,
 	.clear_inode = v9fs_clear_inode,
 	.show_options = generic_show_options,
diff --git a/fs/Kconfig b/fs/Kconfig
index 455aa207e67e..d4bf8caad8d0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -109,6 +109,7 @@ source "fs/sysfs/Kconfig"
 
 config TMPFS
 	bool "Virtual memory file system support (former shm fs)"
+	depends on SHMEM
 	help
 	  Tmpfs is a file system which keeps all files in virtual memory.
 
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 798cb071d132..3f57ce4bee5d 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -19,9 +19,6 @@ static int
 adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
 	       int create)
 {
-	if (block < 0)
-		goto abort_negative;
-
 	if (!create) {
 		if (block >= inode->i_blocks)
 			goto abort_toobig;
@@ -34,10 +31,6 @@ adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh,
 	/* don't support allocation of blocks yet */
 	return -EIO;
 
-abort_negative:
-	adfs_error(inode->i_sb, "block %d < 0", block);
-	return -EIO;
-
 abort_toobig:
 	return 0;
 }
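
The removed branch could never fire: sector_t is an unsigned type, so block < 0 is always false and the adfs_error() call was unreachable. A minimal demonstration; the typedef mirrors 64-bit sector_t configurations and is an illustration, not kernel code.

#include <stdio.h>

typedef unsigned long long sector_t;   /* models CONFIG_LBDAF kernels */

int main(void)
{
	sector_t block = (sector_t)-5;     /* a "negative" input just wraps */
	printf("%d\n", block < 0);         /* prints 0: the branch is dead */
	return 0;
}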
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 3ff8bdd18fb3..0931bc1325eb 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -21,7 +21,7 @@ static void afs_fl_release_private(struct file_lock *fl);
 static struct workqueue_struct *afs_lock_manager;
 static DEFINE_MUTEX(afs_lock_manager_mutex);
 
-static struct file_lock_operations afs_lock_ops = {
+static const struct file_lock_operations afs_lock_ops = {
 	.fl_copy_lock = afs_fl_copy_lock,
 	.fl_release_private = afs_fl_release_private,
 };
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 8630615e57fe..852739d262a9 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -28,7 +28,7 @@ static int afs_proc_cells_show(struct seq_file *m, void *v);
28static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, 28static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
29 size_t size, loff_t *_pos); 29 size_t size, loff_t *_pos);
30 30
31static struct seq_operations afs_proc_cells_ops = { 31static const struct seq_operations afs_proc_cells_ops = {
32 .start = afs_proc_cells_start, 32 .start = afs_proc_cells_start,
33 .next = afs_proc_cells_next, 33 .next = afs_proc_cells_next,
34 .stop = afs_proc_cells_stop, 34 .stop = afs_proc_cells_stop,
@@ -70,7 +70,7 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
70static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v); 70static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v);
71static int afs_proc_cell_volumes_show(struct seq_file *m, void *v); 71static int afs_proc_cell_volumes_show(struct seq_file *m, void *v);
72 72
73static struct seq_operations afs_proc_cell_volumes_ops = { 73static const struct seq_operations afs_proc_cell_volumes_ops = {
74 .start = afs_proc_cell_volumes_start, 74 .start = afs_proc_cell_volumes_start,
75 .next = afs_proc_cell_volumes_next, 75 .next = afs_proc_cell_volumes_next,
76 .stop = afs_proc_cell_volumes_stop, 76 .stop = afs_proc_cell_volumes_stop,
@@ -95,7 +95,7 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
95static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v); 95static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v);
96static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v); 96static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v);
97 97
98static struct seq_operations afs_proc_cell_vlservers_ops = { 98static const struct seq_operations afs_proc_cell_vlservers_ops = {
99 .start = afs_proc_cell_vlservers_start, 99 .start = afs_proc_cell_vlservers_start,
100 .next = afs_proc_cell_vlservers_next, 100 .next = afs_proc_cell_vlservers_next,
101 .stop = afs_proc_cell_vlservers_stop, 101 .stop = afs_proc_cell_vlservers_stop,
@@ -119,7 +119,7 @@ static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
119static void afs_proc_cell_servers_stop(struct seq_file *p, void *v); 119static void afs_proc_cell_servers_stop(struct seq_file *p, void *v);
120static int afs_proc_cell_servers_show(struct seq_file *m, void *v); 120static int afs_proc_cell_servers_show(struct seq_file *m, void *v);
121 121
122static struct seq_operations afs_proc_cell_servers_ops = { 122static const struct seq_operations afs_proc_cell_servers_ops = {
123 .start = afs_proc_cell_servers_start, 123 .start = afs_proc_cell_servers_start,
124 .next = afs_proc_cell_servers_next, 124 .next = afs_proc_cell_servers_next,
125 .stop = afs_proc_cell_servers_stop, 125 .stop = afs_proc_cell_servers_stop,
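
This constification pattern repeats below for dlm, btrfs, cifs and block_dev: an operations table that is only ever read can be declared const, which places it in the read-only data section and lets the compiler reject stray writes. A minimal sketch with hypothetical foo_* names, relying on the fact that seq_open() already takes a const struct seq_operations pointer:

	static void *foo_seq_start(struct seq_file *m, loff_t *pos);
	static void *foo_seq_next(struct seq_file *m, void *v, loff_t *pos);
	static void foo_seq_stop(struct seq_file *m, void *v);
	static int foo_seq_show(struct seq_file *m, void *v);

	/* const: lives in .rodata, immutable after build time */
	static const struct seq_operations foo_seq_ops = {
		.start	= foo_seq_start,
		.next	= foo_seq_next,
		.stop	= foo_seq_stop,
		.show	= foo_seq_show,
	};

	static int foo_proc_open(struct inode *inode, struct file *file)
	{
		return seq_open(file, &foo_seq_ops);
	}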
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c2e7a7ff0080..c63a3c8beb73 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
712 .bdi = mapping->backing_dev_info, 712 .bdi = mapping->backing_dev_info,
713 .sync_mode = WB_SYNC_ALL, 713 .sync_mode = WB_SYNC_ALL,
714 .nr_to_write = LONG_MAX, 714 .nr_to_write = LONG_MAX,
715 .for_writepages = 1,
716 .range_cyclic = 1, 715 .range_cyclic = 1,
717 }; 716 };
718 int ret; 717 int ret;
diff --git a/fs/aio.c b/fs/aio.c
index d065b2c3273e..02a2c9340573 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -24,6 +24,7 @@
24#include <linux/file.h> 24#include <linux/file.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/mman.h> 26#include <linux/mman.h>
27#include <linux/mmu_context.h>
27#include <linux/slab.h> 28#include <linux/slab.h>
28#include <linux/timer.h> 29#include <linux/timer.h>
29#include <linux/aio.h> 30#include <linux/aio.h>
@@ -34,7 +35,6 @@
34 35
35#include <asm/kmap_types.h> 36#include <asm/kmap_types.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
37#include <asm/mmu_context.h>
38 38
39#if DEBUG > 1 39#if DEBUG > 1
40#define dprintk printk 40#define dprintk printk
@@ -78,6 +78,7 @@ static int __init aio_setup(void)
78 78
79 return 0; 79 return 0;
80} 80}
81__initcall(aio_setup);
81 82
82static void aio_free_ring(struct kioctx *ctx) 83static void aio_free_ring(struct kioctx *ctx)
83{ 84{
@@ -380,6 +381,7 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
380 __set_current_state(TASK_RUNNING); 381 __set_current_state(TASK_RUNNING);
381 return iocb->ki_user_data; 382 return iocb->ki_user_data;
382} 383}
384EXPORT_SYMBOL(wait_on_sync_kiocb);
383 385
384/* exit_aio: called when the last user of mm goes away. At this point, 386/* exit_aio: called when the last user of mm goes away. At this point,
385 * there is no way for any new requests to be submited or any of the 387 * there is no way for any new requests to be submited or any of the
385 * there is no way for any new requests to be submitted or any of the 387
@@ -573,6 +575,7 @@ int aio_put_req(struct kiocb *req)
573 spin_unlock_irq(&ctx->ctx_lock); 575 spin_unlock_irq(&ctx->ctx_lock);
574 return ret; 576 return ret;
575} 577}
578EXPORT_SYMBOL(aio_put_req);
576 579
577static struct kioctx *lookup_ioctx(unsigned long ctx_id) 580static struct kioctx *lookup_ioctx(unsigned long ctx_id)
578{ 581{
@@ -595,51 +598,6 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
595} 598}
596 599
597/* 600/*
598 * use_mm
599 * Makes the calling kernel thread take on the specified
600 * mm context.
601 * Called by the retry thread execute retries within the
602 * iocb issuer's mm context, so that copy_from/to_user
603 * operations work seamlessly for aio.
604 * (Note: this routine is intended to be called only
605 * from a kernel thread context)
606 */
607static void use_mm(struct mm_struct *mm)
608{
609 struct mm_struct *active_mm;
610 struct task_struct *tsk = current;
611
612 task_lock(tsk);
613 active_mm = tsk->active_mm;
614 atomic_inc(&mm->mm_count);
615 tsk->mm = mm;
616 tsk->active_mm = mm;
617 switch_mm(active_mm, mm, tsk);
618 task_unlock(tsk);
619
620 mmdrop(active_mm);
621}
622
623/*
624 * unuse_mm
625 * Reverses the effect of use_mm, i.e. releases the
626 * specified mm context which was earlier taken on
627 * by the calling kernel thread
628 * (Note: this routine is intended to be called only
629 * from a kernel thread context)
630 */
631static void unuse_mm(struct mm_struct *mm)
632{
633 struct task_struct *tsk = current;
634
635 task_lock(tsk);
636 tsk->mm = NULL;
637 /* active_mm is still 'mm' */
638 enter_lazy_tlb(mm, tsk);
639 task_unlock(tsk);
640}
641
642/*
643 * Queue up a kiocb to be retried. Assumes that the kiocb 601 * Queue up a kiocb to be retried. Assumes that the kiocb
644 * has already been marked as kicked, and places it on 602 * has already been marked as kicked, and places it on
645 * the retry run list for the corresponding ioctx, if it 603 * the retry run list for the corresponding ioctx, if it
@@ -1037,6 +995,7 @@ put_rq:
1037 spin_unlock_irqrestore(&ctx->ctx_lock, flags); 995 spin_unlock_irqrestore(&ctx->ctx_lock, flags);
1038 return ret; 996 return ret;
1039} 997}
998EXPORT_SYMBOL(aio_complete);
1040 999
1041/* aio_read_evt 1000/* aio_read_evt
1042 * Pull an event off of the ioctx's event ring. Returns the number of 1001 * Pull an event off of the ioctx's event ring. Returns the number of
@@ -1825,9 +1784,3 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
1825 asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout); 1784 asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout);
1826 return ret; 1785 return ret;
1827} 1786}
1828
1829__initcall(aio_setup);
1830
1831EXPORT_SYMBOL(aio_complete);
1832EXPORT_SYMBOL(aio_put_req);
1833EXPORT_SYMBOL(wait_on_sync_kiocb);
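
The use_mm()/unuse_mm() pair is not deleted here, only de-duplicated: aio now uses the common helpers declared in <linux/mmu_context.h> (hence the include swap at the top of the file) instead of carrying private static copies. The calling pattern for a kernel thread is unchanged; a rough sketch, assuming <linux/uaccess.h> for copy_to_user():

	#include <linux/mmu_context.h>
	#include <linux/uaccess.h>

	/* Sketch: a kernel thread temporarily adopts a user mm so that
	 * copy_to_user()/copy_from_user() operate on that address space. */
	static int complete_to_user(struct mm_struct *mm, void __user *ubuf,
				    const void *kbuf, size_t len)
	{
		int ret = 0;

		use_mm(mm);		/* take on the issuer's mm */
		if (copy_to_user(ubuf, kbuf, len))
			ret = -EFAULT;
		unuse_mm(mm);		/* and release it again */
		return ret;
	}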
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 47d4a01c5393..d11c51fc2a3f 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -77,28 +77,24 @@ static const struct address_space_operations anon_aops = {
77 * 77 *
78 * Creates a new file by hooking it on a single inode. This is useful for files 78 * Creates a new file by hooking it on a single inode. This is useful for files
79 * that do not need to have a full-fledged inode in order to operate correctly. 79 * that do not need to have a full-fledged inode in order to operate correctly.
80 * All the files created with anon_inode_getfd() will share a single inode, 80 * All the files created with anon_inode_getfile() will share a single inode,
81 * hence saving memory and avoiding code duplication for the file/inode/dentry 81 * hence saving memory and avoiding code duplication for the file/inode/dentry
82 * setup. Returns new descriptor or -error. 82 * setup. Returns the newly created file* or an error pointer.
83 */ 83 */
84int anon_inode_getfd(const char *name, const struct file_operations *fops, 84struct file *anon_inode_getfile(const char *name,
85 void *priv, int flags) 85 const struct file_operations *fops,
86 void *priv, int flags)
86{ 87{
87 struct qstr this; 88 struct qstr this;
88 struct dentry *dentry; 89 struct dentry *dentry;
89 struct file *file; 90 struct file *file;
90 int error, fd; 91 int error;
91 92
92 if (IS_ERR(anon_inode_inode)) 93 if (IS_ERR(anon_inode_inode))
93 return -ENODEV; 94 return ERR_PTR(-ENODEV);
94 95
95 if (fops->owner && !try_module_get(fops->owner)) 96 if (fops->owner && !try_module_get(fops->owner))
96 return -ENOENT; 97 return ERR_PTR(-ENOENT);
97
98 error = get_unused_fd_flags(flags);
99 if (error < 0)
100 goto err_module;
101 fd = error;
102 98
103 /* 99 /*
104 * Link the inode to a directory entry by creating a unique name 100 * Link the inode to a directory entry by creating a unique name
@@ -110,7 +106,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
110 this.hash = 0; 106 this.hash = 0;
111 dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this); 107 dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
112 if (!dentry) 108 if (!dentry)
113 goto err_put_unused_fd; 109 goto err_module;
114 110
115 /* 111 /*
116 * We know the anon_inode inode count is always greater than zero, 112 * We know the anon_inode inode count is always greater than zero,
@@ -136,16 +132,54 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
136 file->f_version = 0; 132 file->f_version = 0;
137 file->private_data = priv; 133 file->private_data = priv;
138 134
135 return file;
136
137err_dput:
138 dput(dentry);
139err_module:
140 module_put(fops->owner);
141 return ERR_PTR(error);
142}
143EXPORT_SYMBOL_GPL(anon_inode_getfile);
144
145/**
146 * anon_inode_getfd - creates a new file instance by hooking it up to an
147 * anonymous inode, and a dentry that describe the "class"
148 * of the file
149 *
150 * @name: [in] name of the "class" of the new file
151 * @fops: [in] file operations for the new file
152 * @priv: [in] private data for the new file (will be file's private_data)
153 * @flags: [in] flags
154 *
155 * Creates a new file by hooking it on a single inode. This is useful for files
156 * that do not need to have a full-fledged inode in order to operate correctly.
157 * All the files created with anon_inode_getfd() will share a single inode,
158 * hence saving memory and avoiding code duplication for the file/inode/dentry
159 * setup. Returns new descriptor or an error code.
160 */
161int anon_inode_getfd(const char *name, const struct file_operations *fops,
162 void *priv, int flags)
163{
164 int error, fd;
165 struct file *file;
166
167 error = get_unused_fd_flags(flags);
168 if (error < 0)
169 return error;
170 fd = error;
171
172 file = anon_inode_getfile(name, fops, priv, flags);
173 if (IS_ERR(file)) {
174 error = PTR_ERR(file);
175 goto err_put_unused_fd;
176 }
139 fd_install(fd, file); 177 fd_install(fd, file);
140 178
141 return fd; 179 return fd;
142 180
143err_dput:
144 dput(dentry);
145err_put_unused_fd: 181err_put_unused_fd:
146 put_unused_fd(fd); 182 put_unused_fd(fd);
147err_module:
148 module_put(fops->owner);
149 return error; 183 return error;
150} 184}
151EXPORT_SYMBOL_GPL(anon_inode_getfd); 185EXPORT_SYMBOL_GPL(anon_inode_getfd);
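
The split gives callers two entry points: anon_inode_getfd() keeps its old "return a descriptor" contract, while anon_inode_getfile() returns the bare struct file for subsystems that must do setup work between file creation and fd_install(). A sketch of the second form, with hypothetical foo_fops/foo_priv:

	int fd;
	struct file *file;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		return fd;

	file = anon_inode_getfile("[foo]", &foo_fops, foo_priv, O_RDWR);
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		return PTR_ERR(file);
	}
	/* ... publish any state that must exist before userspace sees the fd ... */
	fd_install(fd, file);
	return fd;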
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 2316e944a109..e947915109e5 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -90,7 +90,7 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
90 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); 90 DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
91 continue; 91 continue;
92 } 92 }
93 while (d_mountpoint(path.dentry) && follow_down(&path)); 93 while (d_mountpoint(path.dentry) && follow_down(&path))
94 ; 94 ;
95 umount_ok = may_umount(path.mnt); 95 umount_ok = may_umount(path.mnt);
96 path_put(&path); 96 path_put(&path);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 615d5496fe0f..dd376c124e71 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -842,7 +842,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
842 sb->s_magic = BEFS_SUPER_MAGIC; 842 sb->s_magic = BEFS_SUPER_MAGIC;
843 /* Set real blocksize of fs */ 843 /* Set real blocksize of fs */
844 sb_set_blocksize(sb, (ulong) befs_sb->block_size); 844 sb_set_blocksize(sb, (ulong) befs_sb->block_size);
845 sb->s_op = (struct super_operations *) &befs_sops; 845 sb->s_op = &befs_sops;
846 root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir))); 846 root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir)));
847 if (IS_ERR(root)) { 847 if (IS_ERR(root)) {
848 ret = PTR_ERR(root); 848 ret = PTR_ERR(root);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7c1e65d54872..b9b3bb51b1e4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1280,9 +1280,6 @@ static int writenote(struct memelfnote *men, struct file *file,
1280#define DUMP_WRITE(addr, nr) \ 1280#define DUMP_WRITE(addr, nr) \
1281 if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ 1281 if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
1282 goto end_coredump; 1282 goto end_coredump;
1283#define DUMP_SEEK(off) \
1284 if (!dump_seek(file, (off))) \
1285 goto end_coredump;
1286 1283
1287static void fill_elf_header(struct elfhdr *elf, int segs, 1284static void fill_elf_header(struct elfhdr *elf, int segs,
1288 u16 machine, u32 flags, u8 osabi) 1285 u16 machine, u32 flags, u8 osabi)
@@ -1714,42 +1711,52 @@ struct elf_note_info {
1714 int numnote; 1711 int numnote;
1715}; 1712};
1716 1713
1717static int fill_note_info(struct elfhdr *elf, int phdrs, 1714static int elf_note_info_init(struct elf_note_info *info)
1718 struct elf_note_info *info,
1719 long signr, struct pt_regs *regs)
1720{ 1715{
1721#define NUM_NOTES 6 1716 memset(info, 0, sizeof(*info));
1722 struct list_head *t;
1723
1724 info->notes = NULL;
1725 info->prstatus = NULL;
1726 info->psinfo = NULL;
1727 info->fpu = NULL;
1728#ifdef ELF_CORE_COPY_XFPREGS
1729 info->xfpu = NULL;
1730#endif
1731 INIT_LIST_HEAD(&info->thread_list); 1717 INIT_LIST_HEAD(&info->thread_list);
1732 1718
1733 info->notes = kmalloc(NUM_NOTES * sizeof(struct memelfnote), 1719 /* Allocate space for six ELF notes */
1734 GFP_KERNEL); 1720 info->notes = kmalloc(6 * sizeof(struct memelfnote), GFP_KERNEL);
1735 if (!info->notes) 1721 if (!info->notes)
1736 return 0; 1722 return 0;
1737 info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL); 1723 info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
1738 if (!info->psinfo) 1724 if (!info->psinfo)
1739 return 0; 1725 goto notes_free;
1740 info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL); 1726 info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
1741 if (!info->prstatus) 1727 if (!info->prstatus)
1742 return 0; 1728 goto psinfo_free;
1743 info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL); 1729 info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
1744 if (!info->fpu) 1730 if (!info->fpu)
1745 return 0; 1731 goto prstatus_free;
1746#ifdef ELF_CORE_COPY_XFPREGS 1732#ifdef ELF_CORE_COPY_XFPREGS
1747 info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL); 1733 info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
1748 if (!info->xfpu) 1734 if (!info->xfpu)
1749 return 0; 1735 goto fpu_free;
1736#endif
1737 return 1;
1738#ifdef ELF_CORE_COPY_XFPREGS
1739 fpu_free:
1740 kfree(info->fpu);
1750#endif 1741#endif
1742 prstatus_free:
1743 kfree(info->prstatus);
1744 psinfo_free:
1745 kfree(info->psinfo);
1746 notes_free:
1747 kfree(info->notes);
1748 return 0;
1749}
1750
1751static int fill_note_info(struct elfhdr *elf, int phdrs,
1752 struct elf_note_info *info,
1753 long signr, struct pt_regs *regs)
1754{
1755 struct list_head *t;
1756
1757 if (!elf_note_info_init(info))
1758 return 0;
1751 1759
1752 info->thread_status_size = 0;
1753 if (signr) { 1760 if (signr) {
1754 struct core_thread *ct; 1761 struct core_thread *ct;
1755 struct elf_thread_status *ets; 1762 struct elf_thread_status *ets;
@@ -1809,8 +1816,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
1809#endif 1816#endif
1810 1817
1811 return 1; 1818 return 1;
1812
1813#undef NUM_NOTES
1814} 1819}
1815 1820
1816static size_t get_note_info_size(struct elf_note_info *info) 1821static size_t get_note_info_size(struct elf_note_info *info)
@@ -2016,7 +2021,8 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
2016 goto end_coredump; 2021 goto end_coredump;
2017 2022
2018 /* Align to page */ 2023 /* Align to page */
2019 DUMP_SEEK(dataoff - foffset); 2024 if (!dump_seek(file, dataoff - foffset))
2025 goto end_coredump;
2020 2026
2021 for (vma = first_vma(current, gate_vma); vma != NULL; 2027 for (vma = first_vma(current, gate_vma); vma != NULL;
2022 vma = next_vma(vma, gate_vma)) { 2028 vma = next_vma(vma, gate_vma)) {
@@ -2027,33 +2033,19 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
2027 2033
2028 for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { 2034 for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
2029 struct page *page; 2035 struct page *page;
2030 struct vm_area_struct *tmp_vma; 2036 int stop;
2031 2037
2032 if (get_user_pages(current, current->mm, addr, 1, 0, 1, 2038 page = get_dump_page(addr);
2033 &page, &tmp_vma) <= 0) { 2039 if (page) {
2034 DUMP_SEEK(PAGE_SIZE); 2040 void *kaddr = kmap(page);
2035 } else { 2041 stop = ((size += PAGE_SIZE) > limit) ||
2036 if (page == ZERO_PAGE(0)) { 2042 !dump_write(file, kaddr, PAGE_SIZE);
2037 if (!dump_seek(file, PAGE_SIZE)) { 2043 kunmap(page);
2038 page_cache_release(page);
2039 goto end_coredump;
2040 }
2041 } else {
2042 void *kaddr;
2043 flush_cache_page(tmp_vma, addr,
2044 page_to_pfn(page));
2045 kaddr = kmap(page);
2046 if ((size += PAGE_SIZE) > limit ||
2047 !dump_write(file, kaddr,
2048 PAGE_SIZE)) {
2049 kunmap(page);
2050 page_cache_release(page);
2051 goto end_coredump;
2052 }
2053 kunmap(page);
2054 }
2055 page_cache_release(page); 2044 page_cache_release(page);
2056 } 2045 } else
2046 stop = !dump_seek(file, PAGE_SIZE);
2047 if (stop)
2048 goto end_coredump;
2057 } 2049 }
2058 } 2050 }
2059 2051
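
Two independent cleanups are folded into this file. First, the allocation path in fill_note_info() used to return 0 on the first failed kmalloc() and rely on the caller to free whatever had been allocated; elf_note_info_init() replaces that with the conventional goto-unwind ladder, freeing exactly what was already allocated. The shape of the idiom, reduced to a generic sketch (struct ctx and its fields are hypothetical):

	static int ctx_init(struct ctx *c)
	{
		c->a = kmalloc(sizeof(*c->a), GFP_KERNEL);
		if (!c->a)
			return 0;
		c->b = kmalloc(sizeof(*c->b), GFP_KERNEL);
		if (!c->b)
			goto free_a;
		c->c = kmalloc(sizeof(*c->c), GFP_KERNEL);
		if (!c->c)
			goto free_b;
		return 1;	/* success, matching fill_note_info()'s convention */

	free_b:
		kfree(c->b);
	free_a:
		kfree(c->a);
		return 0;
	}

Second, the dump loop switches from open-coded get_user_pages() to get_dump_page(); the fdpic loader gets the same conversion in the next diff, and a condensed sketch of the shared loop shape follows it.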
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 20fbeced472b..38502c67987c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -283,20 +283,23 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
283 } 283 }
284 284
285 stack_size = exec_params.stack_size; 285 stack_size = exec_params.stack_size;
286 if (stack_size < interp_params.stack_size)
287 stack_size = interp_params.stack_size;
288
289 if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK) 286 if (exec_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
290 executable_stack = EXSTACK_ENABLE_X; 287 executable_stack = EXSTACK_ENABLE_X;
291 else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK) 288 else if (exec_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
292 executable_stack = EXSTACK_DISABLE_X; 289 executable_stack = EXSTACK_DISABLE_X;
293 else if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
294 executable_stack = EXSTACK_ENABLE_X;
295 else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
296 executable_stack = EXSTACK_DISABLE_X;
297 else 290 else
298 executable_stack = EXSTACK_DEFAULT; 291 executable_stack = EXSTACK_DEFAULT;
299 292
293 if (stack_size == 0) {
294 stack_size = interp_params.stack_size;
295 if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
296 executable_stack = EXSTACK_ENABLE_X;
297 else if (interp_params.flags & ELF_FDPIC_FLAG_NOEXEC_STACK)
298 executable_stack = EXSTACK_DISABLE_X;
299 else
300 executable_stack = EXSTACK_DEFAULT;
301 }
302
300 retval = -ENOEXEC; 303 retval = -ENOEXEC;
301 if (stack_size == 0) 304 if (stack_size == 0)
302 goto error; 305 goto error;
@@ -1325,9 +1328,6 @@ static int writenote(struct memelfnote *men, struct file *file)
1325#define DUMP_WRITE(addr, nr) \ 1328#define DUMP_WRITE(addr, nr) \
1326 if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ 1329 if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
1327 goto end_coredump; 1330 goto end_coredump;
1328#define DUMP_SEEK(off) \
1329 if (!dump_seek(file, (off))) \
1330 goto end_coredump;
1331 1331
1332static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) 1332static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
1333{ 1333{
@@ -1518,6 +1518,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1518 unsigned long *limit, unsigned long mm_flags) 1518 unsigned long *limit, unsigned long mm_flags)
1519{ 1519{
1520 struct vm_area_struct *vma; 1520 struct vm_area_struct *vma;
1521 int err = 0;
1521 1522
1522 for (vma = current->mm->mmap; vma; vma = vma->vm_next) { 1523 for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
1523 unsigned long addr; 1524 unsigned long addr;
@@ -1525,43 +1526,26 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
1525 if (!maydump(vma, mm_flags)) 1526 if (!maydump(vma, mm_flags))
1526 continue; 1527 continue;
1527 1528
1528 for (addr = vma->vm_start; 1529 for (addr = vma->vm_start; addr < vma->vm_end;
1529 addr < vma->vm_end; 1530 addr += PAGE_SIZE) {
1530 addr += PAGE_SIZE 1531 struct page *page = get_dump_page(addr);
1531 ) { 1532 if (page) {
1532 struct vm_area_struct *vma; 1533 void *kaddr = kmap(page);
1533 struct page *page; 1534 *size += PAGE_SIZE;
1534 1535 if (*size > *limit)
1535 if (get_user_pages(current, current->mm, addr, 1, 0, 1, 1536 err = -EFBIG;
1536 &page, &vma) <= 0) { 1537 else if (!dump_write(file, kaddr, PAGE_SIZE))
1537 DUMP_SEEK(file->f_pos + PAGE_SIZE); 1538 err = -EIO;
1538 }
1539 else if (page == ZERO_PAGE(0)) {
1540 page_cache_release(page);
1541 DUMP_SEEK(file->f_pos + PAGE_SIZE);
1542 }
1543 else {
1544 void *kaddr;
1545
1546 flush_cache_page(vma, addr, page_to_pfn(page));
1547 kaddr = kmap(page);
1548 if ((*size += PAGE_SIZE) > *limit ||
1549 !dump_write(file, kaddr, PAGE_SIZE)
1550 ) {
1551 kunmap(page);
1552 page_cache_release(page);
1553 return -EIO;
1554 }
1555 kunmap(page); 1539 kunmap(page);
1556 page_cache_release(page); 1540 page_cache_release(page);
1557 } 1541 } else if (!dump_seek(file, file->f_pos + PAGE_SIZE))
1542 err = -EFBIG;
1543 if (err)
1544 goto out;
1558 } 1545 }
1559 } 1546 }
1560 1547out:
1561 return 0; 1548 return err;
1562
1563end_coredump:
1564 return -EFBIG;
1565} 1549}
1566#endif 1550#endif
1567 1551
@@ -1802,7 +1786,8 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
1802 goto end_coredump; 1786 goto end_coredump;
1803 } 1787 }
1804 1788
1805 DUMP_SEEK(dataoff); 1789 if (!dump_seek(file, dataoff))
1790 goto end_coredump;
1806 1791
1807 if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0) 1792 if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0)
1808 goto end_coredump; 1793 goto end_coredump;
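
Both ELF dumpers now share the same page-walk shape: get_dump_page() hands back the page to write, or NULL for a hole (including the zero page), in which case the dumper just seeks forward. A condensed sketch of the loop, assuming the dump_write()/dump_seek() helpers of this era and the error codes used by the fdpic version above:

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
		struct page *page = get_dump_page(addr);	/* NULL for holes */

		if (page) {
			void *kaddr = kmap(page);
			int ok = dump_write(file, kaddr, PAGE_SIZE);

			kunmap(page);
			page_cache_release(page);
			if (!ok)
				return -EIO;		/* write failure */
		} else if (!dump_seek(file, file->f_pos + PAGE_SIZE)) {
			return -EFBIG;			/* sparse page: skip by seeking */
		}
	}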
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e92f229e3c6e..a2796651e756 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -278,8 +278,6 @@ static int decompress_exec(
278 ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos); 278 ret = bprm->file->f_op->read(bprm->file, buf, LBUFSIZE, &fpos);
279 if (ret <= 0) 279 if (ret <= 0)
280 break; 280 break;
281 if (ret >= (unsigned long) -4096)
282 break;
283 len -= ret; 281 len -= ret;
284 282
285 strm.next_in = buf; 283 strm.next_in = buf;
@@ -335,7 +333,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
335 "(%d != %d)", (unsigned) r, curid, id); 333 "(%d != %d)", (unsigned) r, curid, id);
336 goto failed; 334 goto failed;
337 } else if ( ! p->lib_list[id].loaded && 335 } else if ( ! p->lib_list[id].loaded &&
338 load_flat_shared_library(id, p) > (unsigned long) -4096) { 336 IS_ERR_VALUE(load_flat_shared_library(id, p))) {
339 printk("BINFMT_FLAT: failed to load library %d", id); 337 printk("BINFMT_FLAT: failed to load library %d", id);
340 goto failed; 338 goto failed;
341 } 339 }
@@ -545,7 +543,7 @@ static int load_flat_file(struct linux_binprm * bprm,
545 textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, 543 textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
546 MAP_PRIVATE|MAP_EXECUTABLE, 0); 544 MAP_PRIVATE|MAP_EXECUTABLE, 0);
547 up_write(&current->mm->mmap_sem); 545 up_write(&current->mm->mmap_sem);
548 if (!textpos || textpos >= (unsigned long) -4096) { 546 if (!textpos || IS_ERR_VALUE(textpos)) {
549 if (!textpos) 547 if (!textpos)
550 textpos = (unsigned long) -ENOMEM; 548 textpos = (unsigned long) -ENOMEM;
551 printk("Unable to mmap process text, errno %d\n", (int)-textpos); 549 printk("Unable to mmap process text, errno %d\n", (int)-textpos);
@@ -560,7 +558,7 @@ static int load_flat_file(struct linux_binprm * bprm,
560 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); 558 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
561 up_write(&current->mm->mmap_sem); 559 up_write(&current->mm->mmap_sem);
562 560
563 if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { 561 if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) {
564 if (!realdatastart) 562 if (!realdatastart)
565 realdatastart = (unsigned long) -ENOMEM; 563 realdatastart = (unsigned long) -ENOMEM;
566 printk("Unable to allocate RAM for process data, errno %d\n", 564 printk("Unable to allocate RAM for process data, errno %d\n",
@@ -587,7 +585,7 @@ static int load_flat_file(struct linux_binprm * bprm,
587 result = bprm->file->f_op->read(bprm->file, (char *) datapos, 585 result = bprm->file->f_op->read(bprm->file, (char *) datapos,
588 data_len + (relocs * sizeof(unsigned long)), &fpos); 586 data_len + (relocs * sizeof(unsigned long)), &fpos);
589 } 587 }
590 if (result >= (unsigned long)-4096) { 588 if (IS_ERR_VALUE(result)) {
591 printk("Unable to read data+bss, errno %d\n", (int)-result); 589 printk("Unable to read data+bss, errno %d\n", (int)-result);
592 do_munmap(current->mm, textpos, text_len); 590 do_munmap(current->mm, textpos, text_len);
593 do_munmap(current->mm, realdatastart, data_len + extra); 591 do_munmap(current->mm, realdatastart, data_len + extra);
@@ -607,7 +605,7 @@ static int load_flat_file(struct linux_binprm * bprm,
607 PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); 605 PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
608 up_write(&current->mm->mmap_sem); 606 up_write(&current->mm->mmap_sem);
609 607
610 if (!textpos || textpos >= (unsigned long) -4096) { 608 if (!textpos || IS_ERR_VALUE(textpos)) {
611 if (!textpos) 609 if (!textpos)
612 textpos = (unsigned long) -ENOMEM; 610 textpos = (unsigned long) -ENOMEM;
613 printk("Unable to allocate RAM for process text/data, errno %d\n", 611 printk("Unable to allocate RAM for process text/data, errno %d\n",
@@ -641,7 +639,7 @@ static int load_flat_file(struct linux_binprm * bprm,
641 fpos = 0; 639 fpos = 0;
642 result = bprm->file->f_op->read(bprm->file, 640 result = bprm->file->f_op->read(bprm->file,
643 (char *) textpos, text_len, &fpos); 641 (char *) textpos, text_len, &fpos);
644 if (result < (unsigned long) -4096) 642 if (!IS_ERR_VALUE(result))
645 result = decompress_exec(bprm, text_len, (char *) datapos, 643 result = decompress_exec(bprm, text_len, (char *) datapos,
646 data_len + (relocs * sizeof(unsigned long)), 0); 644 data_len + (relocs * sizeof(unsigned long)), 0);
647 } 645 }
@@ -651,13 +649,13 @@ static int load_flat_file(struct linux_binprm * bprm,
651 fpos = 0; 649 fpos = 0;
652 result = bprm->file->f_op->read(bprm->file, 650 result = bprm->file->f_op->read(bprm->file,
653 (char *) textpos, text_len, &fpos); 651 (char *) textpos, text_len, &fpos);
654 if (result < (unsigned long) -4096) { 652 if (!IS_ERR_VALUE(result)) {
655 fpos = ntohl(hdr->data_start); 653 fpos = ntohl(hdr->data_start);
656 result = bprm->file->f_op->read(bprm->file, (char *) datapos, 654 result = bprm->file->f_op->read(bprm->file, (char *) datapos,
657 data_len + (relocs * sizeof(unsigned long)), &fpos); 655 data_len + (relocs * sizeof(unsigned long)), &fpos);
658 } 656 }
659 } 657 }
660 if (result >= (unsigned long)-4096) { 658 if (IS_ERR_VALUE(result)) {
661 printk("Unable to read code+data+bss, errno %d\n",(int)-result); 659 printk("Unable to read code+data+bss, errno %d\n",(int)-result);
662 do_munmap(current->mm, textpos, text_len + data_len + extra + 660 do_munmap(current->mm, textpos, text_len + data_len + extra +
663 MAX_SHARED_LIBS * sizeof(unsigned long)); 661 MAX_SHARED_LIBS * sizeof(unsigned long));
@@ -835,7 +833,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
835 833
836 res = prepare_binprm(&bprm); 834 res = prepare_binprm(&bprm);
837 835
838 if (res <= (unsigned long)-4096) 836 if (!IS_ERR_VALUE(res))
839 res = load_flat_file(&bprm, libs, id, NULL); 837 res = load_flat_file(&bprm, libs, id, NULL);
840 838
841 abort_creds(bprm.cred); 839 abort_creds(bprm.cred);
@@ -880,7 +878,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
880 stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */ 878 stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */
881 879
882 res = load_flat_file(bprm, &libinfo, 0, &stack_len); 880 res = load_flat_file(bprm, &libinfo, 0, &stack_len);
883 if (res > (unsigned long)-4096) 881 if (IS_ERR_VALUE(res))
884 return res; 882 return res;
885 883
886 /* Update data segment pointers for all libraries */ 884 /* Update data segment pointers for all libraries */
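
All of the "(unsigned long)-4096" comparisons above encode the kernel convention that the highest unsigned long values carry negative errnos; IS_ERR_VALUE() from <linux/err.h> centralizes that test. Besides being easier to read, it removes two hazards visible in the old code: the call sites mixed <, <= and >= against the same constant, and -4096 does not even match -MAX_ERRNO (-4095) exactly. For reference:

	/* From <linux/err.h> (2.6.3x era): */
	#define MAX_ERRNO	4095
	#define IS_ERR_VALUE(x)	unlikely((x) >= (unsigned long)-MAX_ERRNO)

	/* So the open-coded tests ... */
	if (result >= (unsigned long)-4096)	/* error?  boundary varied per call site */
		goto fail;
	/* ... become the single canonical form: */
	if (IS_ERR_VALUE(result))
		goto fail;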
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3581a4e53942..5d1ed50bd46c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -420,7 +420,6 @@ static void bdev_destroy_inode(struct inode *inode)
420{ 420{
421 struct bdev_inode *bdi = BDEV_I(inode); 421 struct bdev_inode *bdi = BDEV_I(inode);
422 422
423 bdi->bdev.bd_inode_backing_dev_info = NULL;
424 kmem_cache_free(bdev_cachep, bdi); 423 kmem_cache_free(bdev_cachep, bdi);
425} 424}
426 425
@@ -1115,7 +1114,7 @@ EXPORT_SYMBOL(revalidate_disk);
1115int check_disk_change(struct block_device *bdev) 1114int check_disk_change(struct block_device *bdev)
1116{ 1115{
1117 struct gendisk *disk = bdev->bd_disk; 1116 struct gendisk *disk = bdev->bd_disk;
1118 struct block_device_operations * bdops = disk->fops; 1117 const struct block_device_operations *bdops = disk->fops;
1119 1118
1120 if (!bdops->media_changed) 1119 if (!bdops->media_changed)
1121 return 0; 1120 return 0;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 15831d5c7367..6c4173146bb7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -772,7 +772,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
772 } 772 }
773} 773}
774 774
775static struct address_space_operations btree_aops = { 775static const struct address_space_operations btree_aops = {
776 .readpage = btree_readpage, 776 .readpage = btree_readpage,
777 .writepage = btree_writepage, 777 .writepage = btree_writepage,
778 .writepages = btree_writepages, 778 .writepages = btree_writepages,
@@ -1600,6 +1600,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1600 1600
1601 sb->s_blocksize = 4096; 1601 sb->s_blocksize = 4096;
1602 sb->s_blocksize_bits = blksize_bits(4096); 1602 sb->s_blocksize_bits = blksize_bits(4096);
1603 sb->s_bdi = &fs_info->bdi;
1603 1604
1604 /* 1605 /*
1605 * we set the i_size on the btree inode to the max possible int. 1606 * we set the i_size on the btree inode to the max possible int.
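
The one-line "sb->s_bdi = &fs_info->bdi" matters because of the per-bdi writeback rework elsewhere in this series: flusher threads find dirty inodes through the superblock's s_bdi, so a filesystem with a private backing_dev_info must wire it up during mount. A rough sketch of that mount-path wiring, assuming a bdi embedded in the fs-private info struct (the register name format and id are made up):

	err = bdi_init(&fs_info->bdi);
	if (err)
		goto fail;
	err = bdi_register(&fs_info->bdi, NULL, "btrfs-%llu", (unsigned long long)id);
	if (err)
		goto fail_bdi;
	/* ... rest of fill_super / open_ctree ... */
	sb->s_bdi = &fs_info->bdi;	/* writeback is routed through this bdi */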
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dd86050190fc..d154a3f365d5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -55,13 +55,13 @@ struct btrfs_iget_args {
55 struct btrfs_root *root; 55 struct btrfs_root *root;
56}; 56};
57 57
58static struct inode_operations btrfs_dir_inode_operations; 58static const struct inode_operations btrfs_dir_inode_operations;
59static struct inode_operations btrfs_symlink_inode_operations; 59static const struct inode_operations btrfs_symlink_inode_operations;
60static struct inode_operations btrfs_dir_ro_inode_operations; 60static const struct inode_operations btrfs_dir_ro_inode_operations;
61static struct inode_operations btrfs_special_inode_operations; 61static const struct inode_operations btrfs_special_inode_operations;
62static struct inode_operations btrfs_file_inode_operations; 62static const struct inode_operations btrfs_file_inode_operations;
63static struct address_space_operations btrfs_aops; 63static const struct address_space_operations btrfs_aops;
64static struct address_space_operations btrfs_symlink_aops; 64static const struct address_space_operations btrfs_symlink_aops;
65static struct file_operations btrfs_dir_file_operations; 65static struct file_operations btrfs_dir_file_operations;
66static struct extent_io_ops btrfs_extent_io_ops; 66static struct extent_io_ops btrfs_extent_io_ops;
67 67
@@ -5201,7 +5201,7 @@ static int btrfs_permission(struct inode *inode, int mask)
5201 return generic_permission(inode, mask, btrfs_check_acl); 5201 return generic_permission(inode, mask, btrfs_check_acl);
5202} 5202}
5203 5203
5204static struct inode_operations btrfs_dir_inode_operations = { 5204static const struct inode_operations btrfs_dir_inode_operations = {
5205 .getattr = btrfs_getattr, 5205 .getattr = btrfs_getattr,
5206 .lookup = btrfs_lookup, 5206 .lookup = btrfs_lookup,
5207 .create = btrfs_create, 5207 .create = btrfs_create,
@@ -5219,7 +5219,7 @@ static struct inode_operations btrfs_dir_inode_operations = {
5219 .removexattr = btrfs_removexattr, 5219 .removexattr = btrfs_removexattr,
5220 .permission = btrfs_permission, 5220 .permission = btrfs_permission,
5221}; 5221};
5222static struct inode_operations btrfs_dir_ro_inode_operations = { 5222static const struct inode_operations btrfs_dir_ro_inode_operations = {
5223 .lookup = btrfs_lookup, 5223 .lookup = btrfs_lookup,
5224 .permission = btrfs_permission, 5224 .permission = btrfs_permission,
5225}; 5225};
@@ -5259,7 +5259,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
5259 * 5259 *
5260 * For now we're avoiding this by dropping bmap. 5260 * For now we're avoiding this by dropping bmap.
5261 */ 5261 */
5262static struct address_space_operations btrfs_aops = { 5262static const struct address_space_operations btrfs_aops = {
5263 .readpage = btrfs_readpage, 5263 .readpage = btrfs_readpage,
5264 .writepage = btrfs_writepage, 5264 .writepage = btrfs_writepage,
5265 .writepages = btrfs_writepages, 5265 .writepages = btrfs_writepages,
@@ -5272,14 +5272,14 @@ static struct address_space_operations btrfs_aops = {
5272 .error_remove_page = generic_error_remove_page, 5272 .error_remove_page = generic_error_remove_page,
5273}; 5273};
5274 5274
5275static struct address_space_operations btrfs_symlink_aops = { 5275static const struct address_space_operations btrfs_symlink_aops = {
5276 .readpage = btrfs_readpage, 5276 .readpage = btrfs_readpage,
5277 .writepage = btrfs_writepage, 5277 .writepage = btrfs_writepage,
5278 .invalidatepage = btrfs_invalidatepage, 5278 .invalidatepage = btrfs_invalidatepage,
5279 .releasepage = btrfs_releasepage, 5279 .releasepage = btrfs_releasepage,
5280}; 5280};
5281 5281
5282static struct inode_operations btrfs_file_inode_operations = { 5282static const struct inode_operations btrfs_file_inode_operations = {
5283 .truncate = btrfs_truncate, 5283 .truncate = btrfs_truncate,
5284 .getattr = btrfs_getattr, 5284 .getattr = btrfs_getattr,
5285 .setattr = btrfs_setattr, 5285 .setattr = btrfs_setattr,
@@ -5291,7 +5291,7 @@ static struct inode_operations btrfs_file_inode_operations = {
5291 .fallocate = btrfs_fallocate, 5291 .fallocate = btrfs_fallocate,
5292 .fiemap = btrfs_fiemap, 5292 .fiemap = btrfs_fiemap,
5293}; 5293};
5294static struct inode_operations btrfs_special_inode_operations = { 5294static const struct inode_operations btrfs_special_inode_operations = {
5295 .getattr = btrfs_getattr, 5295 .getattr = btrfs_getattr,
5296 .setattr = btrfs_setattr, 5296 .setattr = btrfs_setattr,
5297 .permission = btrfs_permission, 5297 .permission = btrfs_permission,
@@ -5300,7 +5300,7 @@ static struct inode_operations btrfs_special_inode_operations = {
5300 .listxattr = btrfs_listxattr, 5300 .listxattr = btrfs_listxattr,
5301 .removexattr = btrfs_removexattr, 5301 .removexattr = btrfs_removexattr,
5302}; 5302};
5303static struct inode_operations btrfs_symlink_inode_operations = { 5303static const struct inode_operations btrfs_symlink_inode_operations = {
5304 .readlink = generic_readlink, 5304 .readlink = generic_readlink,
5305 .follow_link = page_follow_link_light, 5305 .follow_link = page_follow_link_light,
5306 .put_link = page_put_link, 5306 .put_link = page_put_link,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d6f0806c682f..7b2f401e604e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -740,7 +740,6 @@ int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
740 .nr_to_write = mapping->nrpages * 2, 740 .nr_to_write = mapping->nrpages * 2,
741 .range_start = start, 741 .range_start = start,
742 .range_end = end, 742 .range_end = end,
743 .for_writepages = 1,
744 }; 743 };
745 return btrfs_writepages(mapping, &wbc); 744 return btrfs_writepages(mapping, &wbc);
746} 745}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6d6d06cb6dfc..2db17cd66fc5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -51,7 +51,7 @@
51#include "export.h" 51#include "export.h"
52#include "compression.h" 52#include "compression.h"
53 53
54static struct super_operations btrfs_super_ops; 54static const struct super_operations btrfs_super_ops;
55 55
56static void btrfs_put_super(struct super_block *sb) 56static void btrfs_put_super(struct super_block *sb)
57{ 57{
@@ -675,7 +675,7 @@ static int btrfs_unfreeze(struct super_block *sb)
675 return 0; 675 return 0;
676} 676}
677 677
678static struct super_operations btrfs_super_ops = { 678static const struct super_operations btrfs_super_ops = {
679 .delete_inode = btrfs_delete_inode, 679 .delete_inode = btrfs_delete_inode,
680 .put_super = btrfs_put_super, 680 .put_super = btrfs_put_super,
681 .sync_fs = btrfs_sync_fs, 681 .sync_fs = btrfs_sync_fs,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d91b0de7c502..30c0d45c1b5e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2605,7 +2605,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2605 extent); 2605 extent);
2606 cs = btrfs_file_extent_offset(src, extent); 2606 cs = btrfs_file_extent_offset(src, extent);
2607 cl = btrfs_file_extent_num_bytes(src, 2607 cl = btrfs_file_extent_num_bytes(src,
2608 extent);; 2608 extent);
2609 if (btrfs_file_extent_compression(src, 2609 if (btrfs_file_extent_compression(src,
2610 extent)) { 2610 extent)) {
2611 cs = 0; 2611 cs = 0;
diff --git a/fs/buffer.c b/fs/buffer.c
index 90a98865b0cc..209f7f15f5f8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -52,6 +52,7 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
52 bh->b_end_io = handler; 52 bh->b_end_io = handler;
53 bh->b_private = private; 53 bh->b_private = private;
54} 54}
55EXPORT_SYMBOL(init_buffer);
55 56
56static int sync_buffer(void *word) 57static int sync_buffer(void *word)
57{ 58{
@@ -80,6 +81,7 @@ void unlock_buffer(struct buffer_head *bh)
80 smp_mb__after_clear_bit(); 81 smp_mb__after_clear_bit();
81 wake_up_bit(&bh->b_state, BH_Lock); 82 wake_up_bit(&bh->b_state, BH_Lock);
82} 83}
84EXPORT_SYMBOL(unlock_buffer);
83 85
84/* 86/*
85 * Block until a buffer comes unlocked. This doesn't stop it 87 * Block until a buffer comes unlocked. This doesn't stop it
@@ -90,6 +92,7 @@ void __wait_on_buffer(struct buffer_head * bh)
90{ 92{
91 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); 93 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
92} 94}
95EXPORT_SYMBOL(__wait_on_buffer);
93 96
94static void 97static void
95__clear_page_buffers(struct page *page) 98__clear_page_buffers(struct page *page)
@@ -144,6 +147,7 @@ void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
144 __end_buffer_read_notouch(bh, uptodate); 147 __end_buffer_read_notouch(bh, uptodate);
145 put_bh(bh); 148 put_bh(bh);
146} 149}
150EXPORT_SYMBOL(end_buffer_read_sync);
147 151
148void end_buffer_write_sync(struct buffer_head *bh, int uptodate) 152void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
149{ 153{
@@ -164,6 +168,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
164 unlock_buffer(bh); 168 unlock_buffer(bh);
165 put_bh(bh); 169 put_bh(bh);
166} 170}
171EXPORT_SYMBOL(end_buffer_write_sync);
167 172
168/* 173/*
169 * Various filesystems appear to want __find_get_block to be non-blocking. 174 * Various filesystems appear to want __find_get_block to be non-blocking.
@@ -272,6 +277,7 @@ void invalidate_bdev(struct block_device *bdev)
272 invalidate_bh_lrus(); 277 invalidate_bh_lrus();
273 invalidate_mapping_pages(mapping, 0, -1); 278 invalidate_mapping_pages(mapping, 0, -1);
274} 279}
280EXPORT_SYMBOL(invalidate_bdev);
275 281
276/* 282/*
277 * Kick pdflush then try to free up some ZONE_NORMAL memory. 283 * Kick pdflush then try to free up some ZONE_NORMAL memory.
@@ -410,6 +416,7 @@ still_busy:
410 local_irq_restore(flags); 416 local_irq_restore(flags);
411 return; 417 return;
412} 418}
419EXPORT_SYMBOL(end_buffer_async_write);
413 420
414/* 421/*
415 * If a page's buffers are under async readin (end_buffer_async_read 422 * If a page's buffers are under async readin (end_buffer_async_read
415 * If a page's buffers are under async read-in (end_buffer_async_read 422
@@ -438,8 +445,8 @@ static void mark_buffer_async_read(struct buffer_head *bh)
438 set_buffer_async_read(bh); 445 set_buffer_async_read(bh);
439} 446}
440 447
441void mark_buffer_async_write_endio(struct buffer_head *bh, 448static void mark_buffer_async_write_endio(struct buffer_head *bh,
442 bh_end_io_t *handler) 449 bh_end_io_t *handler)
443{ 450{
444 bh->b_end_io = handler; 451 bh->b_end_io = handler;
445 set_buffer_async_write(bh); 452 set_buffer_async_write(bh);
@@ -553,7 +560,7 @@ repeat:
553 return err; 560 return err;
554} 561}
555 562
556void do_thaw_all(struct work_struct *work) 563static void do_thaw_all(struct work_struct *work)
557{ 564{
558 struct super_block *sb; 565 struct super_block *sb;
559 char b[BDEVNAME_SIZE]; 566 char b[BDEVNAME_SIZE];
@@ -1172,6 +1179,7 @@ void mark_buffer_dirty(struct buffer_head *bh)
1172 } 1179 }
1173 } 1180 }
1174} 1181}
1182EXPORT_SYMBOL(mark_buffer_dirty);
1175 1183
1176/* 1184/*
1177 * Decrement a buffer_head's reference count. If all buffers against a page 1185 * Decrement a buffer_head's reference count. If all buffers against a page
@@ -1188,6 +1196,7 @@ void __brelse(struct buffer_head * buf)
1188 } 1196 }
1189 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); 1197 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1190} 1198}
1199EXPORT_SYMBOL(__brelse);
1191 1200
1192/* 1201/*
1193 * bforget() is like brelse(), except it discards any 1202 * bforget() is like brelse(), except it discards any
@@ -1206,6 +1215,7 @@ void __bforget(struct buffer_head *bh)
1206 } 1215 }
1207 __brelse(bh); 1216 __brelse(bh);
1208} 1217}
1218EXPORT_SYMBOL(__bforget);
1209 1219
1210static struct buffer_head *__bread_slow(struct buffer_head *bh) 1220static struct buffer_head *__bread_slow(struct buffer_head *bh)
1211{ 1221{
@@ -2218,6 +2228,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
2218 } 2228 }
2219 return 0; 2229 return 0;
2220} 2230}
2231EXPORT_SYMBOL(block_read_full_page);
2221 2232
2222/* utility function for filesystems that need to do work on expanding 2233/* utility function for filesystems that need to do work on expanding
2223 * truncates. Uses filesystem pagecache writes to allow the filesystem to 2234 * truncates. Uses filesystem pagecache writes to allow the filesystem to
@@ -2252,6 +2263,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
2252out: 2263out:
2253 return err; 2264 return err;
2254} 2265}
2266EXPORT_SYMBOL(generic_cont_expand_simple);
2255 2267
2256static int cont_expand_zero(struct file *file, struct address_space *mapping, 2268static int cont_expand_zero(struct file *file, struct address_space *mapping,
2257 loff_t pos, loff_t *bytes) 2269 loff_t pos, loff_t *bytes)
@@ -2352,6 +2364,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
2352out: 2364out:
2353 return err; 2365 return err;
2354} 2366}
2367EXPORT_SYMBOL(cont_write_begin);
2355 2368
2356int block_prepare_write(struct page *page, unsigned from, unsigned to, 2369int block_prepare_write(struct page *page, unsigned from, unsigned to,
2357 get_block_t *get_block) 2370 get_block_t *get_block)
@@ -2362,6 +2375,7 @@ int block_prepare_write(struct page *page, unsigned from, unsigned to,
2362 ClearPageUptodate(page); 2375 ClearPageUptodate(page);
2363 return err; 2376 return err;
2364} 2377}
2378EXPORT_SYMBOL(block_prepare_write);
2365 2379
2366int block_commit_write(struct page *page, unsigned from, unsigned to) 2380int block_commit_write(struct page *page, unsigned from, unsigned to)
2367{ 2381{
@@ -2369,6 +2383,7 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
2369 __block_commit_write(inode,page,from,to); 2383 __block_commit_write(inode,page,from,to);
2370 return 0; 2384 return 0;
2371} 2385}
2386EXPORT_SYMBOL(block_commit_write);
2372 2387
2373/* 2388/*
2374 * block_page_mkwrite() is not allowed to change the file size as it gets 2389 * block_page_mkwrite() is not allowed to change the file size as it gets
@@ -2426,6 +2441,7 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2426out: 2441out:
2427 return ret; 2442 return ret;
2428} 2443}
2444EXPORT_SYMBOL(block_page_mkwrite);
2429 2445
2430/* 2446/*
2431 * nobh_write_begin()'s prereads are special: the buffer_heads are freed 2447 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
@@ -2849,6 +2865,7 @@ unlock:
2849out: 2865out:
2850 return err; 2866 return err;
2851} 2867}
2868EXPORT_SYMBOL(block_truncate_page);
2852 2869
2853/* 2870/*
2854 * The generic ->writepage function for buffer-backed address_spaces 2871 * The generic ->writepage function for buffer-backed address_spaces
@@ -2890,6 +2907,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2890 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2907 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2891 return __block_write_full_page(inode, page, get_block, wbc, handler); 2908 return __block_write_full_page(inode, page, get_block, wbc, handler);
2892} 2909}
2910EXPORT_SYMBOL(block_write_full_page_endio);
2893 2911
2894/* 2912/*
2895 * The generic ->writepage function for buffer-backed address_spaces 2913 * The generic ->writepage function for buffer-backed address_spaces
@@ -2900,7 +2918,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
2900 return block_write_full_page_endio(page, get_block, wbc, 2918 return block_write_full_page_endio(page, get_block, wbc,
2901 end_buffer_async_write); 2919 end_buffer_async_write);
2902} 2920}
2903 2921EXPORT_SYMBOL(block_write_full_page);
2904 2922
2905sector_t generic_block_bmap(struct address_space *mapping, sector_t block, 2923sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2906 get_block_t *get_block) 2924 get_block_t *get_block)
@@ -2913,6 +2931,7 @@ sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2913 get_block(inode, block, &tmp, 0); 2931 get_block(inode, block, &tmp, 0);
2914 return tmp.b_blocknr; 2932 return tmp.b_blocknr;
2915} 2933}
2934EXPORT_SYMBOL(generic_block_bmap);
2916 2935
2917static void end_bio_bh_io_sync(struct bio *bio, int err) 2936static void end_bio_bh_io_sync(struct bio *bio, int err)
2918{ 2937{
@@ -2982,6 +3001,7 @@ int submit_bh(int rw, struct buffer_head * bh)
2982 bio_put(bio); 3001 bio_put(bio);
2983 return ret; 3002 return ret;
2984} 3003}
3004EXPORT_SYMBOL(submit_bh);
2985 3005
2986/** 3006/**
2987 * ll_rw_block: low-level access to block devices (DEPRECATED) 3007 * ll_rw_block: low-level access to block devices (DEPRECATED)
@@ -3043,6 +3063,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3043 unlock_buffer(bh); 3063 unlock_buffer(bh);
3044 } 3064 }
3045} 3065}
3066EXPORT_SYMBOL(ll_rw_block);
3046 3067
3047/* 3068/*
3048 * For a data-integrity writeout, we need to wait upon any in-progress I/O 3069 * For a data-integrity writeout, we need to wait upon any in-progress I/O
@@ -3071,6 +3092,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
3071 } 3092 }
3072 return ret; 3093 return ret;
3073} 3094}
3095EXPORT_SYMBOL(sync_dirty_buffer);
3074 3096
3075/* 3097/*
3076 * try_to_free_buffers() checks if all the buffers on this particular page 3098 * try_to_free_buffers() checks if all the buffers on this particular page
@@ -3185,6 +3207,7 @@ void block_sync_page(struct page *page)
3185 if (mapping) 3207 if (mapping)
3186 blk_run_backing_dev(mapping->backing_dev_info, page); 3208 blk_run_backing_dev(mapping->backing_dev_info, page);
3187} 3209}
3210EXPORT_SYMBOL(block_sync_page);
3188 3211
3189/* 3212/*
3190 * There are no bdflush tunables left. But distributions are 3213 * There are no bdflush tunables left. But distributions are
@@ -3361,29 +3384,3 @@ void __init buffer_init(void)
3361 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); 3384 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3362 hotcpu_notifier(buffer_cpu_notify, 0); 3385 hotcpu_notifier(buffer_cpu_notify, 0);
3363} 3386}
3364
3365EXPORT_SYMBOL(__bforget);
3366EXPORT_SYMBOL(__brelse);
3367EXPORT_SYMBOL(__wait_on_buffer);
3368EXPORT_SYMBOL(block_commit_write);
3369EXPORT_SYMBOL(block_prepare_write);
3370EXPORT_SYMBOL(block_page_mkwrite);
3371EXPORT_SYMBOL(block_read_full_page);
3372EXPORT_SYMBOL(block_sync_page);
3373EXPORT_SYMBOL(block_truncate_page);
3374EXPORT_SYMBOL(block_write_full_page);
3375EXPORT_SYMBOL(block_write_full_page_endio);
3376EXPORT_SYMBOL(cont_write_begin);
3377EXPORT_SYMBOL(end_buffer_read_sync);
3378EXPORT_SYMBOL(end_buffer_write_sync);
3379EXPORT_SYMBOL(end_buffer_async_write);
3380EXPORT_SYMBOL(file_fsync);
3381EXPORT_SYMBOL(generic_block_bmap);
3382EXPORT_SYMBOL(generic_cont_expand_simple);
3383EXPORT_SYMBOL(init_buffer);
3384EXPORT_SYMBOL(invalidate_bdev);
3385EXPORT_SYMBOL(ll_rw_block);
3386EXPORT_SYMBOL(mark_buffer_dirty);
3387EXPORT_SYMBOL(submit_bh);
3388EXPORT_SYMBOL(sync_dirty_buffer);
3389EXPORT_SYMBOL(unlock_buffer);
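
No functional change in this file: the detached EXPORT_SYMBOL block at the bottom is dissolved so each export sits immediately after the function it exports (the layout checkpatch.pl asks for), and two helpers with no external users, mark_buffer_async_write_endio() and do_thaw_all(), become static. The preferred layout, shown with unlock_buffer() from the hunk above:

	void unlock_buffer(struct buffer_head *bh)
	{
		clear_bit_unlock(BH_Lock, &bh->b_state);
		smp_mb__after_clear_bit();
		wake_up_bit(&bh->b_state, BH_Lock);
	}
	EXPORT_SYMBOL(unlock_buffer);	/* export next to the definition, not in a list */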
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 3cbc57f932d2..d6db933df2b2 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -264,7 +264,6 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
264{ 264{
265 struct char_device_struct *cd; 265 struct char_device_struct *cd;
266 struct cdev *cdev; 266 struct cdev *cdev;
267 char *s;
268 int err = -ENOMEM; 267 int err = -ENOMEM;
269 268
270 cd = __register_chrdev_region(major, baseminor, count, name); 269 cd = __register_chrdev_region(major, baseminor, count, name);
@@ -278,8 +277,6 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
278 cdev->owner = fops->owner; 277 cdev->owner = fops->owner;
279 cdev->ops = fops; 278 cdev->ops = fops;
280 kobject_set_name(&cdev->kobj, "%s", name); 279 kobject_set_name(&cdev->kobj, "%s", name);
281 for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
282 *s = '!';
283 280
284 err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); 281 err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
285 if (err) 282 if (err)
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 606912d8f2a8..fea9e898c4ba 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -142,7 +142,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
143 if (rc != 0) { 143 if (rc != 0) {
144 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", 144 cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
145 __func__, *devname, rc));; 145 __func__, *devname, rc));
146 goto compose_mount_options_err; 146 goto compose_mount_options_err;
147 } 147 }
148 /* md_len = strlen(...) + 12 for 'sep+prefixpath=' 148 /* md_len = strlen(...) + 12 for 'sep+prefixpath='
@@ -385,7 +385,7 @@ out_err:
385 goto out; 385 goto out;
386} 386}
387 387
388struct inode_operations cifs_dfs_referral_inode_operations = { 388const struct inode_operations cifs_dfs_referral_inode_operations = {
389 .follow_link = cifs_dfs_follow_mountpoint, 389 .follow_link = cifs_dfs_follow_mountpoint,
390}; 390};
391 391
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 3610e9958b4c..d79ce2e95c23 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -50,7 +50,7 @@
50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ 50#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
51 51
52#ifdef CONFIG_CIFS_QUOTA 52#ifdef CONFIG_CIFS_QUOTA
53static struct quotactl_ops cifs_quotactl_ops; 53static const struct quotactl_ops cifs_quotactl_ops;
54#endif /* QUOTA */ 54#endif /* QUOTA */
55 55
56int cifsFYI = 0; 56int cifsFYI = 0;
@@ -517,7 +517,7 @@ int cifs_xstate_get(struct super_block *sb, struct fs_quota_stat *qstats)
517 return rc; 517 return rc;
518} 518}
519 519
520static struct quotactl_ops cifs_quotactl_ops = { 520static const struct quotactl_ops cifs_quotactl_ops = {
521 .set_xquota = cifs_xquota_set, 521 .set_xquota = cifs_xquota_set,
522 .get_xquota = cifs_xquota_get, 522 .get_xquota = cifs_xquota_get,
523 .set_xstate = cifs_xstate_set, 523 .set_xstate = cifs_xstate_set,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 094325e3f714..ac2b24c192f8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -67,7 +67,7 @@ extern int cifs_setattr(struct dentry *, struct iattr *);
67 67
68extern const struct inode_operations cifs_file_inode_ops; 68extern const struct inode_operations cifs_file_inode_ops;
69extern const struct inode_operations cifs_symlink_inode_ops; 69extern const struct inode_operations cifs_symlink_inode_ops;
70extern struct inode_operations cifs_dfs_referral_inode_operations; 70extern const struct inode_operations cifs_dfs_referral_inode_operations;
71 71
72 72
73/* Functions related to files and directories */ 73/* Functions related to files and directories */
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index 8ccd5ed81d9c..d99860a33890 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -2,6 +2,7 @@
2#define _CODA_INT_ 2#define _CODA_INT_
3 3
4struct dentry; 4struct dentry;
5struct file;
5 6
6extern struct file_system_type coda_fs_type; 7extern struct file_system_type coda_fs_type;
7extern unsigned long coda_timeout; 8extern unsigned long coda_timeout;
diff --git a/fs/compat.c b/fs/compat.c
index 6d6f98fe64a0..3aa48834a222 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -100,13 +100,6 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, char __user *filename, st
100 get_compat_timespec(&tv[1], &t[1])) 100 get_compat_timespec(&tv[1], &t[1]))
101 return -EFAULT; 101 return -EFAULT;
102 102
103 if ((tv[0].tv_nsec == UTIME_OMIT || tv[0].tv_nsec == UTIME_NOW)
104 && tv[0].tv_sec != 0)
105 return -EINVAL;
106 if ((tv[1].tv_nsec == UTIME_OMIT || tv[1].tv_nsec == UTIME_NOW)
107 && tv[1].tv_sec != 0)
108 return -EINVAL;
109
110 if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) 103 if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
111 return 0; 104 return 0;
112 } 105 }
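
The deleted checks rejected a timespec whose tv_nsec was UTIME_NOW or UTIME_OMIT whenever its tv_sec was non-zero, but POSIX specifies that tv_sec is ignored when tv_nsec carries one of those sentinels; dropping the test brings the compat path in line with the native sys_utimensat(). From userspace the contract looks like this (illustrative):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <time.h>
	#include <sys/stat.h>

	/* Bump only mtime; leave atime untouched. */
	int touch_mtime(const char *path)
	{
		struct timespec ts[2];

		ts[0].tv_nsec = UTIME_OMIT;		/* atime: leave as-is */
		ts[1].tv_nsec = UTIME_NOW;		/* mtime: set to "now" */
		ts[0].tv_sec = ts[1].tv_sec = 123;	/* ignored for OMIT/NOW */

		return utimensat(AT_FDCWD, path, ts, 0);
	}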
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 75efb028974b..d5f8c96964be 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -18,14 +18,13 @@
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/tty.h> 19#include <linux/tty.h>
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/magic.h>
21#include <linux/idr.h> 22#include <linux/idr.h>
22#include <linux/devpts_fs.h> 23#include <linux/devpts_fs.h>
23#include <linux/parser.h> 24#include <linux/parser.h>
24#include <linux/fsnotify.h> 25#include <linux/fsnotify.h>
25#include <linux/seq_file.h> 26#include <linux/seq_file.h>
26 27
27#define DEVPTS_SUPER_MAGIC 0x1cd1
28
29#define DEVPTS_DEFAULT_MODE 0600 28#define DEVPTS_DEFAULT_MODE 0600
30/* 29/*
31 * ptmx is a new node in /dev/pts and will be unused in legacy (single- 30 * ptmx is a new node in /dev/pts and will be unused in legacy (single-
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 1d1d27442235..1c8bb8c3a82e 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -386,9 +386,9 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr)
386 return rv; 386 return rv;
387} 387}
388 388
389static struct seq_operations format1_seq_ops; 389static const struct seq_operations format1_seq_ops;
390static struct seq_operations format2_seq_ops; 390static const struct seq_operations format2_seq_ops;
391static struct seq_operations format3_seq_ops; 391static const struct seq_operations format3_seq_ops;
392 392
393static void *table_seq_start(struct seq_file *seq, loff_t *pos) 393static void *table_seq_start(struct seq_file *seq, loff_t *pos)
394{ 394{
@@ -534,21 +534,21 @@ static void table_seq_stop(struct seq_file *seq, void *iter_ptr)
534 } 534 }
535} 535}
536 536
537static struct seq_operations format1_seq_ops = { 537static const struct seq_operations format1_seq_ops = {
538 .start = table_seq_start, 538 .start = table_seq_start,
539 .next = table_seq_next, 539 .next = table_seq_next,
540 .stop = table_seq_stop, 540 .stop = table_seq_stop,
541 .show = table_seq_show, 541 .show = table_seq_show,
542}; 542};
543 543
544static struct seq_operations format2_seq_ops = { 544static const struct seq_operations format2_seq_ops = {
545 .start = table_seq_start, 545 .start = table_seq_start,
546 .next = table_seq_next, 546 .next = table_seq_next,
547 .stop = table_seq_stop, 547 .stop = table_seq_stop,
548 .show = table_seq_show, 548 .show = table_seq_show,
549}; 549};
550 550
551static struct seq_operations format3_seq_ops = { 551static const struct seq_operations format3_seq_ops = {
552 .start = table_seq_start, 552 .start = table_seq_start,
553 .next = table_seq_next, 553 .next = table_seq_next,
554 .stop = table_seq_stop, 554 .stop = table_seq_stop,
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 618a60f03886..240cef14fe58 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -106,6 +106,7 @@ struct connection {
106#define CF_CONNECT_PENDING 3 106#define CF_CONNECT_PENDING 3
107#define CF_INIT_PENDING 4 107#define CF_INIT_PENDING 4
108#define CF_IS_OTHERCON 5 108#define CF_IS_OTHERCON 5
109#define CF_CLOSE 6
109 struct list_head writequeue; /* List of outgoing writequeue_entries */ 110 struct list_head writequeue; /* List of outgoing writequeue_entries */
110 spinlock_t writequeue_lock; 111 spinlock_t writequeue_lock;
111 int (*rx_action) (struct connection *); /* What to do when active */ 112 int (*rx_action) (struct connection *); /* What to do when active */
@@ -299,6 +300,8 @@ static void lowcomms_write_space(struct sock *sk)
299 300
300static inline void lowcomms_connect_sock(struct connection *con) 301static inline void lowcomms_connect_sock(struct connection *con)
301{ 302{
303 if (test_bit(CF_CLOSE, &con->flags))
304 return;
302 if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) 305 if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
303 queue_work(send_workqueue, &con->swork); 306 queue_work(send_workqueue, &con->swork);
304} 307}
@@ -926,10 +929,8 @@ static void tcp_connect_to_sock(struct connection *con)
926 goto out_err; 929 goto out_err;
927 930
928 memset(&saddr, 0, sizeof(saddr)); 931 memset(&saddr, 0, sizeof(saddr));
929 if (dlm_nodeid_to_addr(con->nodeid, &saddr)) { 932 if (dlm_nodeid_to_addr(con->nodeid, &saddr))
930 sock_release(sock);
931 goto out_err; 933 goto out_err;
932 }
933 934
934 sock->sk->sk_user_data = con; 935 sock->sk->sk_user_data = con;
935 con->rx_action = receive_from_sock; 936 con->rx_action = receive_from_sock;
@@ -1284,7 +1285,6 @@ out:
1284static void send_to_sock(struct connection *con) 1285static void send_to_sock(struct connection *con)
1285{ 1286{
1286 int ret = 0; 1287 int ret = 0;
1287 ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
1288 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; 1288 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1289 struct writequeue_entry *e; 1289 struct writequeue_entry *e;
1290 int len, offset; 1290 int len, offset;
@@ -1293,8 +1293,6 @@ static void send_to_sock(struct connection *con)
1293 if (con->sock == NULL) 1293 if (con->sock == NULL)
1294 goto out_connect; 1294 goto out_connect;
1295 1295
1296 sendpage = con->sock->ops->sendpage;
1297
1298 spin_lock(&con->writequeue_lock); 1296 spin_lock(&con->writequeue_lock);
1299 for (;;) { 1297 for (;;) {
1300 e = list_entry(con->writequeue.next, struct writequeue_entry, 1298 e = list_entry(con->writequeue.next, struct writequeue_entry,
@@ -1309,8 +1307,8 @@ static void send_to_sock(struct connection *con)
1309 1307
1310 ret = 0; 1308 ret = 0;
1311 if (len) { 1309 if (len) {
1312 ret = sendpage(con->sock, e->page, offset, len, 1310 ret = kernel_sendpage(con->sock, e->page, offset, len,
1313 msg_flags); 1311 msg_flags);
1314 if (ret == -EAGAIN || ret == 0) { 1312 if (ret == -EAGAIN || ret == 0) {
1315 cond_resched(); 1313 cond_resched();
1316 goto out; 1314 goto out;
@@ -1370,6 +1368,13 @@ int dlm_lowcomms_close(int nodeid)
1370 log_print("closing connection to node %d", nodeid); 1368 log_print("closing connection to node %d", nodeid);
1371 con = nodeid2con(nodeid, 0); 1369 con = nodeid2con(nodeid, 0);
1372 if (con) { 1370 if (con) {
1371 clear_bit(CF_CONNECT_PENDING, &con->flags);
1372 clear_bit(CF_WRITE_PENDING, &con->flags);
1373 set_bit(CF_CLOSE, &con->flags);
1374 if (cancel_work_sync(&con->swork))
1375 log_print("canceled swork for node %d", nodeid);
1376 if (cancel_work_sync(&con->rwork))
1377 log_print("canceled rwork for node %d", nodeid);
1373 clean_one_writequeue(con); 1378 clean_one_writequeue(con);
1374 close_connection(con, true); 1379 close_connection(con, true);
1375 } 1380 }
@@ -1395,9 +1400,10 @@ static void process_send_sockets(struct work_struct *work)
1395 1400
1396 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { 1401 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
1397 con->connect_action(con); 1402 con->connect_action(con);
1403 set_bit(CF_WRITE_PENDING, &con->flags);
1398 } 1404 }
1399 clear_bit(CF_WRITE_PENDING, &con->flags); 1405 if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
1400 send_to_sock(con); 1406 send_to_sock(con);
1401} 1407}
1402 1408
1403 1409
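Taken together, the lowcomms changes implement a small shutdown handshake: CF_CLOSE is set before the work items are cancelled, so a sender racing with dlm_lowcomms_close() sees the bit and declines to requeue work. A self-contained sketch of the pattern with hypothetical names (not dlm code):

	#include <linux/workqueue.h>
	#include <linux/bitops.h>

	#define C_PENDING	0
	#define C_CLOSE		1

	struct conn {
		unsigned long flags;
		struct work_struct swork;
	};

	static struct workqueue_struct *send_wq;	/* assumed created elsewhere */

	static void conn_kick(struct conn *c)
	{
		if (test_bit(C_CLOSE, &c->flags))	/* closing: refuse new work */
			return;
		if (!test_and_set_bit(C_PENDING, &c->flags))
			queue_work(send_wq, &c->swork);
	}

	static void conn_close(struct conn *c)
	{
		clear_bit(C_PENDING, &c->flags);
		set_bit(C_CLOSE, &c->flags);	/* must be visible before cancelling */
		cancel_work_sync(&c->swork);	/* after this, no work sneaks back in */
	}

The kernel_sendpage() change in the same file is independent of this: it simply replaces the cached ops->sendpage function pointer with the standard helper.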
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index a2edb7913447..31f4b0e6d72c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -63,9 +63,9 @@ static void drop_slab(void)
63} 63}
64 64
65int drop_caches_sysctl_handler(ctl_table *table, int write, 65int drop_caches_sysctl_handler(ctl_table *table, int write,
66 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 66 void __user *buffer, size_t *length, loff_t *ppos)
67{ 67{
68 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 68 proc_dointvec_minmax(table, write, buffer, length, ppos);
69 if (write) { 69 if (write) {
70 if (sysctl_drop_caches & 1) 70 if (sysctl_drop_caches & 1)
71 drop_pagecache(); 71 drop_pagecache();
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 00b30a2d5466..542f625312f3 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -582,7 +582,7 @@ extern const struct inode_operations ecryptfs_dir_iops;
582extern const struct inode_operations ecryptfs_symlink_iops; 582extern const struct inode_operations ecryptfs_symlink_iops;
583extern const struct super_operations ecryptfs_sops; 583extern const struct super_operations ecryptfs_sops;
584extern const struct dentry_operations ecryptfs_dops; 584extern const struct dentry_operations ecryptfs_dops;
585extern struct address_space_operations ecryptfs_aops; 585extern const struct address_space_operations ecryptfs_aops;
586extern int ecryptfs_verbosity; 586extern int ecryptfs_verbosity;
587extern unsigned int ecryptfs_message_buf_len; 587extern unsigned int ecryptfs_message_buf_len;
588extern signed long ecryptfs_message_wait_timeout; 588extern signed long ecryptfs_message_wait_timeout;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 5c6bab9786e3..05772aeaa8f4 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -545,7 +545,7 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block)
545 return rc; 545 return rc;
546} 546}
547 547
548struct address_space_operations ecryptfs_aops = { 548const struct address_space_operations ecryptfs_aops = {
549 .writepage = ecryptfs_writepage, 549 .writepage = ecryptfs_writepage,
550 .readpage = ecryptfs_readpage, 550 .readpage = ecryptfs_readpage,
551 .write_begin = ecryptfs_write_begin, 551 .write_begin = ecryptfs_write_begin,
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 31d12de83a2a..8b47e4200e65 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -68,11 +68,16 @@ int eventfd_signal(struct eventfd_ctx *ctx, int n)
68} 68}
69EXPORT_SYMBOL_GPL(eventfd_signal); 69EXPORT_SYMBOL_GPL(eventfd_signal);
70 70
71static void eventfd_free_ctx(struct eventfd_ctx *ctx)
72{
73 kfree(ctx);
74}
75
71static void eventfd_free(struct kref *kref) 76static void eventfd_free(struct kref *kref)
72{ 77{
73 struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref); 78 struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);
74 79
75 kfree(ctx); 80 eventfd_free_ctx(ctx);
76} 81}
77 82
78/** 83/**
@@ -298,9 +303,23 @@ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
298} 303}
299EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); 304EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
300 305
301SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) 306/**
307 * eventfd_file_create - Creates an eventfd file pointer.
308 * @count: Initial eventfd counter value.
309 * @flags: Flags for the eventfd file.
310 *
 311 * This function creates an eventfd file pointer, without installing it into

312 * the fd table. This is useful when the eventfd file is used during the
313 * initialization of data structures that require extra setup after the eventfd
314 * creation. So the eventfd creation is split into the file pointer creation
315 * phase, and the file descriptor installation phase.
316 * In this way races with userspace closing the newly installed file descriptor
317 * can be avoided.
318 * Returns an eventfd file pointer, or a proper error pointer.
319 */
320struct file *eventfd_file_create(unsigned int count, int flags)
302{ 321{
303 int fd; 322 struct file *file;
304 struct eventfd_ctx *ctx; 323 struct eventfd_ctx *ctx;
305 324
306 /* Check the EFD_* constants for consistency. */ 325 /* Check the EFD_* constants for consistency. */
@@ -308,26 +327,48 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
308 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); 327 BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
309 328
310 if (flags & ~EFD_FLAGS_SET) 329 if (flags & ~EFD_FLAGS_SET)
311 return -EINVAL; 330 return ERR_PTR(-EINVAL);
312 331
313 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 332 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
314 if (!ctx) 333 if (!ctx)
315 return -ENOMEM; 334 return ERR_PTR(-ENOMEM);
316 335
317 kref_init(&ctx->kref); 336 kref_init(&ctx->kref);
318 init_waitqueue_head(&ctx->wqh); 337 init_waitqueue_head(&ctx->wqh);
319 ctx->count = count; 338 ctx->count = count;
320 ctx->flags = flags; 339 ctx->flags = flags;
321 340
322 /* 341 file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
323 * When we call this, the initialization must be complete, since 342 flags & EFD_SHARED_FCNTL_FLAGS);
324 * anon_inode_getfd() will install the fd. 343 if (IS_ERR(file))
325 */ 344 eventfd_free_ctx(ctx);
326 fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx, 345
327 flags & EFD_SHARED_FCNTL_FLAGS); 346 return file;
328 if (fd < 0) 347}
329 kfree(ctx); 348
349SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
350{
351 int fd, error;
352 struct file *file;
353
354 error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);
355 if (error < 0)
356 return error;
357 fd = error;
358
359 file = eventfd_file_create(count, flags);
360 if (IS_ERR(file)) {
361 error = PTR_ERR(file);
362 goto err_put_unused_fd;
363 }
364 fd_install(fd, file);
365
330 return fd; 366 return fd;
367
368err_put_unused_fd:
369 put_unused_fd(fd);
370
371 return error;
331} 372}
332 373
333SYSCALL_DEFINE1(eventfd, unsigned int, count) 374SYSCALL_DEFINE1(eventfd, unsigned int, count)
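The split mirrors a general pattern: reserve the descriptor, create and wire up the file in private, and only then publish it with fd_install(). A hypothetical in-kernel consumer sketch (struct my_ctx and setup_notify_fd are illustrative, not part of this patch):

	#include <linux/file.h>
	#include <linux/fcntl.h>
	#include <linux/eventfd.h>
	#include <linux/err.h>

	struct my_ctx {
		struct file *trigger;
	};

	static int setup_notify_fd(struct my_ctx *mc, unsigned int count)
	{
		struct file *file;
		int fd;

		fd = get_unused_fd_flags(O_CLOEXEC);	/* reserve the slot first */
		if (fd < 0)
			return fd;

		file = eventfd_file_create(count, EFD_CLOEXEC);
		if (IS_ERR(file)) {
			put_unused_fd(fd);
			return PTR_ERR(file);
		}

		mc->trigger = file;	/* finish private setup while unpublished */
		fd_install(fd, file);	/* from here userspace can see/close the fd */
		return fd;
	}

Until fd_install() runs, userspace cannot close the descriptor out from under the setup code, which is exactly the race the comment block describes.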
diff --git a/fs/exec.c b/fs/exec.c
index 172ceb6edde4..d49be6bc1793 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,7 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/perf_counter.h> 36#include <linux/perf_event.h>
37#include <linux/highmem.h> 37#include <linux/highmem.h>
38#include <linux/spinlock.h> 38#include <linux/spinlock.h>
39#include <linux/key.h> 39#include <linux/key.h>
@@ -55,6 +55,7 @@
55#include <linux/kmod.h> 55#include <linux/kmod.h>
56#include <linux/fsnotify.h> 56#include <linux/fsnotify.h>
57#include <linux/fs_struct.h> 57#include <linux/fs_struct.h>
58#include <linux/pipe_fs_i.h>
58 59
59#include <asm/uaccess.h> 60#include <asm/uaccess.h>
60#include <asm/mmu_context.h> 61#include <asm/mmu_context.h>
@@ -63,6 +64,7 @@
63 64
64int core_uses_pid; 65int core_uses_pid;
65char core_pattern[CORENAME_MAX_SIZE] = "core"; 66char core_pattern[CORENAME_MAX_SIZE] = "core";
67unsigned int core_pipe_limit;
66int suid_dumpable = 0; 68int suid_dumpable = 0;
67 69
68/* The maximal length of core_pattern is also specified in sysctl.c */ 70/* The maximal length of core_pattern is also specified in sysctl.c */
@@ -845,6 +847,9 @@ static int de_thread(struct task_struct *tsk)
845 sig->notify_count = 0; 847 sig->notify_count = 0;
846 848
847no_thread_group: 849no_thread_group:
850 if (current->mm)
851 setmax_mm_hiwater_rss(&sig->maxrss, current->mm);
852
848 exit_itimers(sig); 853 exit_itimers(sig);
849 flush_itimer_signals(); 854 flush_itimer_signals();
850 855
@@ -923,7 +928,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
923 task_lock(tsk); 928 task_lock(tsk);
924 strlcpy(tsk->comm, buf, sizeof(tsk->comm)); 929 strlcpy(tsk->comm, buf, sizeof(tsk->comm));
925 task_unlock(tsk); 930 task_unlock(tsk);
926 perf_counter_comm(tsk); 931 perf_event_comm(tsk);
927} 932}
928 933
929int flush_old_exec(struct linux_binprm * bprm) 934int flush_old_exec(struct linux_binprm * bprm)
@@ -997,7 +1002,7 @@ int flush_old_exec(struct linux_binprm * bprm)
997 * security domain: 1002 * security domain:
998 */ 1003 */
999 if (!get_dumpable(current->mm)) 1004 if (!get_dumpable(current->mm))
1000 perf_counter_exit_task(current); 1005 perf_event_exit_task(current);
1001 1006
1002 /* An exec changes our domain. We are no longer part of the thread 1007 /* An exec changes our domain. We are no longer part of the thread
1003 group */ 1008 group */
@@ -1354,6 +1359,8 @@ int do_execve(char * filename,
1354 if (retval < 0) 1359 if (retval < 0)
1355 goto out; 1360 goto out;
1356 1361
1362 current->stack_start = current->mm->start_stack;
1363
1357 /* execve succeeded */ 1364 /* execve succeeded */
1358 current->fs->in_exec = 0; 1365 current->fs->in_exec = 0;
1359 current->in_execve = 0; 1366 current->in_execve = 0;
@@ -1388,18 +1395,16 @@ out_ret:
1388 return retval; 1395 return retval;
1389} 1396}
1390 1397
1391int set_binfmt(struct linux_binfmt *new) 1398void set_binfmt(struct linux_binfmt *new)
1392{ 1399{
1393 struct linux_binfmt *old = current->binfmt; 1400 struct mm_struct *mm = current->mm;
1394 1401
1395 if (new) { 1402 if (mm->binfmt)
1396 if (!try_module_get(new->module)) 1403 module_put(mm->binfmt->module);
1397 return -1; 1404
1398 } 1405 mm->binfmt = new;
1399 current->binfmt = new; 1406 if (new)
1400 if (old) 1407 __module_get(new->module);
1401 module_put(old->module);
1402 return 0;
1403} 1408}
1404 1409
1405EXPORT_SYMBOL(set_binfmt); 1410EXPORT_SYMBOL(set_binfmt);
@@ -1723,6 +1728,29 @@ int get_dumpable(struct mm_struct *mm)
1723 return (ret >= 2) ? 2 : ret; 1728 return (ret >= 2) ? 2 : ret;
1724} 1729}
1725 1730
1731static void wait_for_dump_helpers(struct file *file)
1732{
1733 struct pipe_inode_info *pipe;
1734
1735 pipe = file->f_path.dentry->d_inode->i_pipe;
1736
1737 pipe_lock(pipe);
1738 pipe->readers++;
1739 pipe->writers--;
1740
1741 while ((pipe->readers > 1) && (!signal_pending(current))) {
1742 wake_up_interruptible_sync(&pipe->wait);
1743 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
1744 pipe_wait(pipe);
1745 }
1746
1747 pipe->readers--;
1748 pipe->writers++;
1749 pipe_unlock(pipe);
1750
1751}
1752
1753
1726void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1754void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1727{ 1755{
1728 struct core_state core_state; 1756 struct core_state core_state;
@@ -1739,11 +1767,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1739 unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; 1767 unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
1740 char **helper_argv = NULL; 1768 char **helper_argv = NULL;
1741 int helper_argc = 0; 1769 int helper_argc = 0;
1742 char *delimit; 1770 int dump_count = 0;
1771 static atomic_t core_dump_count = ATOMIC_INIT(0);
1743 1772
1744 audit_core_dumps(signr); 1773 audit_core_dumps(signr);
1745 1774
1746 binfmt = current->binfmt; 1775 binfmt = mm->binfmt;
1747 if (!binfmt || !binfmt->core_dump) 1776 if (!binfmt || !binfmt->core_dump)
1748 goto fail; 1777 goto fail;
1749 1778
@@ -1794,54 +1823,63 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1794 lock_kernel(); 1823 lock_kernel();
1795 ispipe = format_corename(corename, signr); 1824 ispipe = format_corename(corename, signr);
1796 unlock_kernel(); 1825 unlock_kernel();
1797 /* 1826
1798 * Don't bother to check the RLIMIT_CORE value if core_pattern points
1799 * to a pipe. Since we're not writing directly to the filesystem
1800 * RLIMIT_CORE doesn't really apply, as no actual core file will be
1801 * created unless the pipe reader choses to write out the core file
1802 * at which point file size limits and permissions will be imposed
1803 * as it does with any other process
1804 */
1805 if ((!ispipe) && (core_limit < binfmt->min_coredump)) 1827 if ((!ispipe) && (core_limit < binfmt->min_coredump))
1806 goto fail_unlock; 1828 goto fail_unlock;
1807 1829
1808 if (ispipe) { 1830 if (ispipe) {
1831 if (core_limit == 0) {
1832 /*
1833 * Normally core limits are irrelevant to pipes, since
1834 * we're not writing to the file system, but we use
 1835 * core_limit of 0 here as a special value. Any
1836 * non-zero limit gets set to RLIM_INFINITY below, but
1837 * a limit of 0 skips the dump. This is a consistent
1838 * way to catch recursive crashes. We can still crash
 1840 * if the core_pattern binary sets RLIMIT_CORE != 0
 1841 * but it runs as root, and can do lots of stupid things.
1841 * Note that we use task_tgid_vnr here to grab the pid
1842 * of the process group leader. That way we get the
1843 * right pid if a thread in a multi-threaded
1844 * core_pattern process dies.
1845 */
1846 printk(KERN_WARNING
1847 "Process %d(%s) has RLIMIT_CORE set to 0\n",
1848 task_tgid_vnr(current), current->comm);
1849 printk(KERN_WARNING "Aborting core\n");
1850 goto fail_unlock;
1851 }
1852
1853 dump_count = atomic_inc_return(&core_dump_count);
1854 if (core_pipe_limit && (core_pipe_limit < dump_count)) {
1855 printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
1856 task_tgid_vnr(current), current->comm);
1857 printk(KERN_WARNING "Skipping core dump\n");
1858 goto fail_dropcount;
1859 }
1860
1809 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1861 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
1810 if (!helper_argv) { 1862 if (!helper_argv) {
1811 printk(KERN_WARNING "%s failed to allocate memory\n", 1863 printk(KERN_WARNING "%s failed to allocate memory\n",
1812 __func__); 1864 __func__);
1813 goto fail_unlock; 1865 goto fail_dropcount;
1814 }
1815 /* Terminate the string before the first option */
1816 delimit = strchr(corename, ' ');
1817 if (delimit)
1818 *delimit = '\0';
1819 delimit = strrchr(helper_argv[0], '/');
1820 if (delimit)
1821 delimit++;
1822 else
1823 delimit = helper_argv[0];
1824 if (!strcmp(delimit, current->comm)) {
1825 printk(KERN_NOTICE "Recursive core dump detected, "
1826 "aborting\n");
1827 goto fail_unlock;
1828 } 1866 }
1829 1867
1830 core_limit = RLIM_INFINITY; 1868 core_limit = RLIM_INFINITY;
1831 1869
1832 /* SIGPIPE can happen, but it's just never processed */ 1870 /* SIGPIPE can happen, but it's just never processed */
1833 if (call_usermodehelper_pipe(corename+1, helper_argv, NULL, 1871 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
1834 &file)) { 1872 &file)) {
1835 printk(KERN_INFO "Core dump to %s pipe failed\n", 1873 printk(KERN_INFO "Core dump to %s pipe failed\n",
1836 corename); 1874 corename);
1837 goto fail_unlock; 1875 goto fail_dropcount;
1838 } 1876 }
1839 } else 1877 } else
1840 file = filp_open(corename, 1878 file = filp_open(corename,
1841 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1879 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1842 0600); 1880 0600);
1843 if (IS_ERR(file)) 1881 if (IS_ERR(file))
1844 goto fail_unlock; 1882 goto fail_dropcount;
1845 inode = file->f_path.dentry->d_inode; 1883 inode = file->f_path.dentry->d_inode;
1846 if (inode->i_nlink > 1) 1884 if (inode->i_nlink > 1)
1847 goto close_fail; /* multiple links - don't dump */ 1885 goto close_fail; /* multiple links - don't dump */
@@ -1870,7 +1908,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1870 if (retval) 1908 if (retval)
1871 current->signal->group_exit_code |= 0x80; 1909 current->signal->group_exit_code |= 0x80;
1872close_fail: 1910close_fail:
1911 if (ispipe && core_pipe_limit)
1912 wait_for_dump_helpers(file);
1873 filp_close(file, NULL); 1913 filp_close(file, NULL);
1914fail_dropcount:
1915 if (dump_count)
1916 atomic_dec(&core_dump_count);
1874fail_unlock: 1917fail_unlock:
1875 if (helper_argv) 1918 if (helper_argv)
1876 argv_free(helper_argv); 1919 argv_free(helper_argv);
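Two userspace-visible knobs come out of this exec.c work: a zero RLIMIT_CORE now suppresses piped dumps (the recursion guard), and the new kernel.core_pipe_limit sysctl bounds how many helpers may run at once, with a non-zero value also making the kernel wait in wait_for_dump_helpers() until the helper finishes reading. A hypothetical minimal helper for a piped core_pattern such as |/usr/local/bin/core-catch %p (path and name are illustrative):

	#include <stdio.h>

	int main(int argc, char **argv)
	{
		char path[256], buf[4096];
		size_t n;
		FILE *out;

		/* %p from core_pattern arrives as argv[1] (the dumping pid) */
		snprintf(path, sizeof(path), "/var/cores/core.%s",
			 argc > 1 ? argv[1] : "unknown");
		out = fopen(path, "w");
		if (!out)
			return 1;

		/* the kernel pipes the core image to our stdin */
		while ((n = fread(buf, 1, sizeof(buf), stdin)) > 0)
			fwrite(buf, 1, n, out);

		fclose(out);
		return 0;
	}

With core_pipe_limit non-zero, the crashing process stays around (and its /proc/<pid> remains inspectable) until the helper drains the pipe and exits.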
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 23701f289e98..dd7175ce5606 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -70,7 +70,7 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str
70 if (PTR_ERR(inode) == -ESTALE) { 70 if (PTR_ERR(inode) == -ESTALE) {
71 ext2_error(dir->i_sb, __func__, 71 ext2_error(dir->i_sb, __func__,
72 "deleted inode referenced: %lu", 72 "deleted inode referenced: %lu",
73 ino); 73 (unsigned long) ino);
74 return ERR_PTR(-EIO); 74 return ERR_PTR(-EIO);
75 } else { 75 } else {
76 return ERR_CAST(inode); 76 return ERR_CAST(inode);
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index b72b85884223..c18fbf3e4068 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -20,7 +20,7 @@ __inode_direct_access(struct inode *inode, sector_t block,
20 void **kaddr, unsigned long *pfn) 20 void **kaddr, unsigned long *pfn)
21{ 21{
22 struct block_device *bdev = inode->i_sb->s_bdev; 22 struct block_device *bdev = inode->i_sb->s_bdev;
23 struct block_device_operations *ops = bdev->bd_disk->fops; 23 const struct block_device_operations *ops = bdev->bd_disk->fops;
24 sector_t sector; 24 sector_t sector;
25 25
26 sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */ 26 sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d33634119e17..451d166bbe93 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/time.h> 25#include <linux/time.h>
26#include <linux/blkdev.h>
26#include <linux/fs.h> 27#include <linux/fs.h>
27#include <linux/sched.h> 28#include <linux/sched.h>
28#include <linux/writeback.h> 29#include <linux/writeback.h>
@@ -73,7 +74,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
73 } 74 }
74 75
75 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 76 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
76 goto out; 77 goto flush;
77 78
78 /* 79 /*
79 * The VFS has written the file data. If the inode is unaltered 80 * The VFS has written the file data. If the inode is unaltered
@@ -85,7 +86,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
85 .nr_to_write = 0, /* sys_fsync did this */ 86 .nr_to_write = 0, /* sys_fsync did this */
86 }; 87 };
87 ret = sync_inode(inode, &wbc); 88 ret = sync_inode(inode, &wbc);
89 goto out;
88 } 90 }
91flush:
92 /*
93 * In case we didn't commit a transaction, we have to flush
94 * disk caches manually so that data really is on persistent
95 * storage
96 */
97 if (test_opt(inode->i_sb, BARRIER))
98 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
89out: 99out:
90 return ret; 100 return ret;
91} 101}
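The new flush path matters for the datasync case: overwriting already-allocated blocks dirties no metadata, so no journal commit (and therefore no barrier) happens, yet the data may still sit in the drive's volatile cache. A userspace sketch of the workload this hardens (illustrative):

	#include <unistd.h>

	/* overwrite in place, then ask for durability */
	int overwrite_durably(int fd, const void *buf, size_t len)
	{
		if (pwrite(fd, buf, len, 0) != (ssize_t) len)
			return -1;
		/* with barriers enabled, ext3 now issues a disk cache flush
		 * here even though no transaction needed committing */
		return fdatasync(fd);
	}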
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 953b430f92e3..acf1b1423327 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -172,10 +172,21 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
172 * so before we call here everything must be consistently dirtied against 172 * so before we call here everything must be consistently dirtied against
173 * this transaction. 173 * this transaction.
174 */ 174 */
175static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) 175static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
176{ 176{
177 int ret;
178
177 jbd_debug(2, "restarting handle %p\n", handle); 179 jbd_debug(2, "restarting handle %p\n", handle);
178 return ext3_journal_restart(handle, blocks_for_truncate(inode)); 180 /*
 181 * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle.
182 * At this moment, get_block can be called only for blocks inside
183 * i_size since page cache has been already dropped and writes are
184 * blocked by i_mutex. So we can safely drop the truncate_mutex.
185 */
186 mutex_unlock(&EXT3_I(inode)->truncate_mutex);
187 ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
188 mutex_lock(&EXT3_I(inode)->truncate_mutex);
189 return ret;
179} 190}
180 191
181/* 192/*
@@ -2075,7 +2086,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2075 ext3_journal_dirty_metadata(handle, bh); 2086 ext3_journal_dirty_metadata(handle, bh);
2076 } 2087 }
2077 ext3_mark_inode_dirty(handle, inode); 2088 ext3_mark_inode_dirty(handle, inode);
2078 ext3_journal_test_restart(handle, inode); 2089 truncate_restart_transaction(handle, inode);
2079 if (bh) { 2090 if (bh) {
2080 BUFFER_TRACE(bh, "retaking write access"); 2091 BUFFER_TRACE(bh, "retaking write access");
2081 ext3_journal_get_write_access(handle, bh); 2092 ext3_journal_get_write_access(handle, bh);
@@ -2285,7 +2296,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2285 return; 2296 return;
2286 if (try_to_extend_transaction(handle, inode)) { 2297 if (try_to_extend_transaction(handle, inode)) {
2287 ext3_mark_inode_dirty(handle, inode); 2298 ext3_mark_inode_dirty(handle, inode);
2288 ext3_journal_test_restart(handle, inode); 2299 truncate_restart_transaction(handle, inode);
2289 } 2300 }
2290 2301
2291 ext3_free_blocks(handle, inode, nr, 1); 2302 ext3_free_blocks(handle, inode, nr, 1);
@@ -2895,6 +2906,10 @@ static int ext3_do_update_inode(handle_t *handle,
2895 struct buffer_head *bh = iloc->bh; 2906 struct buffer_head *bh = iloc->bh;
2896 int err = 0, rc, block; 2907 int err = 0, rc, block;
2897 2908
2909again:
2910 /* we can't allow multiple procs in here at once, it's a bit racy */
2911 lock_buffer(bh);
2912
 2898 /* For fields not tracked in the in-memory inode, 2913 /* For fields not tracked in the in-memory inode,
2899 * initialise them to zero for new inodes. */ 2914 * initialise them to zero for new inodes. */
2900 if (ei->i_state & EXT3_STATE_NEW) 2915 if (ei->i_state & EXT3_STATE_NEW)
@@ -2954,16 +2969,20 @@ static int ext3_do_update_inode(handle_t *handle,
2954 /* If this is the first large file 2969 /* If this is the first large file
2955 * created, add a flag to the superblock. 2970 * created, add a flag to the superblock.
2956 */ 2971 */
2972 unlock_buffer(bh);
2957 err = ext3_journal_get_write_access(handle, 2973 err = ext3_journal_get_write_access(handle,
2958 EXT3_SB(sb)->s_sbh); 2974 EXT3_SB(sb)->s_sbh);
2959 if (err) 2975 if (err)
2960 goto out_brelse; 2976 goto out_brelse;
2977
2961 ext3_update_dynamic_rev(sb); 2978 ext3_update_dynamic_rev(sb);
2962 EXT3_SET_RO_COMPAT_FEATURE(sb, 2979 EXT3_SET_RO_COMPAT_FEATURE(sb,
2963 EXT3_FEATURE_RO_COMPAT_LARGE_FILE); 2980 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2964 handle->h_sync = 1; 2981 handle->h_sync = 1;
2965 err = ext3_journal_dirty_metadata(handle, 2982 err = ext3_journal_dirty_metadata(handle,
2966 EXT3_SB(sb)->s_sbh); 2983 EXT3_SB(sb)->s_sbh);
2984 /* get our lock and start over */
2985 goto again;
2967 } 2986 }
2968 } 2987 }
2969 } 2988 }
@@ -2986,6 +3005,7 @@ static int ext3_do_update_inode(handle_t *handle,
2986 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 3005 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2987 3006
2988 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 3007 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
3008 unlock_buffer(bh);
2989 rc = ext3_journal_dirty_metadata(handle, bh); 3009 rc = ext3_journal_dirty_metadata(handle, bh);
2990 if (!err) 3010 if (!err)
2991 err = rc; 3011 err = rc;
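The renamed helper also encodes a lock-ordering rule worth restating: do not block in a journal restart while holding truncate_mutex, because another task holding journal credits may itself be blocked on truncate_mutex inside ext3_get_blocks_handle, and the restart cannot proceed until that task's handle is released. Restated as a generic sketch (hypothetical helper; jbd's journal_restart() used for illustration):

	#include <linux/mutex.h>
	#include <linux/jbd.h>

	static int restart_trans_outside_lock(handle_t *handle,
					      struct mutex *lock, int credits)
	{
		int ret;

		mutex_unlock(lock);	/* the restart may sleep behind other holders */
		ret = journal_restart(handle, credits);
		mutex_lock(lock);	/* caller must re-validate guarded state now */
		return ret;
	}

The lock_buffer()/again loop added to ext3_do_update_inode() is the same discipline applied to a buffer lock: drop it before a call that can block on journal access, then loop back and retake it.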
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a8d80a7f1105..72743d360509 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -720,7 +720,7 @@ static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
720static ssize_t ext3_quota_write(struct super_block *sb, int type, 720static ssize_t ext3_quota_write(struct super_block *sb, int type,
721 const char *data, size_t len, loff_t off); 721 const char *data, size_t len, loff_t off);
722 722
723static struct dquot_operations ext3_quota_operations = { 723static const struct dquot_operations ext3_quota_operations = {
724 .initialize = dquot_initialize, 724 .initialize = dquot_initialize,
725 .drop = dquot_drop, 725 .drop = dquot_drop,
726 .alloc_space = dquot_alloc_space, 726 .alloc_space = dquot_alloc_space,
@@ -737,7 +737,7 @@ static struct dquot_operations ext3_quota_operations = {
737 .destroy_dquot = dquot_destroy, 737 .destroy_dquot = dquot_destroy,
738}; 738};
739 739
740static struct quotactl_ops ext3_qctl_operations = { 740static const struct quotactl_ops ext3_qctl_operations = {
741 .quota_on = ext3_quota_on, 741 .quota_on = ext3_quota_on,
742 .quota_off = vfs_quota_off, 742 .quota_off = vfs_quota_off,
743 .quota_sync = vfs_quota_sync, 743 .quota_sync = vfs_quota_sync,
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 418b6f3b0ae8..d5c0ea2e8f2d 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,7 +37,7 @@ config EXT4DEV_COMPAT
37 37
38 To enable backwards compatibility so that systems that are 38 To enable backwards compatibility so that systems that are
39 still expecting to mount ext4 filesystems using ext4dev, 39 still expecting to mount ext4 filesystems using ext4dev,
40 chose Y here. This feature will go away by 2.6.31, so 40 choose Y here. This feature will go away by 2.6.31, so
41 please arrange to get your userspace programs fixed! 41 please arrange to get your userspace programs fixed!
42 42
43config EXT4_FS_XATTR 43config EXT4_FS_XATTR
@@ -77,3 +77,12 @@ config EXT4_FS_SECURITY
77 77
78 If you are not using a security module that requires using 78 If you are not using a security module that requires using
79 extended attributes for file security labels, say N. 79 extended attributes for file security labels, say N.
80
81config EXT4_DEBUG
82 bool "EXT4 debugging support"
83 depends on EXT4_FS
84 help
85 Enables run-time debugging support for the ext4 filesystem.
86
87 If you select Y here, then you will be able to turn on debugging
88 with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e2126d70dff5..1d0418980f8d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
478 * new bitmap information 478 * new bitmap information
479 */ 479 */
480 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 480 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
481 ext4_mb_update_group_info(grp, blocks_freed); 481 grp->bb_free += blocks_freed;
482 up_write(&grp->alloc_sem); 482 up_write(&grp->alloc_sem);
483 483
484 /* We dirtied the bitmap block */ 484 /* We dirtied the bitmap block */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9714db393efe..e227eea23f05 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -67,27 +67,29 @@ typedef unsigned int ext4_group_t;
67 67
68 68
69/* prefer goal again. length */ 69/* prefer goal again. length */
70#define EXT4_MB_HINT_MERGE 1 70#define EXT4_MB_HINT_MERGE 0x0001
71/* blocks already reserved */ 71/* blocks already reserved */
72#define EXT4_MB_HINT_RESERVED 2 72#define EXT4_MB_HINT_RESERVED 0x0002
73/* metadata is being allocated */ 73/* metadata is being allocated */
74#define EXT4_MB_HINT_METADATA 4 74#define EXT4_MB_HINT_METADATA 0x0004
75/* first blocks in the file */ 75/* first blocks in the file */
76#define EXT4_MB_HINT_FIRST 8 76#define EXT4_MB_HINT_FIRST 0x0008
77/* search for the best chunk */ 77/* search for the best chunk */
78#define EXT4_MB_HINT_BEST 16 78#define EXT4_MB_HINT_BEST 0x0010
79/* data is being allocated */ 79/* data is being allocated */
80#define EXT4_MB_HINT_DATA 32 80#define EXT4_MB_HINT_DATA 0x0020
81/* don't preallocate (for tails) */ 81/* don't preallocate (for tails) */
82#define EXT4_MB_HINT_NOPREALLOC 64 82#define EXT4_MB_HINT_NOPREALLOC 0x0040
83/* allocate for locality group */ 83/* allocate for locality group */
84#define EXT4_MB_HINT_GROUP_ALLOC 128 84#define EXT4_MB_HINT_GROUP_ALLOC 0x0080
85/* allocate goal blocks or none */ 85/* allocate goal blocks or none */
86#define EXT4_MB_HINT_GOAL_ONLY 256 86#define EXT4_MB_HINT_GOAL_ONLY 0x0100
87/* goal is meaningful */ 87/* goal is meaningful */
88#define EXT4_MB_HINT_TRY_GOAL 512 88#define EXT4_MB_HINT_TRY_GOAL 0x0200
89/* blocks already pre-reserved by delayed allocation */ 89/* blocks already pre-reserved by delayed allocation */
90#define EXT4_MB_DELALLOC_RESERVED 1024 90#define EXT4_MB_DELALLOC_RESERVED 0x0400
91/* We are doing stream allocation */
92#define EXT4_MB_STREAM_ALLOC 0x0800
91 93
92 94
93struct ext4_allocation_request { 95struct ext4_allocation_request {
@@ -112,6 +114,21 @@ struct ext4_allocation_request {
112}; 114};
113 115
114/* 116/*
117 * For delayed allocation tracking
118 */
119struct mpage_da_data {
120 struct inode *inode;
121 sector_t b_blocknr; /* start block number of extent */
122 size_t b_size; /* size of extent */
123 unsigned long b_state; /* state of the extent */
124 unsigned long first_page, next_page; /* extent of pages */
125 struct writeback_control *wbc;
126 int io_done;
127 int pages_written;
128 int retval;
129};
130
131/*
 115 * Special inode numbers 132 * Special inode numbers
116 */ 133 */
117#define EXT4_BAD_INO 1 /* Bad blocks inode */ 134#define EXT4_BAD_INO 1 /* Bad blocks inode */
@@ -251,7 +268,6 @@ struct flex_groups {
251#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ 268#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
252#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ 269#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
253#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ 270#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
254#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */
255#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 271#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
256 272
257#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 273#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
@@ -289,6 +305,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
289#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ 305#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
290#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ 306#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
291#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ 307#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
308#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
292 309
293/* Used to pass group descriptor data when online resize is done */ 310/* Used to pass group descriptor data when online resize is done */
294struct ext4_new_group_input { 311struct ext4_new_group_input {
@@ -386,6 +403,9 @@ struct ext4_mount_options {
386#endif 403#endif
387}; 404};
388 405
 406/* Max physical block we can address w/o extents */
407#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
408
389/* 409/*
390 * Structure of an inode on the disk 410 * Structure of an inode on the disk
391 */ 411 */
@@ -456,7 +476,6 @@ struct move_extent {
456 __u64 len; /* block length to be moved */ 476 __u64 len; /* block length to be moved */
457 __u64 moved_len; /* moved block length */ 477 __u64 moved_len; /* moved block length */
458}; 478};
459#define MAX_DEFRAG_SIZE ((1UL<<31) - 1)
460 479
461#define EXT4_EPOCH_BITS 2 480#define EXT4_EPOCH_BITS 2
462#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) 481#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
@@ -694,7 +713,6 @@ struct ext4_inode_info {
694#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 713#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
695#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 714#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
696#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 715#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
697#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
698#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 716#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
699#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 717#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
700#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 718#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
@@ -841,6 +859,7 @@ struct ext4_sb_info {
841 unsigned long s_gdb_count; /* Number of group descriptor blocks */ 859 unsigned long s_gdb_count; /* Number of group descriptor blocks */
842 unsigned long s_desc_per_block; /* Number of group descriptors per block */ 860 unsigned long s_desc_per_block; /* Number of group descriptors per block */
843 ext4_group_t s_groups_count; /* Number of groups in the fs */ 861 ext4_group_t s_groups_count; /* Number of groups in the fs */
862 ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
844 unsigned long s_overhead_last; /* Last calculated overhead */ 863 unsigned long s_overhead_last; /* Last calculated overhead */
845 unsigned long s_blocks_last; /* Last seen block count */ 864 unsigned long s_blocks_last; /* Last seen block count */
846 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ 865 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
@@ -950,6 +969,7 @@ struct ext4_sb_info {
950 atomic_t s_mb_lost_chunks; 969 atomic_t s_mb_lost_chunks;
951 atomic_t s_mb_preallocated; 970 atomic_t s_mb_preallocated;
952 atomic_t s_mb_discarded; 971 atomic_t s_mb_discarded;
972 atomic_t s_lock_busy;
953 973
954 /* locality groups */ 974 /* locality groups */
955 struct ext4_locality_group *s_locality_groups; 975 struct ext4_locality_group *s_locality_groups;
@@ -1340,8 +1360,6 @@ extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1340 ext4_fsblk_t, unsigned long, int, unsigned long *); 1360 ext4_fsblk_t, unsigned long, int, unsigned long *);
1341extern int ext4_mb_add_groupinfo(struct super_block *sb, 1361extern int ext4_mb_add_groupinfo(struct super_block *sb,
1342 ext4_group_t i, struct ext4_group_desc *desc); 1362 ext4_group_t i, struct ext4_group_desc *desc);
1343extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
1344 ext4_grpblk_t add);
1345extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); 1363extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
1346extern void ext4_mb_put_buddy_cache_lock(struct super_block *, 1364extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
1347 ext4_group_t, int); 1365 ext4_group_t, int);
@@ -1367,6 +1385,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
1367extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1385extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1368extern int ext4_can_truncate(struct inode *inode); 1386extern int ext4_can_truncate(struct inode *inode);
1369extern void ext4_truncate(struct inode *); 1387extern void ext4_truncate(struct inode *);
1388extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
1370extern void ext4_set_inode_flags(struct inode *); 1389extern void ext4_set_inode_flags(struct inode *);
1371extern void ext4_get_inode_flags(struct ext4_inode_info *); 1390extern void ext4_get_inode_flags(struct ext4_inode_info *);
1372extern int ext4_alloc_da_blocks(struct inode *inode); 1391extern int ext4_alloc_da_blocks(struct inode *inode);
@@ -1575,15 +1594,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
1575struct ext4_group_info { 1594struct ext4_group_info {
1576 unsigned long bb_state; 1595 unsigned long bb_state;
1577 struct rb_root bb_free_root; 1596 struct rb_root bb_free_root;
1578 unsigned short bb_first_free; 1597 ext4_grpblk_t bb_first_free; /* first free block */
1579 unsigned short bb_free; 1598 ext4_grpblk_t bb_free; /* total free blocks */
1580 unsigned short bb_fragments; 1599 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
1581 struct list_head bb_prealloc_list; 1600 struct list_head bb_prealloc_list;
1582#ifdef DOUBLE_CHECK 1601#ifdef DOUBLE_CHECK
1583 void *bb_bitmap; 1602 void *bb_bitmap;
1584#endif 1603#endif
1585 struct rw_semaphore alloc_sem; 1604 struct rw_semaphore alloc_sem;
1586 unsigned short bb_counters[]; 1605 ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
1606 * regions, index is order.
1607 * bb_counters[3] = 5 means
1608 * 5 free 8-block regions. */
1587}; 1609};
1588 1610
1589#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 1611#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
@@ -1591,15 +1613,42 @@ struct ext4_group_info {
1591#define EXT4_MB_GRP_NEED_INIT(grp) \ 1613#define EXT4_MB_GRP_NEED_INIT(grp) \
1592 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) 1614 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
1593 1615
1616#define EXT4_MAX_CONTENTION 8
1617#define EXT4_CONTENTION_THRESHOLD 2
1618
1594static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, 1619static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
1595 ext4_group_t group) 1620 ext4_group_t group)
1596{ 1621{
1597 return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); 1622 return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
1598} 1623}
1599 1624
1625/*
1626 * Returns true if the filesystem is busy enough that attempts to
 1627 * access the block group locks have run into contention.
1628 */
1629static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
1630{
1631 return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
1632}
1633
1600static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) 1634static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
1601{ 1635{
1602 spin_lock(ext4_group_lock_ptr(sb, group)); 1636 spinlock_t *lock = ext4_group_lock_ptr(sb, group);
1637 if (spin_trylock(lock))
1638 /*
1639 * We're able to grab the lock right away, so drop the
1640 * lock contention counter.
1641 */
1642 atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
1643 else {
1644 /*
1645 * The lock is busy, so bump the contention counter,
1646 * and then wait on the spin lock.
1647 */
1648 atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
1649 EXT4_MAX_CONTENTION);
1650 spin_lock(lock);
1651 }
1603} 1652}
1604 1653
1605static inline void ext4_unlock_group(struct super_block *sb, 1654static inline void ext4_unlock_group(struct super_block *sb,
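ext4_lock_group() now piggybacks a cheap contention estimator on every acquisition: a trylock success decays s_lock_busy, a failure bumps it (saturating at EXT4_MAX_CONTENTION), and ext4_fs_is_busy() reports whether it sits above EXT4_CONTENTION_THRESHOLD. A hypothetical caller sketch of how the hint is meant to be consumed (maybe_scan_more is illustrative, not in this patch):

	/* skip optional, lock-heavy work when group locks are running hot */
	static void maybe_scan_more(struct super_block *sb)
	{
		if (ext4_fs_is_busy(EXT4_SB(sb)))
			return;	/* contended: do only the required work */

		/* ... optional prefetch/scan that takes many group locks ... */
	}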
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 20a84105a10b..61652f1d15e6 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,8 +43,7 @@
43#define CHECK_BINSEARCH__ 43#define CHECK_BINSEARCH__
44 44
45/* 45/*
46 * If EXT_DEBUG is defined you can use the 'extdebug' mount option 46 * Turn on EXT_DEBUG to get lots of info about extents operations.
47 * to get lots of info about what's going on.
48 */ 47 */
49#define EXT_DEBUG__ 48#define EXT_DEBUG__
50#ifdef EXT_DEBUG 49#ifdef EXT_DEBUG
@@ -138,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
138#define EXT_BREAK 1 137#define EXT_BREAK 1
139#define EXT_REPEAT 2 138#define EXT_REPEAT 2
140 139
140/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
141#define EXT_MAX_BLOCK 0xffffffff 141#define EXT_MAX_BLOCK 0xffffffff
142 142
143/* 143/*
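To make the new EXT_MAX_BLOCK comment concrete: ee_block is a __le32, so logical block numbers run from 0 to 0xffffffff. With the common 4 KiB block size, that caps the addressable file range at

	(EXT_MAX_BLOCK + 1) * 4096 bytes = 2^32 * 2^12 = 2^44 bytes = 16 TiB

which is where ext4's per-file logical size limit on 4 KiB filesystems comes from.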
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index eb27fd0f2ee8..6a9409920dee 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
44 handle, err); 44 handle, err);
45 } 45 }
46 else 46 else
47 brelse(bh); 47 bforget(bh);
48 return err; 48 return err;
49} 49}
50 50
@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
60 handle, err); 60 handle, err);
61 } 61 }
62 else 62 else
63 brelse(bh); 63 bforget(bh);
64 return err; 64 return err;
65} 65}
66 66
@@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
89 ext4_journal_abort_handle(where, __func__, bh, 89 ext4_journal_abort_handle(where, __func__, bh,
90 handle, err); 90 handle, err);
91 } else { 91 } else {
92 mark_buffer_dirty(bh); 92 if (inode && bh)
93 mark_buffer_dirty_inode(bh, inode);
94 else
95 mark_buffer_dirty(bh);
93 if (inode && inode_needs_sync(inode)) { 96 if (inode && inode_needs_sync(inode)) {
94 sync_dirty_buffer(bh); 97 sync_dirty_buffer(bh);
95 if (buffer_req(bh) && !buffer_uptodate(bh)) { 98 if (buffer_req(bh) && !buffer_uptodate(bh)) {
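The brelse() to bforget() switch in the forget/revoke paths above is behavioural, not cosmetic: both drop the reference, but bforget() also discards the buffer's dirty state, so data for a block that was just forgotten or revoked can no longer be written back. A minimal sketch of the distinction (generic buffer-head API, not ext4 code):

	#include <linux/buffer_head.h>

	static void discard_stale_block(struct buffer_head *bh)
	{
		/* brelse(bh) would only drop the reference; if bh were dirty
		 * it could still be written out later.  bforget(bh) clears
		 * the dirty bit before releasing, dropping the stale data. */
		bforget(bh);
	}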
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 73ebfb44ad75..7a3832577923 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); 93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94} 94}
95 95
96static int ext4_ext_journal_restart(handle_t *handle, int needed) 96static int ext4_ext_truncate_extend_restart(handle_t *handle,
97 struct inode *inode,
98 int needed)
97{ 99{
98 int err; 100 int err;
99 101
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
104 err = ext4_journal_extend(handle, needed); 106 err = ext4_journal_extend(handle, needed);
105 if (err <= 0) 107 if (err <= 0)
106 return err; 108 return err;
107 return ext4_journal_restart(handle, needed); 109 err = ext4_truncate_restart_trans(handle, inode, needed);
110 /*
111 * We have dropped i_data_sem so someone might have cached again
112 * an extent we are going to truncate.
113 */
114 ext4_ext_invalidate_cache(inode);
115
116 return err;
108} 117}
109 118
110/* 119/*
@@ -220,57 +229,65 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
220 return newblock; 229 return newblock;
221} 230}
222 231
223static int ext4_ext_space_block(struct inode *inode) 232static inline int ext4_ext_space_block(struct inode *inode, int check)
224{ 233{
225 int size; 234 int size;
226 235
227 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 236 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
228 / sizeof(struct ext4_extent); 237 / sizeof(struct ext4_extent);
238 if (!check) {
229#ifdef AGGRESSIVE_TEST 239#ifdef AGGRESSIVE_TEST
230 if (size > 6) 240 if (size > 6)
231 size = 6; 241 size = 6;
232#endif 242#endif
243 }
233 return size; 244 return size;
234} 245}
235 246
236static int ext4_ext_space_block_idx(struct inode *inode) 247static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
237{ 248{
238 int size; 249 int size;
239 250
240 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 251 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
241 / sizeof(struct ext4_extent_idx); 252 / sizeof(struct ext4_extent_idx);
253 if (!check) {
242#ifdef AGGRESSIVE_TEST 254#ifdef AGGRESSIVE_TEST
243 if (size > 5) 255 if (size > 5)
244 size = 5; 256 size = 5;
245#endif 257#endif
258 }
246 return size; 259 return size;
247} 260}
248 261
249static int ext4_ext_space_root(struct inode *inode) 262static inline int ext4_ext_space_root(struct inode *inode, int check)
250{ 263{
251 int size; 264 int size;
252 265
253 size = sizeof(EXT4_I(inode)->i_data); 266 size = sizeof(EXT4_I(inode)->i_data);
254 size -= sizeof(struct ext4_extent_header); 267 size -= sizeof(struct ext4_extent_header);
255 size /= sizeof(struct ext4_extent); 268 size /= sizeof(struct ext4_extent);
269 if (!check) {
256#ifdef AGGRESSIVE_TEST 270#ifdef AGGRESSIVE_TEST
257 if (size > 3) 271 if (size > 3)
258 size = 3; 272 size = 3;
259#endif 273#endif
274 }
260 return size; 275 return size;
261} 276}
262 277
263static int ext4_ext_space_root_idx(struct inode *inode) 278static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
264{ 279{
265 int size; 280 int size;
266 281
267 size = sizeof(EXT4_I(inode)->i_data); 282 size = sizeof(EXT4_I(inode)->i_data);
268 size -= sizeof(struct ext4_extent_header); 283 size -= sizeof(struct ext4_extent_header);
269 size /= sizeof(struct ext4_extent_idx); 284 size /= sizeof(struct ext4_extent_idx);
285 if (!check) {
270#ifdef AGGRESSIVE_TEST 286#ifdef AGGRESSIVE_TEST
271 if (size > 4) 287 if (size > 4)
272 size = 4; 288 size = 4;
273#endif 289#endif
290 }
274 return size; 291 return size;
275} 292}
276 293
@@ -284,9 +301,9 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
284 int lcap, icap, rcap, leafs, idxs, num; 301 int lcap, icap, rcap, leafs, idxs, num;
285 int newextents = blocks; 302 int newextents = blocks;
286 303
287 rcap = ext4_ext_space_root_idx(inode); 304 rcap = ext4_ext_space_root_idx(inode, 0);
288 lcap = ext4_ext_space_block(inode); 305 lcap = ext4_ext_space_block(inode, 0);
289 icap = ext4_ext_space_block_idx(inode); 306 icap = ext4_ext_space_block_idx(inode, 0);
290 307
291 /* number of new leaf blocks needed */ 308 /* number of new leaf blocks needed */
292 num = leafs = (newextents + lcap - 1) / lcap; 309 num = leafs = (newextents + lcap - 1) / lcap;
@@ -311,14 +328,14 @@ ext4_ext_max_entries(struct inode *inode, int depth)
311 328
312 if (depth == ext_depth(inode)) { 329 if (depth == ext_depth(inode)) {
313 if (depth == 0) 330 if (depth == 0)
314 max = ext4_ext_space_root(inode); 331 max = ext4_ext_space_root(inode, 1);
315 else 332 else
316 max = ext4_ext_space_root_idx(inode); 333 max = ext4_ext_space_root_idx(inode, 1);
317 } else { 334 } else {
318 if (depth == 0) 335 if (depth == 0)
319 max = ext4_ext_space_block(inode); 336 max = ext4_ext_space_block(inode, 1);
320 else 337 else
321 max = ext4_ext_space_block_idx(inode); 338 max = ext4_ext_space_block_idx(inode, 1);
322 } 339 }
323 340
324 return max; 341 return max;
@@ -437,8 +454,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
437 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), 454 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
438 idx_pblock(path->p_idx)); 455 idx_pblock(path->p_idx));
439 } else if (path->p_ext) { 456 } else if (path->p_ext) {
440 ext_debug(" %d:%d:%llu ", 457 ext_debug(" %d:[%d]%d:%llu ",
441 le32_to_cpu(path->p_ext->ee_block), 458 le32_to_cpu(path->p_ext->ee_block),
459 ext4_ext_is_uninitialized(path->p_ext),
442 ext4_ext_get_actual_len(path->p_ext), 460 ext4_ext_get_actual_len(path->p_ext),
443 ext_pblock(path->p_ext)); 461 ext_pblock(path->p_ext));
444 } else 462 } else
@@ -460,8 +478,11 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
460 eh = path[depth].p_hdr; 478 eh = path[depth].p_hdr;
461 ex = EXT_FIRST_EXTENT(eh); 479 ex = EXT_FIRST_EXTENT(eh);
462 480
481 ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
482
463 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { 483 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
464 ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), 484 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
485 ext4_ext_is_uninitialized(ex),
465 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 486 ext4_ext_get_actual_len(ex), ext_pblock(ex));
466 } 487 }
467 ext_debug("\n"); 488 ext_debug("\n");
@@ -580,9 +601,10 @@ ext4_ext_binsearch(struct inode *inode,
580 } 601 }
581 602
582 path->p_ext = l - 1; 603 path->p_ext = l - 1;
583 ext_debug(" -> %d:%llu:%d ", 604 ext_debug(" -> %d:%llu:[%d]%d ",
584 le32_to_cpu(path->p_ext->ee_block), 605 le32_to_cpu(path->p_ext->ee_block),
585 ext_pblock(path->p_ext), 606 ext_pblock(path->p_ext),
607 ext4_ext_is_uninitialized(path->p_ext),
586 ext4_ext_get_actual_len(path->p_ext)); 608 ext4_ext_get_actual_len(path->p_ext));
587 609
588#ifdef CHECK_BINSEARCH 610#ifdef CHECK_BINSEARCH
@@ -612,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
612 eh->eh_depth = 0; 634 eh->eh_depth = 0;
613 eh->eh_entries = 0; 635 eh->eh_entries = 0;
614 eh->eh_magic = EXT4_EXT_MAGIC; 636 eh->eh_magic = EXT4_EXT_MAGIC;
615 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode)); 637 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
616 ext4_mark_inode_dirty(handle, inode); 638 ext4_mark_inode_dirty(handle, inode);
617 ext4_ext_invalidate_cache(inode); 639 ext4_ext_invalidate_cache(inode);
618 return 0; 640 return 0;
@@ -837,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
837 859
838 neh = ext_block_hdr(bh); 860 neh = ext_block_hdr(bh);
839 neh->eh_entries = 0; 861 neh->eh_entries = 0;
840 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); 862 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
841 neh->eh_magic = EXT4_EXT_MAGIC; 863 neh->eh_magic = EXT4_EXT_MAGIC;
842 neh->eh_depth = 0; 864 neh->eh_depth = 0;
843 ex = EXT_FIRST_EXTENT(neh); 865 ex = EXT_FIRST_EXTENT(neh);
@@ -850,9 +872,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
850 path[depth].p_ext++; 872 path[depth].p_ext++;
851 while (path[depth].p_ext <= 873 while (path[depth].p_ext <=
852 EXT_MAX_EXTENT(path[depth].p_hdr)) { 874 EXT_MAX_EXTENT(path[depth].p_hdr)) {
853 ext_debug("move %d:%llu:%d in new leaf %llu\n", 875 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
854 le32_to_cpu(path[depth].p_ext->ee_block), 876 le32_to_cpu(path[depth].p_ext->ee_block),
855 ext_pblock(path[depth].p_ext), 877 ext_pblock(path[depth].p_ext),
878 ext4_ext_is_uninitialized(path[depth].p_ext),
856 ext4_ext_get_actual_len(path[depth].p_ext), 879 ext4_ext_get_actual_len(path[depth].p_ext),
857 newblock); 880 newblock);
858 /*memmove(ex++, path[depth].p_ext++, 881 /*memmove(ex++, path[depth].p_ext++,
@@ -912,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
912 neh = ext_block_hdr(bh); 935 neh = ext_block_hdr(bh);
913 neh->eh_entries = cpu_to_le16(1); 936 neh->eh_entries = cpu_to_le16(1);
914 neh->eh_magic = EXT4_EXT_MAGIC; 937 neh->eh_magic = EXT4_EXT_MAGIC;
915 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); 938 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
916 neh->eh_depth = cpu_to_le16(depth - i); 939 neh->eh_depth = cpu_to_le16(depth - i);
917 fidx = EXT_FIRST_INDEX(neh); 940 fidx = EXT_FIRST_INDEX(neh);
918 fidx->ei_block = border; 941 fidx->ei_block = border;
@@ -1037,9 +1060,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1037 /* old root could have indexes or leaves 1060 /* old root could have indexes or leaves
1038 * so calculate e_max right way */ 1061 * so calculate e_max right way */
1039 if (ext_depth(inode)) 1062 if (ext_depth(inode))
1040 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); 1063 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1041 else 1064 else
1042 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); 1065 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1043 neh->eh_magic = EXT4_EXT_MAGIC; 1066 neh->eh_magic = EXT4_EXT_MAGIC;
1044 set_buffer_uptodate(bh); 1067 set_buffer_uptodate(bh);
1045 unlock_buffer(bh); 1068 unlock_buffer(bh);
@@ -1054,7 +1077,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1054 goto out; 1077 goto out;
1055 1078
1056 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; 1079 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
1057 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode)); 1080 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1058 curp->p_hdr->eh_entries = cpu_to_le16(1); 1081 curp->p_hdr->eh_entries = cpu_to_le16(1);
1059 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); 1082 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1060 1083
@@ -1580,9 +1603,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1580 1603
1581 /* try to insert block into found extent and return */ 1604 /* try to insert block into found extent and return */
1582 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { 1605 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
1583 ext_debug("append %d block to %d:%d (from %llu)\n", 1606 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1607 ext4_ext_is_uninitialized(newext),
1584 ext4_ext_get_actual_len(newext), 1608 ext4_ext_get_actual_len(newext),
1585 le32_to_cpu(ex->ee_block), 1609 le32_to_cpu(ex->ee_block),
1610 ext4_ext_is_uninitialized(ex),
1586 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 1611 ext4_ext_get_actual_len(ex), ext_pblock(ex));
1587 err = ext4_ext_get_access(handle, inode, path + depth); 1612 err = ext4_ext_get_access(handle, inode, path + depth);
1588 if (err) 1613 if (err)
@@ -1651,9 +1676,10 @@ has_space:
1651 1676
1652 if (!nearex) { 1677 if (!nearex) {
1653 /* there is no extent in this leaf, create first one */ 1678 /* there is no extent in this leaf, create first one */
1654 ext_debug("first extent in the leaf: %d:%llu:%d\n", 1679 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1655 le32_to_cpu(newext->ee_block), 1680 le32_to_cpu(newext->ee_block),
1656 ext_pblock(newext), 1681 ext_pblock(newext),
1682 ext4_ext_is_uninitialized(newext),
1657 ext4_ext_get_actual_len(newext)); 1683 ext4_ext_get_actual_len(newext));
1658 path[depth].p_ext = EXT_FIRST_EXTENT(eh); 1684 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1659 } else if (le32_to_cpu(newext->ee_block) 1685 } else if (le32_to_cpu(newext->ee_block)
@@ -1663,10 +1689,11 @@ has_space:
1663 len = EXT_MAX_EXTENT(eh) - nearex; 1689 len = EXT_MAX_EXTENT(eh) - nearex;
1664 len = (len - 1) * sizeof(struct ext4_extent); 1690 len = (len - 1) * sizeof(struct ext4_extent);
1665 len = len < 0 ? 0 : len; 1691 len = len < 0 ? 0 : len;
1666 ext_debug("insert %d:%llu:%d after: nearest 0x%p, " 1692 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1667 "move %d from 0x%p to 0x%p\n", 1693 "move %d from 0x%p to 0x%p\n",
1668 le32_to_cpu(newext->ee_block), 1694 le32_to_cpu(newext->ee_block),
1669 ext_pblock(newext), 1695 ext_pblock(newext),
1696 ext4_ext_is_uninitialized(newext),
1670 ext4_ext_get_actual_len(newext), 1697 ext4_ext_get_actual_len(newext),
1671 nearex, len, nearex + 1, nearex + 2); 1698 nearex, len, nearex + 1, nearex + 2);
1672 memmove(nearex + 2, nearex + 1, len); 1699 memmove(nearex + 2, nearex + 1, len);
@@ -1676,10 +1703,11 @@ has_space:
1676 BUG_ON(newext->ee_block == nearex->ee_block); 1703 BUG_ON(newext->ee_block == nearex->ee_block);
1677 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); 1704 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1678 len = len < 0 ? 0 : len; 1705 len = len < 0 ? 0 : len;
1679 ext_debug("insert %d:%llu:%d before: nearest 0x%p, " 1706 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1680 "move %d from 0x%p to 0x%p\n", 1707 "move %d from 0x%p to 0x%p\n",
1681 le32_to_cpu(newext->ee_block), 1708 le32_to_cpu(newext->ee_block),
1682 ext_pblock(newext), 1709 ext_pblock(newext),
1710 ext4_ext_is_uninitialized(newext),
1683 ext4_ext_get_actual_len(newext), 1711 ext4_ext_get_actual_len(newext),
1684 nearex, len, nearex + 1, nearex + 2); 1712 nearex, len, nearex + 1, nearex + 2);
1685 memmove(nearex + 1, nearex, len); 1713 memmove(nearex + 1, nearex, len);
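The ext_debug() format strings in the hunks above grow a bracketed field, and the argument added in each call is ext4_ext_is_uninitialized(), so an extent now prints as logical:physical:[uninit]length; for example, `insert 100:5000:[1]8` is eight uninitialized blocks at logical block 100, physical block 5000. Assuming the mainline encoding, the predicate being printed reads the high bit of ee_len:

    /* Assumption: uninitialized extents are marked by storing a length
     * greater than EXT_INIT_MAX_LEN in ee_len, so the MSB doubles as the
     * uninitialized bit. */
    static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext)
    {
            return le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN;
    }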
@@ -2094,7 +2122,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2094 else 2122 else
2095 uninitialized = 0; 2123 uninitialized = 0;
2096 2124
2097 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); 2125 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2126 uninitialized, ex_ee_len);
2098 path[depth].p_ext = ex; 2127 path[depth].p_ext = ex;
2099 2128
2100 a = ex_ee_block > start ? ex_ee_block : start; 2129 a = ex_ee_block > start ? ex_ee_block : start;
@@ -2138,7 +2167,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2138 } 2167 }
2139 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 2168 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2140 2169
2141 err = ext4_ext_journal_restart(handle, credits); 2170 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2142 if (err) 2171 if (err)
2143 goto out; 2172 goto out;
2144 2173
@@ -2327,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2327 if (err == 0) { 2356 if (err == 0) {
2328 ext_inode_hdr(inode)->eh_depth = 0; 2357 ext_inode_hdr(inode)->eh_depth = 0;
2329 ext_inode_hdr(inode)->eh_max = 2358 ext_inode_hdr(inode)->eh_max =
2330 cpu_to_le16(ext4_ext_space_root(inode)); 2359 cpu_to_le16(ext4_ext_space_root(inode, 0));
2331 err = ext4_ext_dirty(handle, inode, path); 2360 err = ext4_ext_dirty(handle, inode, path);
2332 } 2361 }
2333 } 2362 }
@@ -2743,6 +2772,7 @@ insert:
2743 } else if (err) 2772 } else if (err)
2744 goto fix_extent_len; 2773 goto fix_extent_len;
2745out: 2774out:
2775 ext4_ext_show_leaf(inode, path);
2746 return err ? err : allocated; 2776 return err ? err : allocated;
2747 2777
2748fix_extent_len: 2778fix_extent_len:
@@ -2786,7 +2816,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2786 struct ext4_allocation_request ar; 2816 struct ext4_allocation_request ar;
2787 2817
2788 __clear_bit(BH_New, &bh_result->b_state); 2818 __clear_bit(BH_New, &bh_result->b_state);
2789 ext_debug("blocks %u/%u requested for inode %u\n", 2819 ext_debug("blocks %u/%u requested for inode %lu\n",
2790 iblock, max_blocks, inode->i_ino); 2820 iblock, max_blocks, inode->i_ino);
2791 2821
2792 /* check in cache */ 2822 /* check in cache */
@@ -2849,7 +2879,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2849 newblock = iblock - ee_block + ee_start; 2879 newblock = iblock - ee_block + ee_start;
2850 /* number of remaining blocks in the extent */ 2880 /* number of remaining blocks in the extent */
2851 allocated = ee_len - (iblock - ee_block); 2881 allocated = ee_len - (iblock - ee_block);
2852 ext_debug("%u fit into %lu:%d -> %llu\n", iblock, 2882 ext_debug("%u fit into %u:%d -> %llu\n", iblock,
2853 ee_block, ee_len, newblock); 2883 ee_block, ee_len, newblock);
2854 2884
2855 /* Do not put uninitialized extent in the cache */ 2885 /* Do not put uninitialized extent in the cache */
@@ -2950,7 +2980,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2950 newblock = ext4_mb_new_blocks(handle, &ar, &err); 2980 newblock = ext4_mb_new_blocks(handle, &ar, &err);
2951 if (!newblock) 2981 if (!newblock)
2952 goto out2; 2982 goto out2;
2953 ext_debug("allocate new block: goal %llu, found %llu/%lu\n", 2983 ext_debug("allocate new block: goal %llu, found %llu/%u\n",
2954 ar.goal, newblock, allocated); 2984 ar.goal, newblock, allocated);
2955 2985
2956 /* try to insert new extent into found leaf and return */ 2986 /* try to insert new extent into found leaf and return */
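The remaining changes in this file are printk format fixes: inode->i_ino is an unsigned long and needs %lu, while ext4_lblk_t block numbers and the allocated count are 32-bit and need %u. The rule in a standalone, compilable form:

    #include <stdio.h>

    typedef unsigned int ext4_lblk_t;   /* 32-bit logical block number */

    int main(void)
    {
            unsigned long ino = 12;     /* i_ino is unsigned long */
            ext4_lblk_t block = 34;

            /* %lu for unsigned long, %u for 32-bit unsigned: must match */
            printf("blocks %u requested for inode %lu\n", block, ino);
            return 0;
    }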
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 83cf6415f599..07475740b512 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -50,7 +50,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
50{ 50{
51 struct inode *inode = dentry->d_inode; 51 struct inode *inode = dentry->d_inode;
52 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 52 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
53 int ret = 0; 53 int err, ret = 0;
54 54
55 J_ASSERT(ext4_journal_current_handle() == NULL); 55 J_ASSERT(ext4_journal_current_handle() == NULL);
56 56
@@ -79,6 +79,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
79 goto out; 79 goto out;
80 } 80 }
81 81
82 if (!journal)
83 ret = sync_mapping_buffers(inode->i_mapping);
84
82 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 85 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
83 goto out; 86 goto out;
84 87
@@ -91,10 +94,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
91 .sync_mode = WB_SYNC_ALL, 94 .sync_mode = WB_SYNC_ALL,
92 .nr_to_write = 0, /* sys_fsync did this */ 95 .nr_to_write = 0, /* sys_fsync did this */
93 }; 96 };
94 ret = sync_inode(inode, &wbc); 97 err = sync_inode(inode, &wbc);
95 if (journal && (journal->j_flags & JBD2_BARRIER)) 98 if (ret == 0)
96 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 99 ret = err;
97 } 100 }
98out: 101out:
102 if (journal && (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
99 return ret; 104 return ret;
100} 105}
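Two behavioural changes in ext4_sync_file(): in no-journal mode the inode's dirty buffers are now pushed out with sync_mapping_buffers(), and the barrier flush runs on every exit path rather than only after the data=writeback sync_inode() call. The resulting shape of the function, as a simplified sketch (not verbatim):

    /* Simplified control flow after this patch */
    if (!journal)
            ret = sync_mapping_buffers(inode->i_mapping);  /* no-journal mode */
    if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
            goto out;
    /* ... force a commit, or sync_inode() for data=writeback ... */
    out:
            if (journal && (journal->j_flags & JBD2_BARRIER))
                    blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
            return ret;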
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 29e6dc7299b8..f3624ead4f6c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1189 1189
1190 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); 1190 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
1191 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 1191 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
1192 i, ext4_free_inodes_count(sb, gdp), x); 1192 (unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
1193 bitmap_count += x; 1193 bitmap_count += x;
1194 } 1194 }
1195 brelse(bitmap_bh); 1195 brelse(bitmap_bh);
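The group counter `i` here is an ext4_group_t (an unsigned int), so printing it with %lu is undefined behaviour on targets where int and long differ; the cast makes the format portable. In miniature:

    #include <stdio.h>

    typedef unsigned int ext4_group_t;

    int main(void)
    {
            ext4_group_t i = 7;

            /* %lu expects unsigned long; the cast keeps the format correct */
            printf("group %lu\n", (unsigned long) i);
            return 0;
    }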
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 349dd6b4da47..064746fad581 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -192,11 +192,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
192 * so before we call here everything must be consistently dirtied against 192 * so before we call here everything must be consistently dirtied against
193 * this transaction. 193 * this transaction.
194 */ 194 */
195static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 195 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
196 int nblocks)
196{ 197{
198 int ret;
199
200 /*
201 * Drop i_data_sem to avoid deadlock with ext4_get_blocks. At this
202 * moment, get_block can be called only for blocks inside i_size since
203 * the page cache has already been dropped and writes are blocked by
204 * i_mutex. So we can safely drop the i_data_sem here.
205 */
197 BUG_ON(EXT4_JOURNAL(inode) == NULL); 206 BUG_ON(EXT4_JOURNAL(inode) == NULL);
198 jbd_debug(2, "restarting handle %p\n", handle); 207 jbd_debug(2, "restarting handle %p\n", handle);
199 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 208 up_write(&EXT4_I(inode)->i_data_sem);
209 ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
210 down_write(&EXT4_I(inode)->i_data_sem);
211
212 return ret;
200} 213}
201 214
202/* 215/*
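Note that the renamed helper accepts an nblocks parameter but, in this version, still restarts the handle with blocks_for_truncate(inode). That is harmless only as long as every caller passes that same value, which the call sites converted later in this patch do:

    /* All converted call sites in this patch follow this pattern: */
    ext4_truncate_restart_trans(handle, inode, blocks_for_truncate(inode));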
@@ -341,9 +354,7 @@ static int ext4_block_to_path(struct inode *inode,
341 int n = 0; 354 int n = 0;
342 int final = 0; 355 int final = 0;
343 356
344 if (i_block < 0) { 357 if (i_block < direct_blocks) {
345 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
346 } else if (i_block < direct_blocks) {
347 offsets[n++] = i_block; 358 offsets[n++] = i_block;
348 final = direct_blocks; 359 final = direct_blocks;
349 } else if ((i_block -= direct_blocks) < indirect_blocks) { 360 } else if ((i_block -= direct_blocks) < indirect_blocks) {
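The deleted branch could never fire: i_block is an ext4_lblk_t, an unsigned 32-bit type, so `i_block < 0` is always false (most compilers warn about the comparison). A standalone demonstration:

    #include <stdio.h>

    typedef unsigned int ext4_lblk_t;

    int main(void)
    {
            ext4_lblk_t i_block = (ext4_lblk_t) -5;  /* wraps to 4294967291 */

            if (i_block < 0)                 /* always false for unsigned */
                    printf("unreachable\n");
            printf("%u\n", i_block);
            return 0;
    }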
@@ -551,15 +562,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
551 * 562 *
552 * Normally this function finds the preferred place for block allocation 563 * Normally this function finds the preferred place for block allocation
553 * and returns it. 564 * and returns it.
565 * Because this is only used for non-extent files, we limit the block nr
566 * to 32 bits.
554 */ 567 */
555static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 568static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
556 Indirect *partial) 569 Indirect *partial)
557{ 570{
571 ext4_fsblk_t goal;
572
558 /* 573 /*
559 * XXX need to get goal block from mballoc's data structures 574 * XXX need to get goal block from mballoc's data structures
560 */ 575 */
561 576
562 return ext4_find_near(inode, partial); 577 goal = ext4_find_near(inode, partial);
578 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
579 return goal;
563} 580}
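The goal is masked because indirect-map (non-extent) files can only address 32-bit physical block numbers. Assuming EXT4_MAX_BLOCK_FILE_PHYS is 0xFFFFFFFF (2^32 - 1), the clamp in isolation:

    #include <stdint.h>
    #include <stdio.h>

    #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFFULL  /* assumed value: 2^32 - 1 */

    int main(void)
    {
            uint64_t goal = 0x123456789ULL;          /* hint beyond 2^32 */

            goal &= EXT4_MAX_BLOCK_FILE_PHYS;        /* keep the hint addressable */
            printf("0x%llx\n", (unsigned long long) goal);  /* prints 0x23456789 */
            return 0;
    }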
564 581
565/** 582/**
@@ -640,6 +657,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
640 if (*err) 657 if (*err)
641 goto failed_out; 658 goto failed_out;
642 659
660 BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
661
643 target -= count; 662 target -= count;
644 /* allocate blocks for indirect blocks */ 663 /* allocate blocks for indirect blocks */
645 while (index < indirect_blks && count) { 664 while (index < indirect_blks && count) {
@@ -674,6 +693,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
674 ar.flags = EXT4_MB_HINT_DATA; 693 ar.flags = EXT4_MB_HINT_DATA;
675 694
676 current_block = ext4_mb_new_blocks(handle, &ar, err); 695 current_block = ext4_mb_new_blocks(handle, &ar, err);
696 BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
677 697
678 if (*err && (target == blks)) { 698 if (*err && (target == blks)) {
679 /* 699 /*
@@ -762,8 +782,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
762 BUFFER_TRACE(bh, "call get_create_access"); 782 BUFFER_TRACE(bh, "call get_create_access");
763 err = ext4_journal_get_create_access(handle, bh); 783 err = ext4_journal_get_create_access(handle, bh);
764 if (err) { 784 if (err) {
785 /* Don't brelse(bh) here; it's done in
786 * ext4_journal_forget() below */
765 unlock_buffer(bh); 787 unlock_buffer(bh);
766 brelse(bh);
767 goto failed; 788 goto failed;
768 } 789 }
769 790
@@ -1109,16 +1130,15 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1109 ext4_discard_preallocations(inode); 1130 ext4_discard_preallocations(inode);
1110} 1131}
1111 1132
1112static int check_block_validity(struct inode *inode, sector_t logical, 1133static int check_block_validity(struct inode *inode, const char *msg,
1113 sector_t phys, int len) 1134 sector_t logical, sector_t phys, int len)
1114{ 1135{
1115 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1136 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1116 ext4_error(inode->i_sb, "check_block_validity", 1137 ext4_error(inode->i_sb, msg,
1117 "inode #%lu logical block %llu mapped to %llu " 1138 "inode #%lu logical block %llu mapped to %llu "
1118 "(size %d)", inode->i_ino, 1139 "(size %d)", inode->i_ino,
1119 (unsigned long long) logical, 1140 (unsigned long long) logical,
1120 (unsigned long long) phys, len); 1141 (unsigned long long) phys, len);
1121 WARN_ON(1);
1122 return -EIO; 1142 return -EIO;
1123 } 1143 }
1124 return 0; 1144 return 0;
@@ -1170,8 +1190,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1170 up_read((&EXT4_I(inode)->i_data_sem)); 1190 up_read((&EXT4_I(inode)->i_data_sem));
1171 1191
1172 if (retval > 0 && buffer_mapped(bh)) { 1192 if (retval > 0 && buffer_mapped(bh)) {
1173 int ret = check_block_validity(inode, block, 1193 int ret = check_block_validity(inode, "file system corruption",
1174 bh->b_blocknr, retval); 1194 block, bh->b_blocknr, retval);
1175 if (ret != 0) 1195 if (ret != 0)
1176 return ret; 1196 return ret;
1177 } 1197 }
@@ -1235,8 +1255,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1235 * i_data's format changing. Force the migrate 1255 * i_data's format changing. Force the migrate
1236 * to fail by clearing migrate flags 1256 * to fail by clearing migrate flags
1237 */ 1257 */
1238 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & 1258 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
1239 ~EXT4_EXT_MIGRATE;
1240 } 1259 }
1241 } 1260 }
1242 1261
@@ -1252,8 +1271,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1252 1271
1253 up_write((&EXT4_I(inode)->i_data_sem)); 1272 up_write((&EXT4_I(inode)->i_data_sem));
1254 if (retval > 0 && buffer_mapped(bh)) { 1273 if (retval > 0 && buffer_mapped(bh)) {
1255 int ret = check_block_validity(inode, block, 1274 int ret = check_block_validity(inode, "file system "
1256 bh->b_blocknr, retval); 1275 "corruption after allocation",
1276 block, bh->b_blocknr, retval);
1257 if (ret != 0) 1277 if (ret != 0)
1258 return ret; 1278 return ret;
1259 } 1279 }
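check_block_validity() now takes a caller-supplied message instead of unconditionally WARNing, so the log distinguishes corruption found on lookup from corruption found right after allocation. A hypothetical third caller would simply pass its own context string:

    /* Hypothetical call site; the msg string is free-form context */
    ret = check_block_validity(inode, "file system corruption on readahead",
                               block, bh->b_blocknr, retval);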
@@ -1863,18 +1883,6 @@ static void ext4_da_page_release_reservation(struct page *page,
1863 * Delayed allocation stuff 1883 * Delayed allocation stuff
1864 */ 1884 */
1865 1885
1866struct mpage_da_data {
1867 struct inode *inode;
1868 sector_t b_blocknr; /* start block number of extent */
1869 size_t b_size; /* size of extent */
1870 unsigned long b_state; /* state of the extent */
1871 unsigned long first_page, next_page; /* extent of pages */
1872 struct writeback_control *wbc;
1873 int io_done;
1874 int pages_written;
1875 int retval;
1876};
1877
1878/* 1886/*
1879 * mpage_da_submit_io - walks through extent of pages and try to write 1887 * mpage_da_submit_io - walks through extent of pages and try to write
1880 * them with writepage() call back 1888 * them with writepage() call back
@@ -2329,7 +2337,7 @@ static int __mpage_da_writepage(struct page *page,
2329 /* 2337 /*
2330 * Rest of the page in the page_vec 2338 * Rest of the page in the page_vec
2331 * redirty them and skip them. We will 2339 * redirty them and skip them. We will
2332 * try to to write them again after 2340 * try to write them again after
2333 * starting a new transaction 2341 * starting a new transaction
2334 */ 2342 */
2335 redirty_page_for_writepage(wbc, page); 2343 redirty_page_for_writepage(wbc, page);
@@ -2737,6 +2745,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2737 long pages_skipped; 2745 long pages_skipped;
2738 int range_cyclic, cycled = 1, io_done = 0; 2746 int range_cyclic, cycled = 1, io_done = 0;
2739 int needed_blocks, ret = 0, nr_to_writebump = 0; 2747 int needed_blocks, ret = 0, nr_to_writebump = 0;
2748 loff_t range_start = wbc->range_start;
2740 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2749 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2741 2750
2742 trace_ext4_da_writepages(inode, wbc); 2751 trace_ext4_da_writepages(inode, wbc);
@@ -2850,6 +2859,7 @@ retry:
2850 mpd.io_done = 1; 2859 mpd.io_done = 1;
2851 ret = MPAGE_DA_EXTENT_TAIL; 2860 ret = MPAGE_DA_EXTENT_TAIL;
2852 } 2861 }
2862 trace_ext4_da_write_pages(inode, &mpd);
2853 wbc->nr_to_write -= mpd.pages_written; 2863 wbc->nr_to_write -= mpd.pages_written;
2854 2864
2855 ext4_journal_stop(handle); 2865 ext4_journal_stop(handle);
@@ -2905,6 +2915,7 @@ out_writepages:
2905 if (!no_nrwrite_index_update) 2915 if (!no_nrwrite_index_update)
2906 wbc->no_nrwrite_index_update = 0; 2916 wbc->no_nrwrite_index_update = 0;
2907 wbc->nr_to_write -= nr_to_writebump; 2917 wbc->nr_to_write -= nr_to_writebump;
2918 wbc->range_start = range_start;
2908 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 2919 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2909 return ret; 2920 return ret;
2910} 2921}
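The writeback loop advances wbc->range_start as it works, so without the save on entry and restore on exit the caller's writeback_control would come back mutated. The idiom in isolation (names are illustrative, not from the patch):

    struct writeback_ctl { long long range_start; };

    static void writepages(struct writeback_ctl *wbc)
    {
            long long range_start = wbc->range_start;  /* save caller's value */

            wbc->range_start += 4096;                  /* the loop mutates it */
            /* ... */
            wbc->range_start = range_start;            /* restore before return */
    }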
@@ -3117,6 +3128,8 @@ out:
3117 */ 3128 */
3118int ext4_alloc_da_blocks(struct inode *inode) 3129int ext4_alloc_da_blocks(struct inode *inode)
3119{ 3130{
3131 trace_ext4_alloc_da_blocks(inode);
3132
3120 if (!EXT4_I(inode)->i_reserved_data_blocks && 3133 if (!EXT4_I(inode)->i_reserved_data_blocks &&
3121 !EXT4_I(inode)->i_reserved_meta_blocks) 3134 !EXT4_I(inode)->i_reserved_meta_blocks)
3122 return 0; 3135 return 0;
@@ -3663,7 +3676,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3663 ext4_handle_dirty_metadata(handle, inode, bh); 3676 ext4_handle_dirty_metadata(handle, inode, bh);
3664 } 3677 }
3665 ext4_mark_inode_dirty(handle, inode); 3678 ext4_mark_inode_dirty(handle, inode);
3666 ext4_journal_test_restart(handle, inode); 3679 ext4_truncate_restart_trans(handle, inode,
3680 blocks_for_truncate(inode));
3667 if (bh) { 3681 if (bh) {
3668 BUFFER_TRACE(bh, "retaking write access"); 3682 BUFFER_TRACE(bh, "retaking write access");
3669 ext4_journal_get_write_access(handle, bh); 3683 ext4_journal_get_write_access(handle, bh);
@@ -3874,7 +3888,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3874 return; 3888 return;
3875 if (try_to_extend_transaction(handle, inode)) { 3889 if (try_to_extend_transaction(handle, inode)) {
3876 ext4_mark_inode_dirty(handle, inode); 3890 ext4_mark_inode_dirty(handle, inode);
3877 ext4_journal_test_restart(handle, inode); 3891 ext4_truncate_restart_trans(handle, inode,
3892 blocks_for_truncate(inode));
3878 } 3893 }
3879 3894
3880 ext4_free_blocks(handle, inode, nr, 1, 1); 3895 ext4_free_blocks(handle, inode, nr, 1, 1);
@@ -3962,8 +3977,7 @@ void ext4_truncate(struct inode *inode)
3962 if (!ext4_can_truncate(inode)) 3977 if (!ext4_can_truncate(inode))
3963 return; 3978 return;
3964 3979
3965 if (ei->i_disksize && inode->i_size == 0 && 3980 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3966 !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3967 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 3981 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
3968 3982
3969 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3983 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4537,7 +4551,8 @@ static int ext4_inode_blocks_set(handle_t *handle,
4537 */ 4551 */
4538static int ext4_do_update_inode(handle_t *handle, 4552static int ext4_do_update_inode(handle_t *handle,
4539 struct inode *inode, 4553 struct inode *inode,
4540 struct ext4_iloc *iloc) 4554 struct ext4_iloc *iloc,
4555 int do_sync)
4541{ 4556{
4542 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 4557 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4543 struct ext4_inode_info *ei = EXT4_I(inode); 4558 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4585,8 +4600,7 @@ static int ext4_do_update_inode(handle_t *handle,
4585 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 4600 if (ext4_inode_blocks_set(handle, raw_inode, ei))
4586 goto out_brelse; 4601 goto out_brelse;
4587 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4602 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
4588 /* clear the migrate flag in the raw_inode */ 4603 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
4589 raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
4590 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4604 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
4591 cpu_to_le32(EXT4_OS_HURD)) 4605 cpu_to_le32(EXT4_OS_HURD))
4592 raw_inode->i_file_acl_high = 4606 raw_inode->i_file_acl_high =
@@ -4639,10 +4653,22 @@ static int ext4_do_update_inode(handle_t *handle,
4639 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4653 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4640 } 4654 }
4641 4655
4642 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4656 /*
4643 rc = ext4_handle_dirty_metadata(handle, inode, bh); 4657 * If we're not using a journal and we were called from
4644 if (!err) 4658 * ext4_write_inode() to sync the inode (making do_sync true),
4645 err = rc; 4659 * we can just use sync_dirty_buffer() directly to do our dirty
4660 * work. Testing s_journal here is a bit redundant but it's
4661 * worth it to avoid potential future trouble.
4662 */
4663 if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
4664 BUFFER_TRACE(bh, "call sync_dirty_buffer");
4665 sync_dirty_buffer(bh);
4666 } else {
4667 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4668 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4669 if (!err)
4670 err = rc;
4671 }
4646 ei->i_state &= ~EXT4_STATE_NEW; 4672 ei->i_state &= ~EXT4_STATE_NEW;
4647 4673
4648out_brelse: 4674out_brelse:
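With no journal there is no transaction to attach the inode buffer to, so a synchronous update writes the buffer out directly; every other case still goes through the (possibly no-op) journalling wrapper. Condensed:

    /* Sketch of the dirty/flush decision after this change */
    if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync)
            sync_dirty_buffer(bh);          /* write the buffer and wait */
    else
            rc = ext4_handle_dirty_metadata(handle, inode, bh);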
@@ -4688,19 +4714,32 @@ out_brelse:
4688 */ 4714 */
4689int ext4_write_inode(struct inode *inode, int wait) 4715int ext4_write_inode(struct inode *inode, int wait)
4690{ 4716{
4717 int err;
4718
4691 if (current->flags & PF_MEMALLOC) 4719 if (current->flags & PF_MEMALLOC)
4692 return 0; 4720 return 0;
4693 4721
4694 if (ext4_journal_current_handle()) { 4722 if (EXT4_SB(inode->i_sb)->s_journal) {
4695 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 4723 if (ext4_journal_current_handle()) {
4696 dump_stack(); 4724 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
4697 return -EIO; 4725 dump_stack();
4698 } 4726 return -EIO;
4727 }
4699 4728
4700 if (!wait) 4729 if (!wait)
4701 return 0; 4730 return 0;
4731
4732 err = ext4_force_commit(inode->i_sb);
4733 } else {
4734 struct ext4_iloc iloc;
4702 4735
4703 return ext4_force_commit(inode->i_sb); 4736 err = ext4_get_inode_loc(inode, &iloc);
4737 if (err)
4738 return err;
4739 err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
4740 inode, &iloc, wait);
4741 }
4742 return err;
4704} 4743}
4705 4744
4706/* 4745/*
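EXT4_NOJOURNAL_HANDLE is passed where a handle_t * is expected; presumably (the definition is not shown in this hunk) it is a small non-NULL sentinel marking "running without a journal", something like:

    /* Assumed definition, not shown in this patch */
    #define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)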
@@ -4994,7 +5033,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
4994 get_bh(iloc->bh); 5033 get_bh(iloc->bh);
4995 5034
4996 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5035 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
4997 err = ext4_do_update_inode(handle, inode, iloc); 5036 err = ext4_do_update_inode(handle, inode, iloc, 0);
4998 put_bh(iloc->bh); 5037 put_bh(iloc->bh);
4999 return err; 5038 return err;
5000} 5039}
@@ -5285,12 +5324,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5285 else 5324 else
5286 len = PAGE_CACHE_SIZE; 5325 len = PAGE_CACHE_SIZE;
5287 5326
5327 lock_page(page);
5328 /*
5329 * Return if we have all the buffers mapped. This avoids
5330 * the need to call write_begin/write_end, which does a
5331 * journal_start/journal_stop that can block and take a
5332 * long time
5333 */
5288 if (page_has_buffers(page)) { 5334 if (page_has_buffers(page)) {
5289 /* return if we have all the buffers mapped */
5290 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5335 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5291 ext4_bh_unmapped)) 5336 ext4_bh_unmapped)) {
5337 unlock_page(page);
5292 goto out_unlock; 5338 goto out_unlock;
5339 }
5293 } 5340 }
5341 unlock_page(page);
5294 /* 5342 /*
5295 * OK, we need to fill the hole... Do write_begin write_end 5343 * OK, we need to fill the hole... Do write_begin write_end
5296 * to do block allocation/reservation. We are not holding 5344 * to do block allocation/reservation. We are not holding
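The point of the new lock_page() is that the "all buffers mapped" test is only meaningful while the page cannot change underneath us; truncate or writeback could otherwise detach or remap buffers between the check and the early return. The pattern in isolation:

    lock_page(page);
    if (page_has_buffers(page) &&
        !walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
                           ext4_bh_unmapped)) {
            unlock_page(page);
            goto out_unlock;        /* fully mapped: nothing to allocate */
    }
    unlock_page(page);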
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7050a9cd04a4..c1cdf613e725 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -243,10 +243,9 @@ setversion_out:
243 me.donor_start, me.len, &me.moved_len); 243 me.donor_start, me.len, &me.moved_len);
244 fput(donor_filp); 244 fput(donor_filp);
245 245
246 if (!err) 246 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
247 if (copy_to_user((struct move_extent *)arg, 247 return -EFAULT;
248 &me, sizeof(me))) 248
249 return -EFAULT;
250 return err; 249 return err;
251 } 250 }
252 251
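moved_len reports how far the move got, so it must reach userspace even when the operation fails partway through; hence copy_to_user() now runs unconditionally rather than only on success. From the caller's side (a hypothetical userspace sketch):

    struct move_extent me = { /* orig/donor fds and block ranges */ };
    int err = ioctl(fd, EXT4_IOC_MOVE_EXT, &me);

    if (err)
            fprintf(stderr, "moved %llu blocks before failing\n",
                    (unsigned long long) me.moved_len);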
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd258463e2a9..e9c61896d605 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <linux/debugfs.h>
25#include <trace/events/ext4.h> 26#include <trace/events/ext4.h>
26 27
27/* 28/*
@@ -622,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
622 623
623/* FIXME!! need more doc */ 624/* FIXME!! need more doc */
624static void ext4_mb_mark_free_simple(struct super_block *sb, 625static void ext4_mb_mark_free_simple(struct super_block *sb,
625 void *buddy, unsigned first, int len, 626 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
626 struct ext4_group_info *grp) 627 struct ext4_group_info *grp)
627{ 628{
628 struct ext4_sb_info *sbi = EXT4_SB(sb); 629 struct ext4_sb_info *sbi = EXT4_SB(sb);
629 unsigned short min; 630 ext4_grpblk_t min;
630 unsigned short max; 631 ext4_grpblk_t max;
631 unsigned short chunk; 632 ext4_grpblk_t chunk;
632 unsigned short border; 633 unsigned short border;
633 634
634 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); 635 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
@@ -662,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb,
662 void *buddy, void *bitmap, ext4_group_t group) 663 void *buddy, void *bitmap, ext4_group_t group)
663{ 664{
664 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 665 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
665 unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); 666 ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
666 unsigned short i = 0; 667 ext4_grpblk_t i = 0;
667 unsigned short first; 668 ext4_grpblk_t first;
668 unsigned short len; 669 ext4_grpblk_t len;
669 unsigned free = 0; 670 unsigned free = 0;
670 unsigned fragments = 0; 671 unsigned fragments = 0;
671 unsigned long long period = get_cycles(); 672 unsigned long long period = get_cycles();
@@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
743 char *data; 744 char *data;
744 char *bitmap; 745 char *bitmap;
745 746
746 mb_debug("init page %lu\n", page->index); 747 mb_debug(1, "init page %lu\n", page->index);
747 748
748 inode = page->mapping->host; 749 inode = page->mapping->host;
749 sb = inode->i_sb; 750 sb = inode->i_sb;
@@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
822 set_bitmap_uptodate(bh[i]); 823 set_bitmap_uptodate(bh[i]);
823 bh[i]->b_end_io = end_buffer_read_sync; 824 bh[i]->b_end_io = end_buffer_read_sync;
824 submit_bh(READ, bh[i]); 825 submit_bh(READ, bh[i]);
825 mb_debug("read bitmap for group %u\n", first_group + i); 826 mb_debug(1, "read bitmap for group %u\n", first_group + i);
826 } 827 }
827 828
828 /* wait for I/O completion */ 829 /* wait for I/O completion */
@@ -862,12 +863,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
862 if ((first_block + i) & 1) { 863 if ((first_block + i) & 1) {
863 /* this is block of buddy */ 864 /* this is block of buddy */
864 BUG_ON(incore == NULL); 865 BUG_ON(incore == NULL);
865 mb_debug("put buddy for group %u in page %lu/%x\n", 866 mb_debug(1, "put buddy for group %u in page %lu/%x\n",
866 group, page->index, i * blocksize); 867 group, page->index, i * blocksize);
867 grinfo = ext4_get_group_info(sb, group); 868 grinfo = ext4_get_group_info(sb, group);
868 grinfo->bb_fragments = 0; 869 grinfo->bb_fragments = 0;
869 memset(grinfo->bb_counters, 0, 870 memset(grinfo->bb_counters, 0,
870 sizeof(unsigned short)*(sb->s_blocksize_bits+2)); 871 sizeof(*grinfo->bb_counters) *
872 (sb->s_blocksize_bits+2));
871 /* 873 /*
872 * incore got set to the group block bitmap below 874 * incore got set to the group block bitmap below
873 */ 875 */
@@ -878,7 +880,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
878 } else { 880 } else {
879 /* this is block of bitmap */ 881 /* this is block of bitmap */
880 BUG_ON(incore != NULL); 882 BUG_ON(incore != NULL);
881 mb_debug("put bitmap for group %u in page %lu/%x\n", 883 mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
882 group, page->index, i * blocksize); 884 group, page->index, i * blocksize);
883 885
884 /* see comments in ext4_mb_put_pa() */ 886 /* see comments in ext4_mb_put_pa() */
@@ -908,6 +910,100 @@ out:
908 return err; 910 return err;
909} 911}
910 912
913static noinline_for_stack
914int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
915{
916
917 int ret = 0;
918 void *bitmap;
919 int blocks_per_page;
920 int block, pnum, poff;
921 int num_grp_locked = 0;
922 struct ext4_group_info *this_grp;
923 struct ext4_sb_info *sbi = EXT4_SB(sb);
924 struct inode *inode = sbi->s_buddy_cache;
925 struct page *page = NULL, *bitmap_page = NULL;
926
927 mb_debug(1, "init group %u\n", group);
928 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
929 this_grp = ext4_get_group_info(sb, group);
930 /*
931 * This ensures that we don't reinit the buddy cache
932 * page which map to the group from which we are already
933 * allocating. If we are looking at the buddy cache we would
934 * have taken a reference using ext4_mb_load_buddy and that
935 * would have taken the alloc_sem lock.
936 */
937 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
938 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
939 /*
940 * somebody initialized the group
941 * return without doing anything
942 */
943 ret = 0;
944 goto err;
945 }
946 /*
947 * the buddy cache inode stores the block bitmap
948 * and buddy information in consecutive blocks.
949 * So for each group we need two blocks.
950 */
951 block = group * 2;
952 pnum = block / blocks_per_page;
953 poff = block % blocks_per_page;
954 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
955 if (page) {
956 BUG_ON(page->mapping != inode->i_mapping);
957 ret = ext4_mb_init_cache(page, NULL);
958 if (ret) {
959 unlock_page(page);
960 goto err;
961 }
962 unlock_page(page);
963 }
964 if (page == NULL || !PageUptodate(page)) {
965 ret = -EIO;
966 goto err;
967 }
968 mark_page_accessed(page);
969 bitmap_page = page;
970 bitmap = page_address(page) + (poff * sb->s_blocksize);
971
972 /* init buddy cache */
973 block++;
974 pnum = block / blocks_per_page;
975 poff = block % blocks_per_page;
976 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
977 if (page == bitmap_page) {
978 /*
979 * If both the bitmap and buddy are in
980 * the same page we don't need to force
981 * init the buddy
982 */
983 unlock_page(page);
984 } else if (page) {
985 BUG_ON(page->mapping != inode->i_mapping);
986 ret = ext4_mb_init_cache(page, bitmap);
987 if (ret) {
988 unlock_page(page);
989 goto err;
990 }
991 unlock_page(page);
992 }
993 if (page == NULL || !PageUptodate(page)) {
994 ret = -EIO;
995 goto err;
996 }
997 mark_page_accessed(page);
998err:
999 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1000 if (bitmap_page)
1001 page_cache_release(bitmap_page);
1002 if (page)
1003 page_cache_release(page);
1004 return ret;
1005}
1006
911static noinline_for_stack int 1007static noinline_for_stack int
912ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1008ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
913 struct ext4_buddy *e4b) 1009 struct ext4_buddy *e4b)
@@ -922,7 +1018,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
922 struct ext4_sb_info *sbi = EXT4_SB(sb); 1018 struct ext4_sb_info *sbi = EXT4_SB(sb);
923 struct inode *inode = sbi->s_buddy_cache; 1019 struct inode *inode = sbi->s_buddy_cache;
924 1020
925 mb_debug("load group %u\n", group); 1021 mb_debug(1, "load group %u\n", group);
926 1022
927 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1023 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
928 grp = ext4_get_group_info(sb, group); 1024 grp = ext4_get_group_info(sb, group);
@@ -941,8 +1037,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
941 * groups mapped by the page is blocked 1037 * groups mapped by the page is blocked
942 * till we are done with allocation 1038 * till we are done with allocation
943 */ 1039 */
1040repeat_load_buddy:
944 down_read(e4b->alloc_semp); 1041 down_read(e4b->alloc_semp);
945 1042
1043 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1044 /* we need to check the group's NEED_INIT flag
1045 * with alloc_semp held so that we can be sure
1046 * that new blocks didn't get added to the group
1047 * when we are loading the buddy cache
1048 */
1049 up_read(e4b->alloc_semp);
1050 /*
1051 * we need full data about the group
1052 * to make a good selection
1053 */
1054 ret = ext4_mb_init_group(sb, group);
1055 if (ret)
1056 return ret;
1057 goto repeat_load_buddy;
1058 }
1059
946 /* 1060 /*
947 * the buddy cache inode stores the block bitmap 1061 * the buddy cache inode stores the block bitmap
948 * and buddy information in consecutive blocks. 1062 * and buddy information in consecutive blocks.
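ext4_mb_load_buddy() now owns group initialization: the NEED_INIT flag is rechecked under alloc_semp, and if it is set the lock is dropped, ext4_mb_init_group() builds the bitmap and buddy pages, and the load retries so the flag is tested under the lock again. This is why the per-group init calls disappear from the scan loop in ext4_mb_regular_allocator() below. Condensed:

    repeat_load_buddy:
            down_read(e4b->alloc_semp);
            if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
                    up_read(e4b->alloc_semp);       /* can't init under the lock */
                    ret = ext4_mb_init_group(sb, group);
                    if (ret)
                            return ret;
                    goto repeat_load_buddy;         /* recheck under the lock */
            }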
@@ -1360,7 +1474,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1360 ac->alloc_semp = e4b->alloc_semp; 1474 ac->alloc_semp = e4b->alloc_semp;
1361 e4b->alloc_semp = NULL; 1475 e4b->alloc_semp = NULL;
1362 /* store last allocated for subsequent stream allocation */ 1476 /* store last allocated for subsequent stream allocation */
1363 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { 1477 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1364 spin_lock(&sbi->s_md_lock); 1478 spin_lock(&sbi->s_md_lock);
1365 sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 1479 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1366 sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 1480 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
@@ -1837,97 +1951,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1837 1951
1838} 1952}
1839 1953
1840static noinline_for_stack
1841int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1842{
1843
1844 int ret;
1845 void *bitmap;
1846 int blocks_per_page;
1847 int block, pnum, poff;
1848 int num_grp_locked = 0;
1849 struct ext4_group_info *this_grp;
1850 struct ext4_sb_info *sbi = EXT4_SB(sb);
1851 struct inode *inode = sbi->s_buddy_cache;
1852 struct page *page = NULL, *bitmap_page = NULL;
1853
1854 mb_debug("init group %lu\n", group);
1855 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1856 this_grp = ext4_get_group_info(sb, group);
1857 /*
1858 * This ensures we don't add group
1859 * to this buddy cache via resize
1860 */
1861 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1862 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1863 /*
1864 * somebody initialized the group
1865 * return without doing anything
1866 */
1867 ret = 0;
1868 goto err;
1869 }
1870 /*
1871 * the buddy cache inode stores the block bitmap
1872 * and buddy information in consecutive blocks.
1873 * So for each group we need two blocks.
1874 */
1875 block = group * 2;
1876 pnum = block / blocks_per_page;
1877 poff = block % blocks_per_page;
1878 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1879 if (page) {
1880 BUG_ON(page->mapping != inode->i_mapping);
1881 ret = ext4_mb_init_cache(page, NULL);
1882 if (ret) {
1883 unlock_page(page);
1884 goto err;
1885 }
1886 unlock_page(page);
1887 }
1888 if (page == NULL || !PageUptodate(page)) {
1889 ret = -EIO;
1890 goto err;
1891 }
1892 mark_page_accessed(page);
1893 bitmap_page = page;
1894 bitmap = page_address(page) + (poff * sb->s_blocksize);
1895
1896 /* init buddy cache */
1897 block++;
1898 pnum = block / blocks_per_page;
1899 poff = block % blocks_per_page;
1900 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1901 if (page == bitmap_page) {
1902 /*
1903 * If both the bitmap and buddy are in
1904 * the same page we don't need to force
1905 * init the buddy
1906 */
1907 unlock_page(page);
1908 } else if (page) {
1909 BUG_ON(page->mapping != inode->i_mapping);
1910 ret = ext4_mb_init_cache(page, bitmap);
1911 if (ret) {
1912 unlock_page(page);
1913 goto err;
1914 }
1915 unlock_page(page);
1916 }
1917 if (page == NULL || !PageUptodate(page)) {
1918 ret = -EIO;
1919 goto err;
1920 }
1921 mark_page_accessed(page);
1922err:
1923 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1924 if (bitmap_page)
1925 page_cache_release(bitmap_page);
1926 if (page)
1927 page_cache_release(page);
1928 return ret;
1929}
1930
1931static noinline_for_stack int 1954static noinline_for_stack int
1932ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1955ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1933{ 1956{
@@ -1938,11 +1961,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1938 struct ext4_sb_info *sbi; 1961 struct ext4_sb_info *sbi;
1939 struct super_block *sb; 1962 struct super_block *sb;
1940 struct ext4_buddy e4b; 1963 struct ext4_buddy e4b;
1941 loff_t size, isize;
1942 1964
1943 sb = ac->ac_sb; 1965 sb = ac->ac_sb;
1944 sbi = EXT4_SB(sb); 1966 sbi = EXT4_SB(sb);
1945 ngroups = ext4_get_groups_count(sb); 1967 ngroups = ext4_get_groups_count(sb);
1968 /* non-extent files are limited to low blocks/groups */
1969 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
1970 ngroups = sbi->s_blockfile_groups;
1971
1946 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1972 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1947 1973
1948 /* first, try the goal */ 1974 /* first, try the goal */
@@ -1974,20 +2000,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1974 } 2000 }
1975 2001
1976 bsbits = ac->ac_sb->s_blocksize_bits; 2002 bsbits = ac->ac_sb->s_blocksize_bits;
1977 /* if stream allocation is enabled, use global goal */
1978 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
1979 isize = i_size_read(ac->ac_inode) >> bsbits;
1980 if (size < isize)
1981 size = isize;
1982 2003
1983 if (size < sbi->s_mb_stream_request && 2004 /* if stream allocation is enabled, use global goal */
1984 (ac->ac_flags & EXT4_MB_HINT_DATA)) { 2005 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1985 /* TBD: may be hot point */ 2006 /* TBD: may be hot point */
1986 spin_lock(&sbi->s_md_lock); 2007 spin_lock(&sbi->s_md_lock);
1987 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 2008 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
1988 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 2009 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
1989 spin_unlock(&sbi->s_md_lock); 2010 spin_unlock(&sbi->s_md_lock);
1990 } 2011 }
2012
1991 /* Let's just scan groups to find more or less suitable blocks */ 2013 /* Let's just scan groups to find more or less suitable blocks */
1992 cr = ac->ac_2order ? 0 : 1; 2014 cr = ac->ac_2order ? 0 : 1;
1993 /* 2015 /*
@@ -2015,27 +2037,6 @@ repeat:
2015 if (grp->bb_free == 0) 2037 if (grp->bb_free == 0)
2016 continue; 2038 continue;
2017 2039
2018 /*
2019 * if the group is already init we check whether it is
2020 * a good group and if not we don't load the buddy
2021 */
2022 if (EXT4_MB_GRP_NEED_INIT(grp)) {
2023 /*
2024 * we need full data about the group
2025 * to make a good selection
2026 */
2027 err = ext4_mb_init_group(sb, group);
2028 if (err)
2029 goto out;
2030 }
2031
2032 /*
2033 * If the particular group doesn't satisfy our
2034 * criteria we continue with the next group
2035 */
2036 if (!ext4_mb_good_group(ac, group, cr))
2037 continue;
2038
2039 err = ext4_mb_load_buddy(sb, group, &e4b); 2040 err = ext4_mb_load_buddy(sb, group, &e4b);
2040 if (err) 2041 if (err)
2041 goto out; 2042 goto out;
@@ -2156,7 +2157,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2156 2157
2157 if (v == SEQ_START_TOKEN) { 2158 if (v == SEQ_START_TOKEN) {
2158 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " 2159 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2159 "%-5s %-2s %-5s %-5s %-5s %-6s\n", 2160 "%-5s %-2s %-6s %-5s %-5s %-6s\n",
2160 "pid", "inode", "original", "goal", "result", "found", 2161 "pid", "inode", "original", "goal", "result", "found",
2161 "grps", "cr", "flags", "merge", "tail", "broken"); 2162 "grps", "cr", "flags", "merge", "tail", "broken");
2162 return 0; 2163 return 0;
@@ -2164,7 +2165,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2164 2165
2165 if (hs->op == EXT4_MB_HISTORY_ALLOC) { 2166 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2166 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " 2167 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2167 "%-5u %-5s %-5u %-6u\n"; 2168 "0x%04x %-5s %-5u %-6u\n";
2168 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, 2169 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2169 hs->result.fe_start, hs->result.fe_len, 2170 hs->result.fe_start, hs->result.fe_len,
2170 hs->result.fe_logical); 2171 hs->result.fe_logical);
@@ -2205,7 +2206,7 @@ static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
2205{ 2206{
2206} 2207}
2207 2208
2208static struct seq_operations ext4_mb_seq_history_ops = { 2209static const struct seq_operations ext4_mb_seq_history_ops = {
2209 .start = ext4_mb_seq_history_start, 2210 .start = ext4_mb_seq_history_start,
2210 .next = ext4_mb_seq_history_next, 2211 .next = ext4_mb_seq_history_next,
2211 .stop = ext4_mb_seq_history_stop, 2212 .stop = ext4_mb_seq_history_stop,
@@ -2287,7 +2288,7 @@ static ssize_t ext4_mb_seq_history_write(struct file *file,
2287 return count; 2288 return count;
2288} 2289}
2289 2290
2290static struct file_operations ext4_mb_seq_history_fops = { 2291static const struct file_operations ext4_mb_seq_history_fops = {
2291 .owner = THIS_MODULE, 2292 .owner = THIS_MODULE,
2292 .open = ext4_mb_seq_history_open, 2293 .open = ext4_mb_seq_history_open,
2293 .read = seq_read, 2294 .read = seq_read,
@@ -2328,7 +2329,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2328 struct ext4_buddy e4b; 2329 struct ext4_buddy e4b;
2329 struct sg { 2330 struct sg {
2330 struct ext4_group_info info; 2331 struct ext4_group_info info;
2331 unsigned short counters[16]; 2332 ext4_grpblk_t counters[16];
2332 } sg; 2333 } sg;
2333 2334
2334 group--; 2335 group--;
@@ -2366,7 +2367,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2366{ 2367{
2367} 2368}
2368 2369
2369static struct seq_operations ext4_mb_seq_groups_ops = { 2370static const struct seq_operations ext4_mb_seq_groups_ops = {
2370 .start = ext4_mb_seq_groups_start, 2371 .start = ext4_mb_seq_groups_start,
2371 .next = ext4_mb_seq_groups_next, 2372 .next = ext4_mb_seq_groups_next,
2372 .stop = ext4_mb_seq_groups_stop, 2373 .stop = ext4_mb_seq_groups_stop,
@@ -2387,7 +2388,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2387 2388
2388} 2389}
2389 2390
2390static struct file_operations ext4_mb_seq_groups_fops = { 2391static const struct file_operations ext4_mb_seq_groups_fops = {
2391 .owner = THIS_MODULE, 2392 .owner = THIS_MODULE,
2392 .open = ext4_mb_seq_groups_open, 2393 .open = ext4_mb_seq_groups_open,
2393 .read = seq_read, 2394 .read = seq_read,
@@ -2532,7 +2533,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2532 2533
2533 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2534 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2534 init_rwsem(&meta_group_info[i]->alloc_sem); 2535 init_rwsem(&meta_group_info[i]->alloc_sem);
2535 meta_group_info[i]->bb_free_root.rb_node = NULL;; 2536 meta_group_info[i]->bb_free_root.rb_node = NULL;
2536 2537
2537#ifdef DOUBLE_CHECK 2538#ifdef DOUBLE_CHECK
2538 { 2539 {
@@ -2558,26 +2559,15 @@ exit_meta_group_info:
2558 return -ENOMEM; 2559 return -ENOMEM;
2559} /* ext4_mb_add_groupinfo */ 2560} /* ext4_mb_add_groupinfo */
2560 2561
2561/*
2562 * Update an existing group.
2563 * This function is used for online resize
2564 */
2565void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
2566{
2567 grp->bb_free += add;
2568}
2569
2570static int ext4_mb_init_backend(struct super_block *sb) 2562static int ext4_mb_init_backend(struct super_block *sb)
2571{ 2563{
2572 ext4_group_t ngroups = ext4_get_groups_count(sb); 2564 ext4_group_t ngroups = ext4_get_groups_count(sb);
2573 ext4_group_t i; 2565 ext4_group_t i;
2574 int metalen;
2575 struct ext4_sb_info *sbi = EXT4_SB(sb); 2566 struct ext4_sb_info *sbi = EXT4_SB(sb);
2576 struct ext4_super_block *es = sbi->s_es; 2567 struct ext4_super_block *es = sbi->s_es;
2577 int num_meta_group_infos; 2568 int num_meta_group_infos;
2578 int num_meta_group_infos_max; 2569 int num_meta_group_infos_max;
2579 int array_size; 2570 int array_size;
2580 struct ext4_group_info **meta_group_info;
2581 struct ext4_group_desc *desc; 2571 struct ext4_group_desc *desc;
2582 2572
2583 /* This is the number of blocks used by GDT */ 2573 /* This is the number of blocks used by GDT */
@@ -2622,22 +2612,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
2622 goto err_freesgi; 2612 goto err_freesgi;
2623 } 2613 }
2624 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2614 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2625
2626 metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
2627 for (i = 0; i < num_meta_group_infos; i++) {
2628 if ((i + 1) == num_meta_group_infos)
2629 metalen = sizeof(*meta_group_info) *
2630 (ngroups -
2631 (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
2632 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2633 if (meta_group_info == NULL) {
2634 printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2635 "buddy group\n");
2636 goto err_freemeta;
2637 }
2638 sbi->s_group_info[i] = meta_group_info;
2639 }
2640
2641 for (i = 0; i < ngroups; i++) { 2615 for (i = 0; i < ngroups; i++) {
2642 desc = ext4_get_group_desc(sb, i, NULL); 2616 desc = ext4_get_group_desc(sb, i, NULL);
2643 if (desc == NULL) { 2617 if (desc == NULL) {
@@ -2655,7 +2629,6 @@ err_freebuddy:
2655 while (i-- > 0) 2629 while (i-- > 0)
2656 kfree(ext4_get_group_info(sb, i)); 2630 kfree(ext4_get_group_info(sb, i));
2657 i = num_meta_group_infos; 2631 i = num_meta_group_infos;
2658err_freemeta:
2659 while (i-- > 0) 2632 while (i-- > 0)
2660 kfree(sbi->s_group_info[i]); 2633 kfree(sbi->s_group_info[i]);
2661 iput(sbi->s_buddy_cache); 2634 iput(sbi->s_buddy_cache);
@@ -2672,14 +2645,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2672 unsigned max; 2645 unsigned max;
2673 int ret; 2646 int ret;
2674 2647
2675 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); 2648 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2676 2649
2677 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2650 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2678 if (sbi->s_mb_offsets == NULL) { 2651 if (sbi->s_mb_offsets == NULL) {
2679 return -ENOMEM; 2652 return -ENOMEM;
2680 } 2653 }
2681 2654
2682 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); 2655 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2683 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2656 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2684 if (sbi->s_mb_maxs == NULL) { 2657 if (sbi->s_mb_maxs == NULL) {
2685 kfree(sbi->s_mb_offsets); 2658 kfree(sbi->s_mb_offsets);
@@ -2758,7 +2731,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2758 kmem_cache_free(ext4_pspace_cachep, pa); 2731 kmem_cache_free(ext4_pspace_cachep, pa);
2759 } 2732 }
2760 if (count) 2733 if (count)
2761 mb_debug("mballoc: %u PAs left\n", count); 2734 mb_debug(1, "mballoc: %u PAs left\n", count);
2762 2735
2763} 2736}
2764 2737
@@ -2839,7 +2812,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2839 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2812 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2840 entry = list_entry(l, struct ext4_free_data, list); 2813 entry = list_entry(l, struct ext4_free_data, list);
2841 2814
2842 mb_debug("gonna free %u blocks in group %u (0x%p):", 2815 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2843 entry->count, entry->group, entry); 2816 entry->count, entry->group, entry);
2844 2817
2845 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2818 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2874,9 +2847,43 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2874 ext4_mb_release_desc(&e4b); 2847 ext4_mb_release_desc(&e4b);
2875 } 2848 }
2876 2849
2877 mb_debug("freed %u blocks in %u structures\n", count, count2); 2850 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2851}
2852
2853#ifdef CONFIG_EXT4_DEBUG
2854u8 mb_enable_debug __read_mostly;
2855
2856static struct dentry *debugfs_dir;
2857static struct dentry *debugfs_debug;
2858
2859static void __init ext4_create_debugfs_entry(void)
2860{
2861 debugfs_dir = debugfs_create_dir("ext4", NULL);
2862 if (debugfs_dir)
2863 debugfs_debug = debugfs_create_u8("mballoc-debug",
2864 S_IRUGO | S_IWUSR,
2865 debugfs_dir,
2866 &mb_enable_debug);
2867}
2868
2869static void ext4_remove_debugfs_entry(void)
2870{
2871 debugfs_remove(debugfs_debug);
2872 debugfs_remove(debugfs_dir);
2878} 2873}
2879 2874
2875#else
2876
2877static void __init ext4_create_debugfs_entry(void)
2878{
2879}
2880
2881static void ext4_remove_debugfs_entry(void)
2882{
2883}
2884
2885#endif
2886
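mb_debug() calls throughout this file now carry a level argument, gated at runtime by the new mb_enable_debug knob (write a level into /sys/kernel/debug/ext4/mballoc-debug, with debugfs mounted in the usual place). The macro itself lives in mballoc.h; assuming the usual pattern, it is roughly:

    #ifdef CONFIG_EXT4_DEBUG
    extern u8 mb_enable_debug;
    /* Assumed shape of the macro; the real one may also print file/line */
    #define mb_debug(n, fmt, a...)                                  \
            do {                                                    \
                    if ((n) <= mb_enable_debug)                     \
                            printk(KERN_DEBUG "ext4: " fmt, ##a);   \
            } while (0)
    #else
    #define mb_debug(n, fmt, a...)
    #endif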
2880int __init init_ext4_mballoc(void) 2887int __init init_ext4_mballoc(void)
2881{ 2888{
2882 ext4_pspace_cachep = 2889 ext4_pspace_cachep =
@@ -2904,6 +2911,7 @@ int __init init_ext4_mballoc(void)
2904 kmem_cache_destroy(ext4_ac_cachep); 2911 kmem_cache_destroy(ext4_ac_cachep);
2905 return -ENOMEM; 2912 return -ENOMEM;
2906 } 2913 }
2914 ext4_create_debugfs_entry();
2907 return 0; 2915 return 0;
2908} 2916}
2909 2917
@@ -2917,6 +2925,7 @@ void exit_ext4_mballoc(void)
2917 kmem_cache_destroy(ext4_pspace_cachep); 2925 kmem_cache_destroy(ext4_pspace_cachep);
2918 kmem_cache_destroy(ext4_ac_cachep); 2926 kmem_cache_destroy(ext4_ac_cachep);
2919 kmem_cache_destroy(ext4_free_ext_cachep); 2927 kmem_cache_destroy(ext4_free_ext_cachep);
2928 ext4_remove_debugfs_entry();
2920} 2929}
2921 2930
2922 2931
@@ -3061,7 +3070,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3061 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; 3070 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
3062 else 3071 else
3063 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; 3072 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3064 mb_debug("#%u: goal %u blocks for locality group\n", 3073 mb_debug(1, "#%u: goal %u blocks for locality group\n",
3065 current->pid, ac->ac_g_ex.fe_len); 3074 current->pid, ac->ac_g_ex.fe_len);
3066} 3075}
3067 3076
@@ -3180,23 +3189,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3180 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 3189 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3181 ac->ac_o_ex.fe_logical < pa->pa_lstart)); 3190 ac->ac_o_ex.fe_logical < pa->pa_lstart));
3182 3191
3183 /* skip PA normalized request doesn't overlap with */ 3192 /* skip PAs this normalized request doesn't overlap with */
3184 if (pa->pa_lstart >= end) { 3193 if (pa->pa_lstart >= end || pa_end <= start) {
3185 spin_unlock(&pa->pa_lock);
3186 continue;
3187 }
3188 if (pa_end <= start) {
3189 spin_unlock(&pa->pa_lock); 3194 spin_unlock(&pa->pa_lock);
3190 continue; 3195 continue;
3191 } 3196 }
3192 BUG_ON(pa->pa_lstart <= start && pa_end >= end); 3197 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3193 3198
3199 /* adjust start or end to be adjacent to this pa */
3194 if (pa_end <= ac->ac_o_ex.fe_logical) { 3200 if (pa_end <= ac->ac_o_ex.fe_logical) {
3195 BUG_ON(pa_end < start); 3201 BUG_ON(pa_end < start);
3196 start = pa_end; 3202 start = pa_end;
3197 } 3203 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3198
3199 if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3200 BUG_ON(pa->pa_lstart > end); 3204 BUG_ON(pa->pa_lstart > end);
3201 end = pa->pa_lstart; 3205 end = pa->pa_lstart;
3202 } 3206 }
@@ -3251,7 +3255,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3251 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3255 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3252 } 3256 }
3253 3257
3254 mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, 3258 mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3255 (unsigned) orig_size, (unsigned) start); 3259 (unsigned) orig_size, (unsigned) start);
3256} 3260}
3257 3261
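The two early-continue tests above collapse into one: a preallocation is irrelevant exactly when its interval lies entirely at or beyond `end`, or entirely at or before `start`. The disjointness test for half-open intervals, standalone:

    #include <stdio.h>

    /* [s1,e1) and [s2,e2) are disjoint iff s1 >= e2 || s2 >= e1 */
    static int disjoint(unsigned s1, unsigned e1, unsigned s2, unsigned e2)
    {
            return s1 >= e2 || s2 >= e1;
    }

    int main(void)
    {
            printf("%d\n", disjoint(0, 10, 10, 20));  /* 1: touching, no overlap */
            printf("%d\n", disjoint(0, 10, 5, 20));   /* 0: overlap */
            return 0;
    }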
@@ -3300,7 +3304,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3300 BUG_ON(pa->pa_free < len); 3304 BUG_ON(pa->pa_free < len);
3301 pa->pa_free -= len; 3305 pa->pa_free -= len;
3302 3306
3303 mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); 3307 mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3304} 3308}
3305 3309
3306/* 3310/*
@@ -3324,7 +3328,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3324 * in on-disk bitmap -- see ext4_mb_release_context() 3328 * in on-disk bitmap -- see ext4_mb_release_context()
3325 * Other CPUs are prevented from allocating from this pa by lg_mutex 3329 * Other CPUs are prevented from allocating from this pa by lg_mutex
3326 */ 3330 */
3327 mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); 3331 mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3328} 3332}
3329 3333
3330/* 3334/*
@@ -3382,6 +3386,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3382 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) 3386 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
3383 continue; 3387 continue;
3384 3388
3389 /* non-extent files can't have physical blocks past 2^32 */
3390 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
3391 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3392 continue;
3393
3385 /* found preallocated blocks, use them */ 3394 /* found preallocated blocks, use them */
3386 spin_lock(&pa->pa_lock); 3395 spin_lock(&pa->pa_lock);
3387 if (pa->pa_deleted == 0 && pa->pa_free) { 3396 if (pa->pa_deleted == 0 && pa->pa_free) {
@@ -3503,7 +3512,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3503 preallocated += len; 3512 preallocated += len;
3504 count++; 3513 count++;
3505 } 3514 }
3506 mb_debug("preallocated %u for group %u\n", preallocated, group); 3515 mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
3507} 3516}
3508 3517
3509static void ext4_mb_pa_callback(struct rcu_head *head) 3518static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3638,7 +3647,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3638 pa->pa_deleted = 0; 3647 pa->pa_deleted = 0;
3639 pa->pa_type = MB_INODE_PA; 3648 pa->pa_type = MB_INODE_PA;
3640 3649
3641 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3650 mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3642 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3651 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3643 trace_ext4_mb_new_inode_pa(ac, pa); 3652 trace_ext4_mb_new_inode_pa(ac, pa);
3644 3653
@@ -3698,7 +3707,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3698 pa->pa_deleted = 0; 3707 pa->pa_deleted = 0;
3699 pa->pa_type = MB_GROUP_PA; 3708 pa->pa_type = MB_GROUP_PA;
3700 3709
3701 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3710 mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3702 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3711 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3703 trace_ext4_mb_new_group_pa(ac, pa); 3712 trace_ext4_mb_new_group_pa(ac, pa);
3704 3713
@@ -3777,7 +3786,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3777 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3786 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3778 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3787 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
3779 le32_to_cpu(sbi->s_es->s_first_data_block); 3788 le32_to_cpu(sbi->s_es->s_first_data_block);
3780 mb_debug(" free preallocated %u/%u in group %u\n", 3789 mb_debug(1, " free preallocated %u/%u in group %u\n",
3781 (unsigned) start, (unsigned) next - bit, 3790 (unsigned) start, (unsigned) next - bit,
3782 (unsigned) group); 3791 (unsigned) group);
3783 free += next - bit; 3792 free += next - bit;
@@ -3868,7 +3877,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3868 int busy = 0; 3877 int busy = 0;
3869 int free = 0; 3878 int free = 0;
3870 3879
3871 mb_debug("discard preallocation for group %u\n", group); 3880 mb_debug(1, "discard preallocation for group %u\n", group);
3872 3881
3873 if (list_empty(&grp->bb_prealloc_list)) 3882 if (list_empty(&grp->bb_prealloc_list))
3874 return 0; 3883 return 0;
@@ -3992,7 +4001,7 @@ void ext4_discard_preallocations(struct inode *inode)
3992 return; 4001 return;
3993 } 4002 }
3994 4003
3995 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 4004 mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
3996 trace_ext4_discard_preallocations(inode); 4005 trace_ext4_discard_preallocations(inode);
3997 4006
3998 INIT_LIST_HEAD(&list); 4007 INIT_LIST_HEAD(&list);
@@ -4097,7 +4106,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
4097{ 4106{
4098 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); 4107 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
4099} 4108}
4100#ifdef MB_DEBUG 4109#ifdef CONFIG_EXT4_DEBUG
4101static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 4110static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4102{ 4111{
4103 struct super_block *sb = ac->ac_sb; 4112 struct super_block *sb = ac->ac_sb;
@@ -4139,14 +4148,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4139 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 4148 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4140 NULL, &start); 4149 NULL, &start);
4141 spin_unlock(&pa->pa_lock); 4150 spin_unlock(&pa->pa_lock);
4142 printk(KERN_ERR "PA:%lu:%d:%u \n", i, 4151 printk(KERN_ERR "PA:%u:%d:%u \n", i,
4143 start, pa->pa_len); 4152 start, pa->pa_len);
4144 } 4153 }
4145 ext4_unlock_group(sb, i); 4154 ext4_unlock_group(sb, i);
4146 4155
4147 if (grp->bb_free == 0) 4156 if (grp->bb_free == 0)
4148 continue; 4157 continue;
4149 printk(KERN_ERR "%lu: %d/%d \n", 4158 printk(KERN_ERR "%u: %d/%d \n",
4150 i, grp->bb_free, grp->bb_fragments); 4159 i, grp->bb_free, grp->bb_fragments);
4151 } 4160 }
4152 printk(KERN_ERR "\n"); 4161 printk(KERN_ERR "\n");
@@ -4174,16 +4183,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4174 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 4183 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4175 return; 4184 return;
4176 4185
4186 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4187 return;
4188
4177 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 4189 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4178 isize = i_size_read(ac->ac_inode) >> bsbits; 4190 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4191 >> bsbits;
4179 size = max(size, isize); 4192 size = max(size, isize);
4180 4193
4181 /* don't use group allocation for large files */ 4194 if ((size == isize) &&
4182 if (size >= sbi->s_mb_stream_request) 4195 !ext4_fs_is_busy(sbi) &&
4196 (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4197 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4183 return; 4198 return;
4199 }
4184 4200
4185 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 4201 /* don't use group allocation for large files */
4202 if (size >= sbi->s_mb_stream_request) {
4203 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4186 return; 4204 return;
4205 }
4187 4206
4188 BUG_ON(ac->ac_lg != NULL); 4207 BUG_ON(ac->ac_lg != NULL);
4189 /* 4208 /*
@@ -4246,7 +4265,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4246 * locality group. this is a policy, actually */ 4265 * locality group. this is a policy, actually */
4247 ext4_mb_group_or_file(ac); 4266 ext4_mb_group_or_file(ac);
4248 4267
4249 mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " 4268 mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4250 "left: %u/%u, right %u/%u to %swritable\n", 4269 "left: %u/%u, right %u/%u to %swritable\n",
4251 (unsigned) ar->len, (unsigned) ar->logical, 4270 (unsigned) ar->len, (unsigned) ar->logical,
4252 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, 4271 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4268,7 +4287,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4268 struct ext4_prealloc_space *pa, *tmp; 4287 struct ext4_prealloc_space *pa, *tmp;
4269 struct ext4_allocation_context *ac; 4288 struct ext4_allocation_context *ac;
4270 4289
4271 mb_debug("discard locality group preallocation\n"); 4290 mb_debug(1, "discard locality group preallocation\n");
4272 4291
4273 INIT_LIST_HEAD(&discard_list); 4292 INIT_LIST_HEAD(&discard_list);
4274 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4293 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
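
A note on the ext4_mb_group_or_file() hunk above: the old code computed the file size in blocks rounded down, so a file whose last block was partially filled looked one block smaller than it is. The new code rounds up first, the usual (size + blocksize - 1) >> bits idiom. A minimal userspace sketch of the same arithmetic (function and variable names here are illustrative, not from the patch):

	#include <stdio.h>

	/* Round a byte count up to whole blocks. */
	static unsigned long long bytes_to_blocks(unsigned long long size,
						  unsigned int blkbits)
	{
		unsigned long long blocksize = 1ULL << blkbits;

		return (size + blocksize - 1) >> blkbits;
	}

	int main(void)
	{
		/* 4097 bytes in 4 KiB blocks occupies 2 blocks, not 1. */
		printf("%llu\n", bytes_to_blocks(4097, 12));	/* prints 2 */
		printf("%llu\n", bytes_to_blocks(4096, 12));	/* prints 1 */
		return 0;
	}
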
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c96bb19f58f9..188d3d709b24 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,11 +37,19 @@
 
 /*
  */
-#define MB_DEBUG__
-#ifdef MB_DEBUG
-#define mb_debug(fmt, a...)	printk(fmt, ##a)
+#ifdef CONFIG_EXT4_DEBUG
+extern u8 mb_enable_debug;
+
+#define mb_debug(n, fmt, a...)						\
+	do {								\
+		if ((n) <= mb_enable_debug) {				\
+			printk(KERN_DEBUG "(%s, %d): %s: ",		\
+			       __FILE__, __LINE__, __func__);		\
+			printk(fmt, ## a);				\
+		}							\
+	} while (0)
 #else
-#define mb_debug(fmt, a...)
+#define mb_debug(n, fmt, a...)
 #endif
 
 /*
@@ -128,8 +136,8 @@ struct ext4_prealloc_space {
 	unsigned		pa_deleted;
 	ext4_fsblk_t		pa_pstart;	/* phys. block */
 	ext4_lblk_t		pa_lstart;	/* log. block */
-	unsigned short		pa_len;		/* len of preallocated chunk */
-	unsigned short		pa_free;	/* how many blocks are free */
+	ext4_grpblk_t		pa_len;		/* len of preallocated chunk */
+	ext4_grpblk_t		pa_free;	/* how many blocks are free */
 	unsigned short		pa_type;	/* pa type. inode or group */
 	spinlock_t		*pa_obj_lock;
 	struct inode		*pa_inode;	/* hack, for history only */
@@ -144,7 +152,7 @@ struct ext4_free_extent {
 	ext4_lblk_t fe_logical;
 	ext4_grpblk_t fe_start;
 	ext4_group_t fe_group;
-	int fe_len;
+	ext4_grpblk_t fe_len;
 };
 
 /*
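
The reworked mb_debug() above takes a verbosity level as its first argument and prints only when that level is at or below the runtime knob mb_enable_debug, prefixing each message with file, line, and function. A compilable userspace approximation of the same leveled-macro pattern (dbg() and dbg_level are illustrative names):

	#include <stdio.h>

	static unsigned char dbg_level = 1;	/* runtime knob */

	#define dbg(n, fmt, ...)					\
		do {							\
			if ((n) <= dbg_level)				\
				fprintf(stderr, "(%s, %d): %s: " fmt,	\
					__FILE__, __LINE__, __func__,	\
					##__VA_ARGS__);			\
		} while (0)

	int main(void)
	{
		dbg(1, "printed at level %d\n", 1);	/* emitted */
		dbg(2, "suppressed\n");			/* level too high */
		return 0;
	}
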
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 313a50b39741..bf519f239ae6 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
 
 	down_write(&EXT4_I(inode)->i_data_sem);
 	/*
-	 * if EXT4_EXT_MIGRATE is cleared a block allocation
+	 * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
 	 * happened after we started the migrate. We need to
 	 * fail the migrate
 	 */
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) {
+	if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
 		retval = -EAGAIN;
 		up_write(&EXT4_I(inode)->i_data_sem);
 		goto err_out;
 	} else
-		EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
-						~EXT4_EXT_MIGRATE;
+		EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
 	/*
 	 * We have the extent map build with the tmp inode.
 	 * Now copy the i_data across
@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode)
 	 * when we add extents we extent the journal
 	 */
 	/*
-	 * Even though we take i_mutex we can still cause block allocation
-	 * via mmap write to holes. If we have allocated new blocks we fail
-	 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
-	 * The flag is updated with i_data_sem held to prevent racing with
-	 * block allocation.
+	 * Even though we take i_mutex we can still cause block
+	 * allocation via mmap write to holes. If we have allocated
+	 * new blocks we fail migrate. New block allocation will
+	 * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
+	 * with i_data_sem held to prevent racing with block
+	 * allocation.
 	 */
 	down_read((&EXT4_I(inode)->i_data_sem));
-	EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE;
+	EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 	handle = ext4_journal_start(inode, 1);
@@ -618,7 +618,7 @@ err_out:
 	tmp_inode->i_nlink = 0;
 
 	ext4_journal_stop(handle);
-
+	unlock_new_inode(tmp_inode);
 	iput(tmp_inode);
 
 	return retval;
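
The migrate.c change above moves the transient migration marker from i_flags, which mirrors the on-disk inode flags, to i_state, which is purely in-core, so an in-progress marker can never be written to disk. The flags-versus-state split in a standalone sketch (struct and constant names are illustrative):

	#include <stdio.h>

	#define ON_DISK_EXTENTS	0x1	/* persisted with the inode */
	#define IN_CORE_MIGRATE	0x1	/* exists only in memory */

	struct demo_inode {
		unsigned int flags;	/* written to disk */
		unsigned int state;	/* never written to disk */
	};

	int main(void)
	{
		struct demo_inode ino = { .flags = ON_DISK_EXTENTS };

		ino.state |= IN_CORE_MIGRATE;	/* migration started */
		if (!(ino.state & IN_CORE_MIGRATE))
			printf("allocation raced, abort migrate\n");
		ino.state &= ~IN_CORE_MIGRATE;	/* migration finished */
		printf("flags=%#x state=%#x\n", ino.flags, ino.state);
		return 0;
	}
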
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index bbf2dd9404dc..c07a2915e40b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
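
The move_extent.c hunks below replace the old get_ext_path() statement macro with a static inline function that returns a status and hands the path back through a double pointer, giving callers type checking and one error convention. The shape of that conversion, reduced to a userspace sketch with the kernel's ERR_PTR/IS_ERR idiom inlined (find_path() and get_path() are illustrative names, not from the patch):

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct path { int depth; };

	#define MAX_ERRNO	4095
	#define ERR_PTR(e)	((void *)(long)(e))
	#define IS_ERR(p)	((unsigned long)(p) >= (unsigned long)-MAX_ERRNO)
	#define PTR_ERR(p)	((long)(p))

	/* Stand-in for ext4_ext_find_extent(): a path or an error pointer. */
	static struct path *find_path(int key)
	{
		struct path *p;

		if (key < 0)
			return ERR_PTR(-EINVAL);
		p = malloc(sizeof(*p));
		if (!p)
			return ERR_PTR(-ENOMEM);
		p->depth = key;
		return p;
	}

	/* New convention: status as return value, path as out argument. */
	static int get_path(int key, struct path **path)
	{
		struct path *p = find_path(key);

		if (IS_ERR(p)) {
			*path = NULL;
			return (int)PTR_ERR(p);
		}
		*path = p;
		return 0;
	}

	int main(void)
	{
		struct path *p;

		printf("ok:  %d\n", get_path(3, &p));	/* 0 */
		free(p);
		printf("err: %d\n", get_path(-1, &p));	/* -EINVAL, p == NULL */
		return 0;
	}
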
@@ -19,14 +19,31 @@
 #include "ext4_extents.h"
 #include "ext4.h"
 
-#define get_ext_path(path, inode, block, ret)		\
-	do {						\
-		path = ext4_ext_find_extent(inode, block, path); \
-		if (IS_ERR(path)) {			\
-			ret = PTR_ERR(path);		\
-			path = NULL;			\
-		}					\
-	} while (0)
+/**
+ * get_ext_path - Find an extent path for designated logical block number.
+ *
+ * @inode:	an inode which is searched
+ * @lblock:	logical block number to find an extent path
+ * @path:	pointer to an extent path pointer (for output)
+ *
+ * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
+ * on failure.
+ */
+static inline int
+get_ext_path(struct inode *inode, ext4_lblk_t lblock,
+		struct ext4_ext_path **path)
+{
+	int ret = 0;
+
+	*path = ext4_ext_find_extent(inode, lblock, *path);
+	if (IS_ERR(*path)) {
+		ret = PTR_ERR(*path);
+		*path = NULL;
+	} else if ((*path)[ext_depth(inode)].p_ext == NULL)
+		ret = -ENODATA;
+
+	return ret;
+}
 
 /**
  * copy_extent_status - Copy the extent's initialization status
@@ -113,6 +130,31 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
 }
 
 /**
+ * mext_check_null_inode - NULL check for two inodes
+ *
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ */
+static int
+mext_check_null_inode(struct inode *inode1, struct inode *inode2,
+		const char *function)
+{
+	int ret = 0;
+
+	if (inode1 == NULL) {
+		ext4_error(inode2->i_sb, function,
+			"Both inodes should not be NULL: "
+			"inode1 NULL inode2 %lu", inode2->i_ino);
+		ret = -EIO;
+	} else if (inode2 == NULL) {
+		ext4_error(inode1->i_sb, function,
+			"Both inodes should not be NULL: "
+			"inode1 %lu inode2 NULL", inode1->i_ino);
+		ret = -EIO;
+	}
+	return ret;
+}
+
+/**
  * mext_double_down_read - Acquire two inodes' read semaphore
  *
  * @orig_inode:		original inode structure
@@ -124,8 +166,6 @@ mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
 {
 	struct inode *first = orig_inode, *second = donor_inode;
 
-	BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
 	/*
 	 * Use the inode number to provide the stable locking order instead
 	 * of its address, because the C language doesn't guarantee you can
@@ -152,8 +192,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
 {
 	struct inode *first = orig_inode, *second = donor_inode;
 
-	BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
 	/*
 	 * Use the inode number to provide the stable locking order instead
 	 * of its address, because the C language doesn't guarantee you can
@@ -178,8 +216,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
 static void
 mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
 {
-	BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
 	up_read(&EXT4_I(orig_inode)->i_data_sem);
 	up_read(&EXT4_I(donor_inode)->i_data_sem);
 }
@@ -194,8 +230,6 @@ mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
 static void
 mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
 {
-	BUG_ON(orig_inode == NULL || donor_inode == NULL);
-
 	up_write(&EXT4_I(orig_inode)->i_data_sem);
 	up_write(&EXT4_I(donor_inode)->i_data_sem);
 }
@@ -283,8 +317,8 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 	}
 
 	if (new_flag) {
-		get_ext_path(orig_path, orig_inode, eblock, err);
-		if (orig_path == NULL)
+		err = get_ext_path(orig_inode, eblock, &orig_path);
+		if (err)
 			goto out;
 
 		if (ext4_ext_insert_extent(handle, orig_inode,
@@ -293,9 +327,9 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 	}
 
 	if (end_flag) {
-		get_ext_path(orig_path, orig_inode,
-			le32_to_cpu(end_ext->ee_block) - 1, err);
-		if (orig_path == NULL)
+		err = get_ext_path(orig_inode,
+			le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
+		if (err)
 			goto out;
 
 		if (ext4_ext_insert_extent(handle, orig_inode,
@@ -519,7 +553,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	 * oext      |-----------|
 	 * new_ext       |-------|
 	 */
-	BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
+	if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
+		ext4_error(orig_inode->i_sb, __func__,
+			"new_ext_end(%u) should be less than or equal to "
+			"oext->ee_block(%u) + oext_alen(%d) - 1",
+			new_ext_end, le32_to_cpu(oext->ee_block),
+			oext_alen);
+		ret = -EIO;
+		goto out;
+	}
 
 	/*
 	 * Case: new_ext is smaller than original extent
@@ -543,6 +585,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 
 	ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
 				o_end, &start_ext, &new_ext, &end_ext);
+out:
 	return ret;
 }
 
@@ -554,8 +597,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
  * @orig_off:		block offset of original inode
  * @donor_off:		block offset of donor inode
  * @max_count:		the maximun length of extents
+ *
+ * Return 0 on success, or a negative error value on failure.
  */
-static void
+static int
 mext_calc_swap_extents(struct ext4_extent *tmp_dext,
		       struct ext4_extent *tmp_oext,
		       ext4_lblk_t orig_off, ext4_lblk_t donor_off,
@@ -564,6 +609,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 	ext4_lblk_t diff, orig_diff;
 	struct ext4_extent dext_old, oext_old;
 
+	BUG_ON(orig_off != donor_off);
+
+	/* original and donor extents have to cover the same block offset */
+	if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
+	    le32_to_cpu(tmp_oext->ee_block) +
+			ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
+		return -ENODATA;
+
+	if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
+	    le32_to_cpu(tmp_dext->ee_block) +
+			ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
+		return -ENODATA;
+
 	dext_old = *tmp_dext;
 	oext_old = *tmp_oext;
 
@@ -591,6 +649,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
 
 	copy_extent_status(&oext_old, tmp_dext);
 	copy_extent_status(&dext_old, tmp_oext);
+
+	return 0;
 }
 
 /**
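
In the hunks above and below, several assertions are downgraded from BUG_ON(), which halts the kernel, to ext4_error() plus an -EIO or -ENODATA return, so inconsistent extent metadata fails the ioctl instead of crashing the machine. The control-flow pattern in isolation (validate_range() is an illustrative name):

	#include <errno.h>
	#include <stdio.h>

	/* Old style would assert; new style reports and returns. */
	static int validate_range(unsigned int start, unsigned int len,
				  unsigned int last)
	{
		if (start + len - 1 < last) {
			fprintf(stderr, "range end %u should cover %u\n",
				start + len - 1, last);
			return -EIO;
		}
		return 0;
	}

	int main(void)
	{
		printf("%d\n", validate_range(10, 5, 14));	/* 0 */
		printf("%d\n", validate_range(10, 2, 14));	/* -EIO */
		return 0;
	}
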
@@ -631,13 +691,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	mext_double_down_write(orig_inode, donor_inode);
 
 	/* Get the original extent for the block "orig_off" */
-	get_ext_path(orig_path, orig_inode, orig_off, err);
-	if (orig_path == NULL)
+	err = get_ext_path(orig_inode, orig_off, &orig_path);
+	if (err)
 		goto out;
 
 	/* Get the donor extent for the head */
-	get_ext_path(donor_path, donor_inode, donor_off, err);
-	if (donor_path == NULL)
+	err = get_ext_path(donor_inode, donor_off, &donor_path);
+	if (err)
 		goto out;
 	depth = ext_depth(orig_inode);
 	oext = orig_path[depth].p_ext;
@@ -647,13 +707,28 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	dext = donor_path[depth].p_ext;
 	tmp_dext = *dext;
 
-	mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+	err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
				      donor_off, count);
+	if (err)
+		goto out;
 
 	/* Loop for the donor extents */
 	while (1) {
 		/* The extent for donor must be found. */
-		BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
+		if (!dext) {
+			ext4_error(donor_inode->i_sb, __func__,
+				   "The extent for donor must be found");
+			err = -EIO;
+			goto out;
+		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
+			ext4_error(donor_inode->i_sb, __func__,
+				"Donor offset(%u) and the first block of donor "
+				"extent(%u) should be equal",
+				donor_off,
+				le32_to_cpu(tmp_dext.ee_block));
+			err = -EIO;
+			goto out;
+		}
 
 		/* Set donor extent to orig extent */
 		err = mext_leaf_block(handle, orig_inode,
@@ -678,8 +753,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 
 		if (orig_path)
 			ext4_ext_drop_refs(orig_path);
-		get_ext_path(orig_path, orig_inode, orig_off, err);
-		if (orig_path == NULL)
+		err = get_ext_path(orig_inode, orig_off, &orig_path);
+		if (err)
 			goto out;
 		depth = ext_depth(orig_inode);
 		oext = orig_path[depth].p_ext;
@@ -692,9 +767,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 
 		if (donor_path)
 			ext4_ext_drop_refs(donor_path);
-		get_ext_path(donor_path, donor_inode,
-				      donor_off, err);
-		if (donor_path == NULL)
+		err = get_ext_path(donor_inode, donor_off, &donor_path);
+		if (err)
 			goto out;
 		depth = ext_depth(donor_inode);
 		dext = donor_path[depth].p_ext;
@@ -705,9 +779,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 		}
 		tmp_dext = *dext;
 
-		mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
-				       donor_off,
-				       count - replaced_count);
+		err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+					   donor_off, count - replaced_count);
+		if (err)
+			goto out;
 	}
 
 out:
@@ -740,7 +815,7 @@ out:
 * on success, or a negative error value on failure.
 */
 static int
-move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
+move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
		  pgoff_t orig_page_offset, int data_offset_in_page,
		  int block_len_in_page, int uninit)
 {
@@ -871,6 +946,7 @@ out:
 		if (PageLocked(page))
 			unlock_page(page);
 		page_cache_release(page);
+		ext4_journal_stop(handle);
 	}
 out2:
 	ext4_journal_stop(handle);
@@ -897,6 +973,10 @@ mext_check_arguments(struct inode *orig_inode,
		     struct inode *donor_inode, __u64 orig_start,
		     __u64 donor_start, __u64 *len, __u64 moved_len)
 {
+	ext4_lblk_t orig_blocks, donor_blocks;
+	unsigned int blkbits = orig_inode->i_blkbits;
+	unsigned int blocksize = 1 << blkbits;
+
 	/* Regular file check */
 	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
 		ext4_debug("ext4 move extent: The argument files should be "
@@ -960,54 +1040,58 @@ mext_check_arguments(struct inode *orig_inode,
 		return -EINVAL;
 	}
 
-	if ((orig_start > MAX_DEFRAG_SIZE) ||
-	    (donor_start > MAX_DEFRAG_SIZE) ||
-	    (*len > MAX_DEFRAG_SIZE) ||
-	    (orig_start + *len > MAX_DEFRAG_SIZE)) {
-		ext4_debug("ext4 move extent: Can't handle over [%lu] blocks "
-			"[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE,
+	if ((orig_start > EXT_MAX_BLOCK) ||
+	    (donor_start > EXT_MAX_BLOCK) ||
+	    (*len > EXT_MAX_BLOCK) ||
+	    (orig_start + *len > EXT_MAX_BLOCK)) {
+		ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
+			"[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
 
 	if (orig_inode->i_size > donor_inode->i_size) {
-		if (orig_start >= donor_inode->i_size) {
+		donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
+		/* TODO: eliminate this artificial restriction */
+		if (orig_start >= donor_blocks) {
 			ext4_debug("ext4 move extent: orig start offset "
-			"[%llu] should be less than donor file size "
-			"[%lld] [ino:orig %lu, donor_inode %lu]\n",
-			orig_start, donor_inode->i_size,
+			"[%llu] should be less than donor file blocks "
+			"[%u] [ino:orig %lu, donor %lu]\n",
+			orig_start, donor_blocks,
			orig_inode->i_ino, donor_inode->i_ino);
 			return -EINVAL;
 		}
 
-		if (orig_start + *len > donor_inode->i_size) {
+		/* TODO: eliminate this artificial restriction */
+		if (orig_start + *len > donor_blocks) {
 			ext4_debug("ext4 move extent: End offset [%llu] should "
-				"be less than donor file size [%lld]."
-				"So adjust length from %llu to %lld "
+				"be less than donor file blocks [%u]."
+				"So adjust length from %llu to %llu "
				"[ino:orig %lu, donor %lu]\n",
-				orig_start + *len, donor_inode->i_size,
-				*len, donor_inode->i_size - orig_start,
+				orig_start + *len, donor_blocks,
+				*len, donor_blocks - orig_start,
				orig_inode->i_ino, donor_inode->i_ino);
-			*len = donor_inode->i_size - orig_start;
+			*len = donor_blocks - orig_start;
 		}
 	} else {
-		if (orig_start >= orig_inode->i_size) {
+		orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
+		if (orig_start >= orig_blocks) {
 			ext4_debug("ext4 move extent: start offset [%llu] "
-				"should be less than original file size "
-				"[%lld] [inode:orig %lu, donor %lu]\n",
-				orig_start, orig_inode->i_size,
+				"should be less than original file blocks "
+				"[%u] [ino:orig %lu, donor %lu]\n",
+				orig_start, orig_blocks,
				orig_inode->i_ino, donor_inode->i_ino);
 			return -EINVAL;
 		}
 
-		if (orig_start + *len > orig_inode->i_size) {
+		if (orig_start + *len > orig_blocks) {
 			ext4_debug("ext4 move extent: Adjust length "
-				"from %llu to %lld. Because it should be "
-				"less than original file size "
+				"from %llu to %llu. Because it should be "
+				"less than original file blocks "
				"[ino:orig %lu, donor %lu]\n",
-				*len, orig_inode->i_size - orig_start,
+				*len, orig_blocks - orig_start,
				orig_inode->i_ino, donor_inode->i_ino);
-			*len = orig_inode->i_size - orig_start;
+			*len = orig_blocks - orig_start;
 		}
 	}
 
@@ -1027,18 +1111,23 @@ mext_check_arguments(struct inode *orig_inode,
 * @inode1:	the inode structure
 * @inode2:	the inode structure
 *
- * Lock two inodes' i_mutex by i_ino order. This function is moved from
- * fs/inode.c.
+ * Lock two inodes' i_mutex by i_ino order.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
 */
-static void
+static int
 mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
 {
-	if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
-		if (inode1)
-			mutex_lock(&inode1->i_mutex);
-		else if (inode2)
-			mutex_lock(&inode2->i_mutex);
-		return;
+	int ret = 0;
+
+	BUG_ON(inode1 == NULL && inode2 == NULL);
+
+	ret = mext_check_null_inode(inode1, inode2, __func__);
+	if (ret < 0)
+		goto out;
+
+	if (inode1 == inode2) {
+		mutex_lock(&inode1->i_mutex);
+		goto out;
 	}
 
 	if (inode1->i_ino < inode2->i_ino) {
@@ -1048,6 +1137,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
 		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
 		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
 	}
+
+out:
+	return ret;
 }
 
 /**
@@ -1056,17 +1148,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
 * @inode1:	the inode that is released first
 * @inode2:	the inode that is released second
 *
- * This function is moved from fs/inode.c.
+ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
 */
 
-static void
+static int
 mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
 {
+	int ret = 0;
+
+	BUG_ON(inode1 == NULL && inode2 == NULL);
+
+	ret = mext_check_null_inode(inode1, inode2, __func__);
+	if (ret < 0)
+		goto out;
+
 	if (inode1)
 		mutex_unlock(&inode1->i_mutex);
 
 	if (inode2 && inode2 != inode1)
 		mutex_unlock(&inode2->i_mutex);
+
+out:
+	return ret;
 }
 
 /**
@@ -1123,70 +1226,76 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
 	ext4_lblk_t rest_blocks;
 	pgoff_t orig_page_offset = 0, seq_end_page;
-	int ret, depth, last_extent = 0;
+	int ret1, ret2, depth, last_extent = 0;
 	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
 	int data_offset_in_page;
 	int block_len_in_page;
 	int uninit;
 
 	/* protect orig and donor against a truncate */
-	mext_inode_double_lock(orig_inode, donor_inode);
+	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
+	if (ret1 < 0)
+		return ret1;
 
 	mext_double_down_read(orig_inode, donor_inode);
 	/* Check the filesystem environment whether move_extent can be done */
-	ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
+	ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
					donor_start, &len, *moved_len);
 	mext_double_up_read(orig_inode, donor_inode);
-	if (ret)
-		goto out2;
+	if (ret1)
+		goto out;
 
 	file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
 	block_end = block_start + len - 1;
 	if (file_end < block_end)
 		len -= block_end - file_end;
 
-	get_ext_path(orig_path, orig_inode, block_start, ret);
-	if (orig_path == NULL)
-		goto out2;
+	ret1 = get_ext_path(orig_inode, block_start, &orig_path);
+	if (ret1)
+		goto out;
 
 	/* Get path structure to check the hole */
-	get_ext_path(holecheck_path, orig_inode, block_start, ret);
-	if (holecheck_path == NULL)
+	ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
+	if (ret1)
 		goto out;
 
 	depth = ext_depth(orig_inode);
 	ext_cur = holecheck_path[depth].p_ext;
-	if (ext_cur == NULL) {
-		ret = -EINVAL;
-		goto out;
-	}
 
 	/*
-	 * Get proper extent whose ee_block is beyond block_start
-	 * if block_start was within the hole.
+	 * Get proper starting location of block replacement if block_start was
+	 * within the hole.
	 */
 	if (le32_to_cpu(ext_cur->ee_block) +
		ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
+		/*
+		 * The hole exists between extents or the tail of
+		 * original file.
+		 */
 		last_extent = mext_next_extent(orig_inode,
					holecheck_path, &ext_cur);
 		if (last_extent < 0) {
-			ret = last_extent;
+			ret1 = last_extent;
			goto out;
		}
 		last_extent = mext_next_extent(orig_inode, orig_path,
							&ext_dummy);
 		if (last_extent < 0) {
-			ret = last_extent;
+			ret1 = last_extent;
			goto out;
		}
-	}
-	seq_start = block_start;
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+	} else if (le32_to_cpu(ext_cur->ee_block) > block_start)
+		/* The hole exists at the beginning of original file. */
+		seq_start = le32_to_cpu(ext_cur->ee_block);
+	else
+		seq_start = block_start;
 
 	/* No blocks within the specified range. */
 	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
 		ext4_debug("ext4 move extent: The specified range of file "
							"may be the hole\n");
-		ret = -EINVAL;
+		ret1 = -EINVAL;
 		goto out;
 	}
 
@@ -1206,7 +1315,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		last_extent = mext_next_extent(orig_inode, holecheck_path,
						&ext_cur);
 		if (last_extent < 0) {
-			ret = last_extent;
+			ret1 = last_extent;
			break;
		}
 		add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1258,16 +1367,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		while (orig_page_offset <= seq_end_page) {
 
 			/* Swap original branches with new branches */
-			ret = move_extent_par_page(o_filp, donor_inode,
+			ret1 = move_extent_per_page(o_filp, donor_inode,
						orig_page_offset,
						data_offset_in_page,
						block_len_in_page, uninit);
-			if (ret < 0)
+			if (ret1 < 0)
				goto out;
			orig_page_offset++;
 			/* Count how many blocks we have exchanged */
 			*moved_len += block_len_in_page;
-			BUG_ON(*moved_len > len);
+			if (*moved_len > len) {
+				ext4_error(orig_inode->i_sb, __func__,
+					"We replaced blocks too much! "
+					"sum of replaced: %llu requested: %llu",
+					*moved_len, len);
+				ret1 = -EIO;
+				goto out;
+			}
 
 			data_offset_in_page = 0;
 			rest_blocks -= block_len_in_page;
@@ -1280,17 +1396,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		/* Decrease buffer counter */
 		if (holecheck_path)
			ext4_ext_drop_refs(holecheck_path);
-		get_ext_path(holecheck_path, orig_inode,
-				      seq_start, ret);
-		if (holecheck_path == NULL)
+		ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
+		if (ret1)
			break;
 		depth = holecheck_path->p_depth;
 
 		/* Decrease buffer counter */
 		if (orig_path)
			ext4_ext_drop_refs(orig_path);
-		get_ext_path(orig_path, orig_inode, seq_start, ret);
-		if (orig_path == NULL)
+		ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
+		if (ret1)
			break;
 
 		ext_cur = holecheck_path[depth].p_ext;
@@ -1307,14 +1422,13 @@ out:
 		ext4_ext_drop_refs(holecheck_path);
 		kfree(holecheck_path);
 	}
-out2:
-	mext_inode_double_unlock(orig_inode, donor_inode);
 
-	if (ret)
-		return ret;
+	ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
 
-	/* All of the specified blocks must be exchanged in succeed */
-	BUG_ON(*moved_len != len);
+	if (ret1)
+		return ret1;
+	else if (ret2)
+		return ret2;
 
 	return 0;
 }
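
mext_inode_double_lock() still takes the two i_mutexes in ascending i_ino order, so racing move-extent calls on the same pair of files always acquire the locks in the same sequence and cannot deadlock; the change is only that NULL inodes now yield -EIO instead of being assumed away. The ordering idea in a pthread sketch (obj and obj_lock_pair are illustrative names):

	#include <pthread.h>
	#include <stdio.h>

	struct obj {
		unsigned long id;	/* plays the role of i_ino */
		pthread_mutex_t lock;
	};

	/* A global lock order (smaller id first) prevents deadlock. */
	static void obj_lock_pair(struct obj *a, struct obj *b)
	{
		if (a == b) {
			pthread_mutex_lock(&a->lock);
			return;
		}
		if (a->id > b->id) {
			struct obj *t = a;

			a = b;
			b = t;
		}
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	}

	int main(void)
	{
		struct obj x = { 12, PTHREAD_MUTEX_INITIALIZER };
		struct obj y = { 34, PTHREAD_MUTEX_INITIALIZER };

		obj_lock_pair(&y, &x);	/* locks x (id 12) before y (id 34) */
		printf("locked both\n");
		pthread_mutex_unlock(&x.lock);
		pthread_mutex_unlock(&y.lock);
		return 0;
	}
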
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 114abe5d2c1d..42f81d285cd5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,8 +1518,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 			return retval;
 
 		if (blocks == 1 && !dx_fallback &&
-		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
-			return make_indexed_dir(handle, dentry, inode, bh);
+		    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+			retval = make_indexed_dir(handle, dentry, inode, bh);
+			if (retval == -ENOSPC)
+				brelse(bh);
+			return retval;
+		}
 		brelse(bh);
 	}
 	bh = ext4_append(handle, dir, &block, &retval);
@@ -1528,7 +1532,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	de->inode = 0;
 	de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
-	return add_dirent_to_buf(handle, dentry, inode, de, bh);
+	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+	if (retval == -ENOSPC)
+		brelse(bh);
+	return retval;
 }
 
 /*
@@ -1590,9 +1597,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			goto cleanup;
 		node2 = (struct dx_node *)(bh2->b_data);
 		entries2 = node2->entries;
+		memset(&node2->fake, 0, sizeof(struct fake_dirent));
 		node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
							   sb->s_blocksize);
-		node2->fake.inode = 0;
 		BUFFER_TRACE(frame->bh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, frame->bh);
 		if (err)
@@ -1657,7 +1664,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 	if (!de)
 		goto cleanup;
 	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
-	bh = NULL;
+	if (err != -ENOSPC)
+		bh = NULL;
 	goto cleanup;
 
 journal_error:
@@ -2310,7 +2318,7 @@ static int ext4_link(struct dentry *old_dentry,
 	struct inode *inode = old_dentry->d_inode;
 	int err, retries = 0;
 
-	if (EXT4_DIR_LINK_MAX(inode))
+	if (inode->i_nlink >= EXT4_LINK_MAX)
 		return -EMLINK;
 
 	/*
@@ -2413,7 +2421,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto end_rename;
 		retval = -EMLINK;
 		if (!new_inode && new_dir != old_dir &&
-				new_dir->i_nlink >= EXT4_LINK_MAX)
+		    EXT4_DIR_LINK_MAX(new_dir))
 			goto end_rename;
 	}
 	if (!new_bh) {
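
The namei.c hunks close a buffer-head leak: add_dirent_to_buf() consumes the caller's bh reference on every outcome except -ENOSPC, so callers must brelse() it themselves in exactly that case, which is what the added checks do. The ownership rule as a refcount sketch (names are illustrative):

	#include <errno.h>
	#include <stdio.h>

	struct buf { int refcount; };

	static void put_buf(struct buf *b) { b->refcount--; }

	/* Consumes the caller's reference unless it returns -ENOSPC. */
	static int add_entry(struct buf *b, int full)
	{
		if (full)
			return -ENOSPC;	/* caller keeps its reference */
		put_buf(b);		/* reference consumed here */
		return 0;
	}

	int main(void)
	{
		struct buf b = { .refcount = 1 };
		int err = add_entry(&b, 1);

		if (err == -ENOSPC)
			put_buf(&b);	/* the fix: drop it ourselves */
		printf("refcount=%d err=%d\n", b.refcount, err);
		return 0;
	}
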
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 68b0351fc647..3cfc343c41b5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	struct inode *inode = NULL;
 	handle_t *handle;
 	int gdb_off, gdb_num;
-	int num_grp_locked = 0;
 	int err, err2;
 
 	gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	 * using the new disk blocks.
 	 */
 
-	num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
 	/* Update group descriptor block for new group */
 	gdp = (struct ext4_group_desc *)((char *)primary->b_data +
					 gdb_off * EXT4_DESC_SIZE(sb));
@@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	 * descriptor
 	 */
 	err = ext4_mb_add_groupinfo(sb, input->group, gdp);
-	if (err) {
-		ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
+	if (err)
 		goto exit_journal;
-	}
 
 	/*
 	 * Make the new blocks and inodes valid next. We do this before
@@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	/* Update the global fs size fields */
 	sbi->s_groups_count++;
-	ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
 
 	ext4_handle_dirty_metadata(handle, NULL, primary);
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8f4f079e6b9a..df539ba27779 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "mballoc.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/ext4.h>
@@ -344,7 +345,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
 		errstr = "Out of memory";
 		break;
 	case -EROFS:
-		if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
+		if (!sb || (EXT4_SB(sb)->s_journal &&
+			    EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
 			errstr = "Journal has aborted";
 		else
 			errstr = "Readonly filesystem";
@@ -962,7 +964,7 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
 static ssize_t ext4_quota_write(struct super_block *sb, int type,
				const char *data, size_t len, loff_t off);
 
-static struct dquot_operations ext4_quota_operations = {
+static const struct dquot_operations ext4_quota_operations = {
 	.initialize	= dquot_initialize,
 	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
@@ -983,7 +985,7 @@ static struct dquot_operations ext4_quota_operations = {
 	.destroy_dquot	= dquot_destroy,
 };
 
-static struct quotactl_ops ext4_qctl_operations = {
+static const struct quotactl_ops ext4_qctl_operations = {
 	.quota_on	= ext4_quota_on,
 	.quota_off	= vfs_quota_off,
 	.quota_sync	= vfs_quota_sync,
@@ -1279,11 +1281,9 @@ static int parse_options(char *options, struct super_block *sb,
 			*journal_devnum = option;
 			break;
 		case Opt_journal_checksum:
-			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
-			break;
+			break;	/* Kept for backwards compatibility */
 		case Opt_journal_async_commit:
 			set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
-			set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
 			break;
 		case Opt_noload:
 			set_opt(sbi->s_mount_opt, NOLOAD);
@@ -1695,12 +1695,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 
 		flex_group = ext4_flex_group(sbi, i);
-		atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
-			   ext4_free_inodes_count(sb, gdp));
-		atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
-			   ext4_free_blks_count(sb, gdp));
-		atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
-			   ext4_used_dirs_count(sb, gdp));
+		atomic_add(ext4_free_inodes_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].free_inodes);
+		atomic_add(ext4_free_blks_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].free_blocks);
+		atomic_add(ext4_used_dirs_count(sb, gdp),
+			   &sbi->s_flex_groups[flex_group].used_dirs);
 	}
 
 	return 1;
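
The ext4_fill_flex_info() fix matters because several block groups fold into one flex group: atomic_set() in the loop overwrote the running totals with the last group's counters, while atomic_add() accumulates them. The difference in a C11 sketch:

	#include <stdatomic.h>
	#include <stdio.h>

	#define GROUPS_PER_FLEX 4

	int main(void)
	{
		unsigned int free_per_group[] = { 100, 80, 60, 40 };
		atomic_uint flex_free = 0;
		int i;

		/* A plain store here would leave 40; adding yields the sum. */
		for (i = 0; i < GROUPS_PER_FLEX; i++)
			atomic_fetch_add(&flex_free, free_per_group[i]);

		printf("%u\n", atomic_load(&flex_free));	/* 280 */
		return 0;
	}
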
@@ -2253,6 +2253,49 @@ static struct kobj_type ext4_ktype = {
 	.release	= ext4_sb_release,
 };
 
+/*
+ * Check whether this filesystem can be mounted based on
+ * the features present and the RDONLY/RDWR mount requested.
+ * Returns 1 if this filesystem can be mounted as requested,
+ * 0 if it cannot be.
+ */
+static int ext4_feature_set_ok(struct super_block *sb, int readonly)
+{
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+		ext4_msg(sb, KERN_ERR,
+			"Couldn't mount because of "
+			"unsupported optional features (%x)",
+			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+			~EXT4_FEATURE_INCOMPAT_SUPP));
+		return 0;
+	}
+
+	if (readonly)
+		return 1;
+
+	/* Check that feature set is OK for a read-write mount */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+		ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
+			 "unsupported optional features (%x)",
+			 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+			  ~EXT4_FEATURE_RO_COMPAT_SUPP));
+		return 0;
+	}
+	/*
+	 * Large file size enabled file system can only be mounted
+	 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
+	 */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+		if (sizeof(blkcnt_t) < sizeof(u64)) {
+			ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
+				 "cannot be mounted RDWR without "
+				 "CONFIG_LBDAF");
+			return 0;
+		}
+	}
+	return 1;
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
				__releases(kernel_lock)
				__acquires(kernel_lock)
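
ext4_feature_set_ok() centralizes the feature checks so the mount and remount paths cannot drift apart. Its core is bitmask arithmetic: AND the on-disk feature word with the complement of the supported mask, and any nonzero remainder names an unsupported feature. A standalone illustration (the feature constants are made up):

	#include <stdio.h>

	#define FEAT_A		0x01
	#define FEAT_B		0x02
	#define FEAT_FUTURE	0x80	/* unknown to this "kernel" */

	#define SUPPORTED	(FEAT_A | FEAT_B)

	static int feature_set_ok(unsigned int on_disk)
	{
		unsigned int unsupported = on_disk & ~SUPPORTED;

		if (unsupported) {
			fprintf(stderr,
				"unsupported optional features (%x)\n",
				unsupported);
			return 0;
		}
		return 1;
	}

	int main(void)
	{
		printf("%d\n", feature_set_ok(FEAT_A | FEAT_B));      /* 1 */
		printf("%d\n", feature_set_ok(FEAT_A | FEAT_FUTURE)); /* 0 */
		return 0;
	}
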
@@ -2274,7 +2317,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned int db_count;
 	unsigned int i;
 	int needs_recovery, has_huge_files;
-	int features;
 	__u64 blocks_count;
 	int err;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -2401,39 +2443,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * previously didn't change the revision level when setting the flags,
 	 * so there is a chance incompat flags are set on a rev 0 filesystem.
 	 */
-	features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
-	if (features) {
-		ext4_msg(sb, KERN_ERR,
-			"Couldn't mount because of "
-			"unsupported optional features (%x)",
-			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
-			~EXT4_FEATURE_INCOMPAT_SUPP));
-		goto failed_mount;
-	}
-	features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
-	if (!(sb->s_flags & MS_RDONLY) && features) {
-		ext4_msg(sb, KERN_ERR,
-			"Couldn't mount RDWR because of "
-			"unsupported optional features (%x)",
-			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
-			~EXT4_FEATURE_RO_COMPAT_SUPP));
-		goto failed_mount;
-	}
-	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
-				EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
-	if (has_huge_files) {
-		/*
-		 * Large file size enabled file system can only be
-		 * mount if kernel is build with CONFIG_LBDAF
-		 */
-		if (sizeof(root->i_blocks) < sizeof(u64) &&
-				!(sb->s_flags & MS_RDONLY)) {
-			ext4_msg(sb, KERN_ERR, "Filesystem with huge "
-					"files cannot be mounted read-write "
-					"without CONFIG_LBDAF");
-			goto failed_mount;
-		}
-	}
+	if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
+		goto failed_mount;
+
 	blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
 
 	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -2469,6 +2481,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	}
 
+	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+				EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
 	sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
						      has_huge_files);
 	sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -2549,12 +2563,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	if (ext4_blocks_count(es) >
-		    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
+	/*
+	 * Test whether we have more sectors than will fit in sector_t,
+	 * and whether the max offset is addressable by the page cache.
+	 */
+	if ((ext4_blocks_count(es) >
+	     (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
+	    (ext4_blocks_count(es) >
+	     (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
 		ext4_msg(sb, KERN_ERR, "filesystem"
-			" too large to mount safely");
+			 " too large to mount safely on this system");
 		if (sizeof(sector_t) < 8)
			ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
+		ret = -EFBIG;
 		goto failed_mount;
 	}
 
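
The enlarged size check above guards two independent limits: the block count must fit in sector_t when expressed in 512-byte sectors (shift by blocksize_bits - 9) and in pgoff_t when expressed in pages (shift by PAGE_CACHE_SHIFT - blocksize_bits). Worked numbers for 4 KiB blocks and pages, assuming 32-bit sector_t and pgoff_t purely for illustration:

	#include <stdio.h>

	int main(void)
	{
		unsigned int blkbits = 12;	/* 4 KiB blocks */
		unsigned int page_shift = 12;	/* 4 KiB pages */
		unsigned long long limit32 = 0xFFFFFFFFULL;

		/* Max block counts addressable through each 32-bit type. */
		printf("sector_t limit: %llu blocks\n",
		       limit32 >> (blkbits - 9));		/* ~2^29 */
		printf("pgoff_t limit:  %llu blocks\n",
		       limit32 >> (page_shift - blkbits));	/* 2^32 - 1 */
		return 0;
	}
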
@@ -2595,6 +2616,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2595 goto failed_mount; 2616 goto failed_mount;
2596 } 2617 }
2597 sbi->s_groups_count = blocks_count; 2618 sbi->s_groups_count = blocks_count;
2619 sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
2620 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
2598 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 2621 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2599 EXT4_DESC_PER_BLOCK(sb); 2622 EXT4_DESC_PER_BLOCK(sb);
2600 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), 2623 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
@@ -2729,20 +2752,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2729 goto failed_mount4; 2752 goto failed_mount4;
2730 } 2753 }
2731 2754
2732 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 2755 jbd2_journal_set_features(sbi->s_journal,
2733 jbd2_journal_set_features(sbi->s_journal, 2756 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2734 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 2757 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
2758 jbd2_journal_set_features(sbi->s_journal, 0, 0,
2735 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2759 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2736 } else if (test_opt(sb, JOURNAL_CHECKSUM)) { 2760 else
2737 jbd2_journal_set_features(sbi->s_journal,
2738 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2739 jbd2_journal_clear_features(sbi->s_journal, 0, 0, 2761 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
2740 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2762 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2741 } else {
2742 jbd2_journal_clear_features(sbi->s_journal,
2743 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2744 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2745 }
2746 2763
2747 /* We have now updated the journal if required, so we can 2764 /* We have now updated the journal if required, so we can
2748 * validate the data journaling mode. */ 2765 * validate the data journaling mode. */
@@ -3208,7 +3225,18 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3208 clear_buffer_write_io_error(sbh); 3225 clear_buffer_write_io_error(sbh);
3209 set_buffer_uptodate(sbh); 3226 set_buffer_uptodate(sbh);
3210 } 3227 }
3211 es->s_wtime = cpu_to_le32(get_seconds()); 3228 /*
3229 * If the file system is mounted read-only, don't update the
3230 * superblock write time. This avoids updating the superblock
3231 * write time when we are mounting the root file system
3232 * read-only but we need to replay the journal; at that point,
3233 * for people who are east of GMT and who make their clock
3234 * tick in localtime for Windows bug-for-bug compatibility,
3235 * the clock is set in the future, and this will cause e2fsck
3236 * to complain and force a full file system check.
3237 */
3238 if (!(sb->s_flags & MS_RDONLY))
3239 es->s_wtime = cpu_to_le32(get_seconds());
3212 es->s_kbytes_written = 3240 es->s_kbytes_written =
3213 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 3241 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3214 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 3242 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -3477,18 +3505,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3477 if (sbi->s_journal) 3505 if (sbi->s_journal)
3478 ext4_mark_recovery_complete(sb, es); 3506 ext4_mark_recovery_complete(sb, es);
3479 } else { 3507 } else {
3480 int ret; 3508 /* Make sure we can mount this feature set readwrite */
3481 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 3509 if (!ext4_feature_set_ok(sb, 0)) {
3482 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
3483 ext4_msg(sb, KERN_WARNING, "couldn't "
3484 "remount RDWR because of unsupported "
3485 "optional features (%x)",
3486 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
3487 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3488 err = -EROFS; 3510 err = -EROFS;
3489 goto restore_opts; 3511 goto restore_opts;
3490 } 3512 }
3491
3492 /* 3513 /*
3493 * Make sure the group descriptor checksums 3514 * Make sure the group descriptor checksums
3494 * are sane. If they aren't, refuse to remount r/w. 3515 * are sane. If they aren't, refuse to remount r/w.
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 62b31c246994..fed5b01d7a8d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,12 +810,23 @@ inserted:
810 get_bh(new_bh); 810 get_bh(new_bh);
811 } else { 811 } else {
812 /* We need to allocate a new block */ 812 /* We need to allocate a new block */
813 ext4_fsblk_t goal = ext4_group_first_block_no(sb, 813 ext4_fsblk_t goal, block;
814
815 goal = ext4_group_first_block_no(sb,
814 EXT4_I(inode)->i_block_group); 816 EXT4_I(inode)->i_block_group);
815 ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, 817
818 /* non-extent files can't have physical blocks past 2^32 */
819 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
820 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
821
822 block = ext4_new_meta_blocks(handle, inode,
816 goal, NULL, &error); 823 goal, NULL, &error);
817 if (error) 824 if (error)
818 goto cleanup; 825 goto cleanup;
826
827 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
828 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
829
819 ea_idebug(inode, "creating block %d", block); 830 ea_idebug(inode, "creating block %d", block);
820 831
821 new_bh = sb_getblk(sb, block); 832 new_bh = sb_getblk(sb, block);
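The xattr change applies the same 32-bit physical-block limit at allocation time: for non-extent files the goal is masked into the low 2^32 blocks, and the BUG_ON after allocation asserts the allocator honored it. A reduced sketch of the masking step, with 0xFFFFFFFF assumed as the value of EXT4_MAX_BLOCK_FILE_PHYS:

typedef unsigned long long fsblk_t;
#define MAX_BLOCK_FILE_PHYS 0xFFFFFFFFULL	/* assumed EXT4_MAX_BLOCK_FILE_PHYS */

static fsblk_t xattr_goal(fsblk_t group_first_block, int extent_file)
{
	fsblk_t goal = group_first_block;

	/* Block-mapped files store physical block numbers in 32 bits,
	 * so mask the allocator goal into the low 2^32 blocks. */
	if (!extent_file)
		goal &= MAX_BLOCK_FILE_PHYS;
	return goal;
}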
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ae413086db97..fc089f2f7f56 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -263,6 +263,79 @@ pid_t f_getown(struct file *filp)
263 return pid; 263 return pid;
264} 264}
265 265
266static int f_setown_ex(struct file *filp, unsigned long arg)
267{
268 struct f_owner_ex * __user owner_p = (void * __user)arg;
269 struct f_owner_ex owner;
270 struct pid *pid;
271 int type;
272 int ret;
273
274 ret = copy_from_user(&owner, owner_p, sizeof(owner));
275 if (ret)
276 return ret;
277
278 switch (owner.type) {
279 case F_OWNER_TID:
280 type = PIDTYPE_MAX;
281 break;
282
283 case F_OWNER_PID:
284 type = PIDTYPE_PID;
285 break;
286
287 case F_OWNER_GID:
288 type = PIDTYPE_PGID;
289 break;
290
291 default:
292 return -EINVAL;
293 }
294
295 rcu_read_lock();
296 pid = find_vpid(owner.pid);
297 if (owner.pid && !pid)
298 ret = -ESRCH;
299 else
300 ret = __f_setown(filp, pid, type, 1);
301 rcu_read_unlock();
302
303 return ret;
304}
305
306static int f_getown_ex(struct file *filp, unsigned long arg)
307{
308 struct f_owner_ex * __user owner_p = (void * __user)arg;
309 struct f_owner_ex owner;
310 int ret = 0;
311
312 read_lock(&filp->f_owner.lock);
313 owner.pid = pid_vnr(filp->f_owner.pid);
314 switch (filp->f_owner.pid_type) {
315 case PIDTYPE_MAX:
316 owner.type = F_OWNER_TID;
317 break;
318
319 case PIDTYPE_PID:
320 owner.type = F_OWNER_PID;
321 break;
322
323 case PIDTYPE_PGID:
324 owner.type = F_OWNER_GID;
325 break;
326
327 default:
328 WARN_ON(1);
329 ret = -EINVAL;
330 break;
331 }
332 read_unlock(&filp->f_owner.lock);
333
334 if (!ret)
335 ret = copy_to_user(owner_p, &owner, sizeof(owner));
336 return ret;
337}
338
266static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, 339static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
267 struct file *filp) 340 struct file *filp)
268{ 341{
@@ -313,6 +386,12 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
313 case F_SETOWN: 386 case F_SETOWN:
314 err = f_setown(filp, arg, 1); 387 err = f_setown(filp, arg, 1);
315 break; 388 break;
389 case F_GETOWN_EX:
390 err = f_getown_ex(filp, arg);
391 break;
392 case F_SETOWN_EX:
393 err = f_setown_ex(filp, arg);
394 break;
316 case F_GETSIG: 395 case F_GETSIG:
317 err = filp->f_owner.signum; 396 err = filp->f_owner.signum;
318 break; 397 break;
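Seen from user space, the new command pair lets a multithreaded program direct SIGIO at a single thread (F_OWNER_TID, backed by the PIDTYPE_MAX sentinel above) instead of the whole thread group. A hedged usage sketch; the command values and struct layout follow this patch and are defined locally in case pre-patch headers lack them:

#include <sys/types.h>
#include <fcntl.h>

#ifndef F_SETOWN_EX
#define F_SETOWN_EX	15
#define F_GETOWN_EX	16
#define F_OWNER_TID	0
#define F_OWNER_PID	1
#define F_OWNER_GID	2
struct f_owner_ex {
	int	type;
	pid_t	pid;
};
#endif

/* Route SIGIO for 'fd' to a single thread rather than the group. */
static int own_fd_as_thread(int fd, pid_t tid)
{
	struct f_owner_ex owner = { .type = F_OWNER_TID, .pid = tid };

	return fcntl(fd, F_SETOWN_EX, &owner);
}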
@@ -428,8 +507,7 @@ static inline int sigio_perm(struct task_struct *p,
428 507
429static void send_sigio_to_task(struct task_struct *p, 508static void send_sigio_to_task(struct task_struct *p,
430 struct fown_struct *fown, 509 struct fown_struct *fown,
431 int fd, 510 int fd, int reason, int group)
432 int reason)
433{ 511{
434 /* 512 /*
435 * F_SETSIG can change ->signum lockless in parallel, make 513 * F_SETSIG can change ->signum lockless in parallel, make
@@ -461,11 +539,11 @@ static void send_sigio_to_task(struct task_struct *p,
461 else 539 else
462 si.si_band = band_table[reason - POLL_IN]; 540 si.si_band = band_table[reason - POLL_IN];
463 si.si_fd = fd; 541 si.si_fd = fd;
464 if (!group_send_sig_info(signum, &si, p)) 542 if (!do_send_sig_info(signum, &si, p, group))
465 break; 543 break;
466 /* fall-through: fall back on the old plain SIGIO signal */ 544 /* fall-through: fall back on the old plain SIGIO signal */
467 case 0: 545 case 0:
468 group_send_sig_info(SIGIO, SEND_SIG_PRIV, p); 546 do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group);
469 } 547 }
470} 548}
471 549
@@ -474,16 +552,23 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
474 struct task_struct *p; 552 struct task_struct *p;
475 enum pid_type type; 553 enum pid_type type;
476 struct pid *pid; 554 struct pid *pid;
555 int group = 1;
477 556
478 read_lock(&fown->lock); 557 read_lock(&fown->lock);
558
479 type = fown->pid_type; 559 type = fown->pid_type;
560 if (type == PIDTYPE_MAX) {
561 group = 0;
562 type = PIDTYPE_PID;
563 }
564
480 pid = fown->pid; 565 pid = fown->pid;
481 if (!pid) 566 if (!pid)
482 goto out_unlock_fown; 567 goto out_unlock_fown;
483 568
484 read_lock(&tasklist_lock); 569 read_lock(&tasklist_lock);
485 do_each_pid_task(pid, type, p) { 570 do_each_pid_task(pid, type, p) {
486 send_sigio_to_task(p, fown, fd, band); 571 send_sigio_to_task(p, fown, fd, band, group);
487 } while_each_pid_task(pid, type, p); 572 } while_each_pid_task(pid, type, p);
488 read_unlock(&tasklist_lock); 573 read_unlock(&tasklist_lock);
489 out_unlock_fown: 574 out_unlock_fown:
@@ -491,10 +576,10 @@ void send_sigio(struct fown_struct *fown, int fd, int band)
491} 576}
492 577
493static void send_sigurg_to_task(struct task_struct *p, 578static void send_sigurg_to_task(struct task_struct *p,
494 struct fown_struct *fown) 579 struct fown_struct *fown, int group)
495{ 580{
496 if (sigio_perm(p, fown, SIGURG)) 581 if (sigio_perm(p, fown, SIGURG))
497 group_send_sig_info(SIGURG, SEND_SIG_PRIV, p); 582 do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group);
498} 583}
499 584
500int send_sigurg(struct fown_struct *fown) 585int send_sigurg(struct fown_struct *fown)
@@ -502,10 +587,17 @@ int send_sigurg(struct fown_struct *fown)
502 struct task_struct *p; 587 struct task_struct *p;
503 enum pid_type type; 588 enum pid_type type;
504 struct pid *pid; 589 struct pid *pid;
590 int group = 1;
505 int ret = 0; 591 int ret = 0;
506 592
507 read_lock(&fown->lock); 593 read_lock(&fown->lock);
594
508 type = fown->pid_type; 595 type = fown->pid_type;
596 if (type == PIDTYPE_MAX) {
597 group = 0;
598 type = PIDTYPE_PID;
599 }
600
509 pid = fown->pid; 601 pid = fown->pid;
510 if (!pid) 602 if (!pid)
511 goto out_unlock_fown; 603 goto out_unlock_fown;
@@ -514,7 +606,7 @@ int send_sigurg(struct fown_struct *fown)
514 606
515 read_lock(&tasklist_lock); 607 read_lock(&tasklist_lock);
516 do_each_pid_task(pid, type, p) { 608 do_each_pid_task(pid, type, p) {
517 send_sigurg_to_task(p, fown); 609 send_sigurg_to_task(p, fown, group);
518 } while_each_pid_task(pid, type, p); 610 } while_each_pid_task(pid, type, p);
519 read_unlock(&tasklist_lock); 611 read_unlock(&tasklist_lock);
520 out_unlock_fown: 612 out_unlock_fown:
diff --git a/fs/file_table.c b/fs/file_table.c
index 334ce39881f8..8eb44042e009 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -74,14 +74,14 @@ EXPORT_SYMBOL_GPL(get_max_files);
74 * Handle nr_files sysctl 74 * Handle nr_files sysctl
75 */ 75 */
76#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) 76#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
77int proc_nr_files(ctl_table *table, int write, struct file *filp, 77int proc_nr_files(ctl_table *table, int write,
78 void __user *buffer, size_t *lenp, loff_t *ppos) 78 void __user *buffer, size_t *lenp, loff_t *ppos)
79{ 79{
80 files_stat.nr_files = get_nr_files(); 80 files_stat.nr_files = get_nr_files();
81 return proc_dointvec(table, write, filp, buffer, lenp, ppos); 81 return proc_dointvec(table, write, buffer, lenp, ppos);
82} 82}
83#else 83#else
84int proc_nr_files(ctl_table *table, int write, struct file *filp, 84int proc_nr_files(ctl_table *table, int write,
85 void __user *buffer, size_t *lenp, loff_t *ppos) 85 void __user *buffer, size_t *lenp, loff_t *ppos)
86{ 86{
87 return -ENOSYS; 87 return -ENOSYS;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 628235cf44b5..8e1e5e19d21e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -35,21 +35,29 @@
35int nr_pdflush_threads; 35int nr_pdflush_threads;
36 36
37/* 37/*
38 * Passed into wb_writeback(), essentially a subset of writeback_control
39 */
40struct wb_writeback_args {
41 long nr_pages;
42 struct super_block *sb;
43 enum writeback_sync_modes sync_mode;
44 int for_kupdate;
45 int range_cyclic;
46};
47
48/*
38 * Work items for the bdi_writeback threads 49 * Work items for the bdi_writeback threads
39 */ 50 */
40struct bdi_work { 51struct bdi_work {
41 struct list_head list; 52 struct list_head list; /* pending work list */
42 struct list_head wait_list; 53 struct rcu_head rcu_head; /* for RCU free/clear of work */
43 struct rcu_head rcu_head;
44 54
45 unsigned long seen; 55 unsigned long seen; /* threads that have seen this work */
46 atomic_t pending; 56 atomic_t pending; /* number of threads still to do work */
47 57
48 struct super_block *sb; 58 struct wb_writeback_args args; /* writeback arguments */
49 unsigned long nr_pages;
50 enum writeback_sync_modes sync_mode;
51 59
52 unsigned long state; 60 unsigned long state; /* flag bits, see WS_* */
53}; 61};
54 62
55enum { 63enum {
@@ -66,22 +74,13 @@ static inline bool bdi_work_on_stack(struct bdi_work *work)
66} 74}
67 75
68static inline void bdi_work_init(struct bdi_work *work, 76static inline void bdi_work_init(struct bdi_work *work,
69 struct writeback_control *wbc) 77 struct wb_writeback_args *args)
70{ 78{
71 INIT_RCU_HEAD(&work->rcu_head); 79 INIT_RCU_HEAD(&work->rcu_head);
72 work->sb = wbc->sb; 80 work->args = *args;
73 work->nr_pages = wbc->nr_to_write;
74 work->sync_mode = wbc->sync_mode;
75 work->state = WS_USED; 81 work->state = WS_USED;
76} 82}
77 83
78static inline void bdi_work_init_on_stack(struct bdi_work *work,
79 struct writeback_control *wbc)
80{
81 bdi_work_init(work, wbc);
82 work->state |= WS_ONSTACK;
83}
84
85/** 84/**
86 * writeback_in_progress - determine whether there is writeback in progress 85 * writeback_in_progress - determine whether there is writeback in progress
87 * @bdi: the device's backing_dev_info structure. 86 * @bdi: the device's backing_dev_info structure.
@@ -98,6 +97,11 @@ static void bdi_work_clear(struct bdi_work *work)
98{ 97{
99 clear_bit(WS_USED_B, &work->state); 98 clear_bit(WS_USED_B, &work->state);
100 smp_mb__after_clear_bit(); 99 smp_mb__after_clear_bit();
100 /*
101 * work can have disappeared at this point. bit waitq functions
102 * should be able to tolerate this, provided bdi_sched_wait does
103 * not dereference its pointer argument.
104 */
101 wake_up_bit(&work->state, WS_USED_B); 105 wake_up_bit(&work->state, WS_USED_B);
102} 106}
103 107
@@ -113,7 +117,8 @@ static void bdi_work_free(struct rcu_head *head)
113 117
114static void wb_work_complete(struct bdi_work *work) 118static void wb_work_complete(struct bdi_work *work)
115{ 119{
116 const enum writeback_sync_modes sync_mode = work->sync_mode; 120 const enum writeback_sync_modes sync_mode = work->args.sync_mode;
121 int onstack = bdi_work_on_stack(work);
117 122
118 /* 123 /*
119 * For allocated work, we can clear the done/seen bit right here. 124 * For allocated work, we can clear the done/seen bit right here.
@@ -121,9 +126,9 @@ static void wb_work_complete(struct bdi_work *work)
121 * to after the RCU grace period, since the stack could be invalidated 126 * to after the RCU grace period, since the stack could be invalidated
122 * as soon as bdi_work_clear() has done the wakeup. 127 * as soon as bdi_work_clear() has done the wakeup.
123 */ 128 */
124 if (!bdi_work_on_stack(work)) 129 if (!onstack)
125 bdi_work_clear(work); 130 bdi_work_clear(work);
126 if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work)) 131 if (sync_mode == WB_SYNC_NONE || onstack)
127 call_rcu(&work->rcu_head, bdi_work_free); 132 call_rcu(&work->rcu_head, bdi_work_free);
128} 133}
129 134
@@ -146,21 +151,19 @@ static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
146 151
147static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) 152static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
148{ 153{
149 if (work) { 154 work->seen = bdi->wb_mask;
150 work->seen = bdi->wb_mask; 155 BUG_ON(!work->seen);
151 BUG_ON(!work->seen); 156 atomic_set(&work->pending, bdi->wb_cnt);
152 atomic_set(&work->pending, bdi->wb_cnt); 157 BUG_ON(!bdi->wb_cnt);
153 BUG_ON(!bdi->wb_cnt);
154
155 /*
156 * Make sure stores are seen before it appears on the list
157 */
158 smp_mb();
159 158
160 spin_lock(&bdi->wb_lock); 159 /*
161 list_add_tail_rcu(&work->list, &bdi->work_list); 160 * list_add_tail_rcu() contains the necessary barriers to
162 spin_unlock(&bdi->wb_lock); 161 * make sure the above stores are seen before the item is
163 } 162 * noticed on the list
163 */
164 spin_lock(&bdi->wb_lock);
165 list_add_tail_rcu(&work->list, &bdi->work_list);
166 spin_unlock(&bdi->wb_lock);
164 167
165 /* 168 /*
166 * If the default thread isn't there, make sure we add it. When 169 * If the default thread isn't there, make sure we add it. When
@@ -171,15 +174,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
171 else { 174 else {
172 struct bdi_writeback *wb = &bdi->wb; 175 struct bdi_writeback *wb = &bdi->wb;
173 176
174 /* 177 if (wb->task)
175 * If we failed allocating the bdi work item, wake up the wb
176 * thread always. As a safety precaution, it'll flush out
177 * everything
178 */
179 if (!wb_has_dirty_io(wb)) {
180 if (work)
181 wb_clear_pending(wb, work);
182 } else if (wb->task)
183 wake_up_process(wb->task); 178 wake_up_process(wb->task);
184 } 179 }
185} 180}
@@ -194,48 +189,75 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
194 TASK_UNINTERRUPTIBLE); 189 TASK_UNINTERRUPTIBLE);
195} 190}
196 191
197static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc) 192static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
193 struct wb_writeback_args *args)
198{ 194{
199 struct bdi_work *work; 195 struct bdi_work *work;
200 196
197 /*
198 * This is WB_SYNC_NONE writeback, so if allocation fails just
199 * wakeup the thread for old dirty data writeback
200 */
201 work = kmalloc(sizeof(*work), GFP_ATOMIC); 201 work = kmalloc(sizeof(*work), GFP_ATOMIC);
202 if (work) 202 if (work) {
203 bdi_work_init(work, wbc); 203 bdi_work_init(work, args);
204 bdi_queue_work(bdi, work);
205 } else {
206 struct bdi_writeback *wb = &bdi->wb;
204 207
205 return work; 208 if (wb->task)
209 wake_up_process(wb->task);
210 }
206} 211}
207 212
208void bdi_start_writeback(struct writeback_control *wbc) 213/**
214 * bdi_sync_writeback - start and wait for writeback
215 * @bdi: the backing device to write from
216 * @sb: write inodes from this super_block
217 *
218 * Description:
219 * This does WB_SYNC_ALL data integrity writeback and waits for the
220 * IO to complete. Callers must hold the sb s_umount semaphore for
221 * reading, to avoid having the super disappear before we are done.
222 */
223static void bdi_sync_writeback(struct backing_dev_info *bdi,
224 struct super_block *sb)
209{ 225{
210 const bool must_wait = wbc->sync_mode == WB_SYNC_ALL; 226 struct wb_writeback_args args = {
211 struct bdi_work work_stack, *work = NULL; 227 .sb = sb,
228 .sync_mode = WB_SYNC_ALL,
229 .nr_pages = LONG_MAX,
230 .range_cyclic = 0,
231 };
232 struct bdi_work work;
212 233
213 if (!must_wait) 234 bdi_work_init(&work, &args);
214 work = bdi_alloc_work(wbc); 235 work.state |= WS_ONSTACK;
215 236
216 if (!work) { 237 bdi_queue_work(bdi, &work);
217 work = &work_stack; 238 bdi_wait_on_work_clear(&work);
218 bdi_work_init_on_stack(work, wbc); 239}
219 }
220 240
221 bdi_queue_work(wbc->bdi, work); 241/**
242 * bdi_start_writeback - start writeback
243 * @bdi: the backing device to write from
244 * @nr_pages: the number of pages to write
245 *
246 * Description:
247 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
248 * started when this function returns; we make no guarantees about
249 * completion. The caller need not hold the sb s_umount semaphore.
250 *
251 */
252void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
253{
254 struct wb_writeback_args args = {
255 .sync_mode = WB_SYNC_NONE,
256 .nr_pages = nr_pages,
257 .range_cyclic = 1,
258 };
222 259
223 /* 260 bdi_alloc_queue_work(bdi, &args);
224 * If the sync mode is WB_SYNC_ALL, block waiting for the work to
225 * complete. If not, we only need to wait for the work to be started,
226 * if we allocated it on-stack. We use the same mechanism, if the
227 * wait bit is set in the bdi_work struct, then threads will not
228 * clear pending until after they are done.
229 *
230 * Note that work == &work_stack if must_wait is true, so we don't
231 * need to do call_rcu() here ever, since the completion path will
232 * have done that for us.
233 */
234 if (must_wait || work == &work_stack) {
235 bdi_wait_on_work_clear(work);
236 if (work != &work_stack)
237 call_rcu(&work->rcu_head, bdi_work_free);
238 }
239} 261}
240 262
241/* 263/*
@@ -671,17 +693,16 @@ static inline bool over_bground_thresh(void)
671 * older_than_this takes precedence over nr_to_write. So we'll only write back 693 * older_than_this takes precedence over nr_to_write. So we'll only write back
672 * all dirty pages if they are all attached to "old" mappings. 694 * all dirty pages if they are all attached to "old" mappings.
673 */ 695 */
674static long wb_writeback(struct bdi_writeback *wb, long nr_pages, 696static long wb_writeback(struct bdi_writeback *wb,
675 struct super_block *sb, 697 struct wb_writeback_args *args)
676 enum writeback_sync_modes sync_mode, int for_kupdate)
677{ 698{
678 struct writeback_control wbc = { 699 struct writeback_control wbc = {
679 .bdi = wb->bdi, 700 .bdi = wb->bdi,
680 .sb = sb, 701 .sb = args->sb,
681 .sync_mode = sync_mode, 702 .sync_mode = args->sync_mode,
682 .older_than_this = NULL, 703 .older_than_this = NULL,
683 .for_kupdate = for_kupdate, 704 .for_kupdate = args->for_kupdate,
684 .range_cyclic = 1, 705 .range_cyclic = args->range_cyclic,
685 }; 706 };
686 unsigned long oldest_jif; 707 unsigned long oldest_jif;
687 long wrote = 0; 708 long wrote = 0;
@@ -691,13 +712,18 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
691 oldest_jif = jiffies - 712 oldest_jif = jiffies -
692 msecs_to_jiffies(dirty_expire_interval * 10); 713 msecs_to_jiffies(dirty_expire_interval * 10);
693 } 714 }
715 if (!wbc.range_cyclic) {
716 wbc.range_start = 0;
717 wbc.range_end = LLONG_MAX;
718 }
694 719
695 for (;;) { 720 for (;;) {
696 /* 721 /*
697 * Don't flush anything for non-integrity writeback where 722 * Don't flush anything for non-integrity writeback where
698 * no nr_pages was given 723 * no nr_pages was given
699 */ 724 */
700 if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE) 725 if (!args->for_kupdate && args->nr_pages <= 0 &&
726 args->sync_mode == WB_SYNC_NONE)
701 break; 727 break;
702 728
703 /* 729 /*
@@ -705,7 +731,8 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
705 * periodic background writeout and we are below the 731 * periodic background writeout and we are below the
706 * background dirty threshold, don't do anything 732 * background dirty threshold, don't do anything
707 */ 733 */
708 if (for_kupdate && nr_pages <= 0 && !over_bground_thresh()) 734 if (args->for_kupdate && args->nr_pages <= 0 &&
735 !over_bground_thresh())
709 break; 736 break;
710 737
711 wbc.more_io = 0; 738 wbc.more_io = 0;
@@ -713,7 +740,7 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
713 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 740 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
714 wbc.pages_skipped = 0; 741 wbc.pages_skipped = 0;
715 writeback_inodes_wb(wb, &wbc); 742 writeback_inodes_wb(wb, &wbc);
716 nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 743 args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
717 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 744 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
718 745
719 /* 746 /*
@@ -731,7 +758,11 @@ static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
731 758
732/* 759/*
733 * Return the next bdi_work struct that hasn't been processed by this 760 * Return the next bdi_work struct that hasn't been processed by this
734 * wb thread yet 761 * wb thread yet. ->seen is initially set for each thread that exists
762 * for this device, when a thread first notices a piece of work it
763 * clears its bit. Depending on writeback type, the thread will notify
764 * completion on either receiving the work (WB_SYNC_NONE) or after
765 * it is done (WB_SYNC_ALL).
735 */ 766 */
736static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, 767static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
737 struct bdi_writeback *wb) 768 struct bdi_writeback *wb)
@@ -741,8 +772,9 @@ static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
741 rcu_read_lock(); 772 rcu_read_lock();
742 773
743 list_for_each_entry_rcu(work, &bdi->work_list, list) { 774 list_for_each_entry_rcu(work, &bdi->work_list, list) {
744 if (!test_and_clear_bit(wb->nr, &work->seen)) 775 if (!test_bit(wb->nr, &work->seen))
745 continue; 776 continue;
777 clear_bit(wb->nr, &work->seen);
746 778
747 ret = work; 779 ret = work;
748 break; 780 break;
@@ -767,8 +799,16 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
767 global_page_state(NR_UNSTABLE_NFS) + 799 global_page_state(NR_UNSTABLE_NFS) +
768 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 800 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
769 801
770 if (nr_pages) 802 if (nr_pages) {
771 return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1); 803 struct wb_writeback_args args = {
804 .nr_pages = nr_pages,
805 .sync_mode = WB_SYNC_NONE,
806 .for_kupdate = 1,
807 .range_cyclic = 1,
808 };
809
810 return wb_writeback(wb, &args);
811 }
772 812
773 return 0; 813 return 0;
774} 814}
@@ -780,35 +820,31 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
780{ 820{
781 struct backing_dev_info *bdi = wb->bdi; 821 struct backing_dev_info *bdi = wb->bdi;
782 struct bdi_work *work; 822 struct bdi_work *work;
783 long nr_pages, wrote = 0; 823 long wrote = 0;
784 824
785 while ((work = get_next_work_item(bdi, wb)) != NULL) { 825 while ((work = get_next_work_item(bdi, wb)) != NULL) {
786 enum writeback_sync_modes sync_mode; 826 struct wb_writeback_args args = work->args;
787
788 nr_pages = work->nr_pages;
789 827
790 /* 828 /*
791 * Override sync mode, in case we must wait for completion 829 * Override sync mode, in case we must wait for completion
792 */ 830 */
793 if (force_wait) 831 if (force_wait)
794 work->sync_mode = sync_mode = WB_SYNC_ALL; 832 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
795 else
796 sync_mode = work->sync_mode;
797 833
798 /* 834 /*
799 * If this isn't a data integrity operation, just notify 835 * If this isn't a data integrity operation, just notify
800 * that we have seen this work and we are now starting it. 836 * that we have seen this work and we are now starting it.
801 */ 837 */
802 if (sync_mode == WB_SYNC_NONE) 838 if (args.sync_mode == WB_SYNC_NONE)
803 wb_clear_pending(wb, work); 839 wb_clear_pending(wb, work);
804 840
805 wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0); 841 wrote += wb_writeback(wb, &args);
806 842
807 /* 843 /*
808 * This is a data integrity writeback, so only do the 844 * This is a data integrity writeback, so only do the
809 * notification when we have completed the work. 845 * notification when we have completed the work.
810 */ 846 */
811 if (sync_mode == WB_SYNC_ALL) 847 if (args.sync_mode == WB_SYNC_ALL)
812 wb_clear_pending(wb, work); 848 wb_clear_pending(wb, work);
813 } 849 }
814 850
@@ -849,8 +885,7 @@ int bdi_writeback_task(struct bdi_writeback *wb)
849 } 885 }
850 886
851 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); 887 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
852 set_current_state(TASK_INTERRUPTIBLE); 888 schedule_timeout_interruptible(wait_jiffies);
853 schedule_timeout(wait_jiffies);
854 try_to_freeze(); 889 try_to_freeze();
855 } 890 }
856 891
@@ -858,67 +893,28 @@ int bdi_writeback_task(struct bdi_writeback *wb)
858} 893}
859 894
860/* 895/*
861 * Schedule writeback for all backing devices. Expensive! If this is a data 896 * Schedule writeback for all backing devices. This does WB_SYNC_NONE
862 * integrity operation, writeback will be complete when this returns. If 897 * writeback, for integrity writeback see bdi_sync_writeback().
863 * we are simply called for WB_SYNC_NONE, then writeback will merely be
864 * scheduled to run.
865 */ 898 */
866static void bdi_writeback_all(struct writeback_control *wbc) 899static void bdi_writeback_all(struct super_block *sb, long nr_pages)
867{ 900{
868 const bool must_wait = wbc->sync_mode == WB_SYNC_ALL; 901 struct wb_writeback_args args = {
902 .sb = sb,
903 .nr_pages = nr_pages,
904 .sync_mode = WB_SYNC_NONE,
905 };
869 struct backing_dev_info *bdi; 906 struct backing_dev_info *bdi;
870 struct bdi_work *work;
871 LIST_HEAD(list);
872
873restart:
874 spin_lock(&bdi_lock);
875 907
876 list_for_each_entry(bdi, &bdi_list, bdi_list) { 908 rcu_read_lock();
877 struct bdi_work *work;
878 909
910 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
879 if (!bdi_has_dirty_io(bdi)) 911 if (!bdi_has_dirty_io(bdi))
880 continue; 912 continue;
881 913
882 /* 914 bdi_alloc_queue_work(bdi, &args);
883 * If work allocation fails, do the writes inline. We drop
884 * the lock and restart the list writeout. This should be OK,
885 * since this happens rarely and because the writeout should
886 * eventually make more free memory available.
887 */
888 work = bdi_alloc_work(wbc);
889 if (!work) {
890 struct writeback_control __wbc;
891
892 /*
893 * Not a data integrity writeout, just continue
894 */
895 if (!must_wait)
896 continue;
897
898 spin_unlock(&bdi_lock);
899 __wbc = *wbc;
900 __wbc.bdi = bdi;
901 writeback_inodes_wbc(&__wbc);
902 goto restart;
903 }
904 if (must_wait)
905 list_add_tail(&work->wait_list, &list);
906
907 bdi_queue_work(bdi, work);
908 } 915 }
909 916
910 spin_unlock(&bdi_lock); 917 rcu_read_unlock();
911
912 /*
913 * If this is for WB_SYNC_ALL, wait for pending work to complete
914 * before returning.
915 */
916 while (!list_empty(&list)) {
917 work = list_entry(list.next, struct bdi_work, wait_list);
918 list_del(&work->wait_list);
919 bdi_wait_on_work_clear(work);
920 call_rcu(&work->rcu_head, bdi_work_free);
921 }
922} 918}
923 919
924/* 920/*
@@ -927,17 +923,10 @@ restart:
927 */ 923 */
928void wakeup_flusher_threads(long nr_pages) 924void wakeup_flusher_threads(long nr_pages)
929{ 925{
930 struct writeback_control wbc = {
931 .sync_mode = WB_SYNC_NONE,
932 .older_than_this = NULL,
933 .range_cyclic = 1,
934 };
935
936 if (nr_pages == 0) 926 if (nr_pages == 0)
937 nr_pages = global_page_state(NR_FILE_DIRTY) + 927 nr_pages = global_page_state(NR_FILE_DIRTY) +
938 global_page_state(NR_UNSTABLE_NFS); 928 global_page_state(NR_UNSTABLE_NFS);
939 wbc.nr_to_write = nr_pages; 929 bdi_writeback_all(NULL, nr_pages);
940 bdi_writeback_all(&wbc);
941} 930}
942 931
943static noinline void block_dump___mark_inode_dirty(struct inode *inode) 932static noinline void block_dump___mark_inode_dirty(struct inode *inode)
@@ -1084,7 +1073,7 @@ EXPORT_SYMBOL(__mark_inode_dirty);
1084 * on the writer throttling path, and we get decent balancing between many 1073 * on the writer throttling path, and we get decent balancing between many
1085 * throttled threads: we don't want them all piling up on inode_sync_wait. 1074 * throttled threads: we don't want them all piling up on inode_sync_wait.
1086 */ 1075 */
1087static void wait_sb_inodes(struct writeback_control *wbc) 1076static void wait_sb_inodes(struct super_block *sb)
1088{ 1077{
1089 struct inode *inode, *old_inode = NULL; 1078 struct inode *inode, *old_inode = NULL;
1090 1079
@@ -1092,7 +1081,7 @@ static void wait_sb_inodes(struct writeback_control *wbc)
1092 * We need to be protected against the filesystem going from 1081 * We need to be protected against the filesystem going from
1093 * r/o to r/w or vice versa. 1082 * r/o to r/w or vice versa.
1094 */ 1083 */
1095 WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount)); 1084 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1096 1085
1097 spin_lock(&inode_lock); 1086 spin_lock(&inode_lock);
1098 1087
@@ -1103,7 +1092,7 @@ static void wait_sb_inodes(struct writeback_control *wbc)
1103 * In which case, the inode may not be on the dirty list, but 1092 * In which case, the inode may not be on the dirty list, but
1104 * we still have to wait for that writeout. 1093 * we still have to wait for that writeout.
1105 */ 1094 */
1106 list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) { 1095 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1107 struct address_space *mapping; 1096 struct address_space *mapping;
1108 1097
1109 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 1098 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
@@ -1143,14 +1132,8 @@ static void wait_sb_inodes(struct writeback_control *wbc)
1143 * for IO completion of submitted IO. The number of pages submitted is 1132 * for IO completion of submitted IO. The number of pages submitted is
1144 * returned. 1133 * returned.
1145 */ 1134 */
1146long writeback_inodes_sb(struct super_block *sb) 1135void writeback_inodes_sb(struct super_block *sb)
1147{ 1136{
1148 struct writeback_control wbc = {
1149 .sb = sb,
1150 .sync_mode = WB_SYNC_NONE,
1151 .range_start = 0,
1152 .range_end = LLONG_MAX,
1153 };
1154 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1137 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1155 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); 1138 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1156 long nr_to_write; 1139 long nr_to_write;
@@ -1158,9 +1141,7 @@ long writeback_inodes_sb(struct super_block *sb)
1158 nr_to_write = nr_dirty + nr_unstable + 1141 nr_to_write = nr_dirty + nr_unstable +
1159 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1142 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1160 1143
1161 wbc.nr_to_write = nr_to_write; 1144 bdi_writeback_all(sb, nr_to_write);
1162 bdi_writeback_all(&wbc);
1163 return nr_to_write - wbc.nr_to_write;
1164} 1145}
1165EXPORT_SYMBOL(writeback_inodes_sb); 1146EXPORT_SYMBOL(writeback_inodes_sb);
1166 1147
@@ -1171,20 +1152,10 @@ EXPORT_SYMBOL(writeback_inodes_sb);
1171 * This function writes and waits on any dirty inode belonging to this 1152 * This function writes and waits on any dirty inode belonging to this
1172 * super_block. The number of pages synced is returned. 1153 * super_block. The number of pages synced is returned.
1173 */ 1154 */
1174long sync_inodes_sb(struct super_block *sb) 1155void sync_inodes_sb(struct super_block *sb)
1175{ 1156{
1176 struct writeback_control wbc = { 1157 bdi_sync_writeback(sb->s_bdi, sb);
1177 .sb = sb, 1158 wait_sb_inodes(sb);
1178 .sync_mode = WB_SYNC_ALL,
1179 .range_start = 0,
1180 .range_end = LLONG_MAX,
1181 };
1182 long nr_to_write = LONG_MAX; /* doesn't actually matter */
1183
1184 wbc.nr_to_write = nr_to_write;
1185 bdi_writeback_all(&wbc);
1186 wait_sb_inodes(&wbc);
1187 return nr_to_write - wbc.nr_to_write;
1188} 1159}
1189EXPORT_SYMBOL(sync_inodes_sb); 1160EXPORT_SYMBOL(sync_inodes_sb);
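After this refactor both helpers take only the super_block and return nothing; the page counts they used to report came from the writeback_control that no longer exists at this layer. A sketch of how a filesystem's sync path would call them now (illustrative, not taken from any one filesystem):

static int example_sync_fs(struct super_block *sb, int wait)
{
	if (wait)
		sync_inodes_sb(sb);		/* WB_SYNC_ALL: write and wait */
	else
		writeback_inodes_sb(sb);	/* WB_SYNC_NONE: kick off writeback */
	return 0;
}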
1190 1161
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 99c99dfb0373..3773fd63d2f9 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -61,6 +61,121 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
61 return simple_read_from_buffer(buf, len, ppos, tmp, size); 61 return simple_read_from_buffer(buf, len, ppos, tmp, size);
62} 62}
63 63
64static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf,
65 size_t len, loff_t *ppos, unsigned val)
66{
67 char tmp[32];
68 size_t size = sprintf(tmp, "%u\n", val);
69
70 return simple_read_from_buffer(buf, len, ppos, tmp, size);
71}
72
73static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,
74 size_t count, loff_t *ppos, unsigned *val,
75 unsigned global_limit)
76{
77 unsigned long t;
78 char tmp[32];
79 unsigned limit = (1 << 16) - 1;
80 int err;
81
82 if (*ppos || count >= sizeof(tmp) - 1)
83 return -EINVAL;
84
85 if (copy_from_user(tmp, buf, count))
86 return -EINVAL;
87
88 tmp[count] = '\0';
89
90 err = strict_strtoul(tmp, 0, &t);
91 if (err)
92 return err;
93
94 if (!capable(CAP_SYS_ADMIN))
95 limit = min(limit, global_limit);
96
97 if (t > limit)
98 return -EINVAL;
99
100 *val = t;
101
102 return count;
103}
104
105static ssize_t fuse_conn_max_background_read(struct file *file,
106 char __user *buf, size_t len,
107 loff_t *ppos)
108{
109 struct fuse_conn *fc;
110 unsigned val;
111
112 fc = fuse_ctl_file_conn_get(file);
113 if (!fc)
114 return 0;
115
116 val = fc->max_background;
117 fuse_conn_put(fc);
118
119 return fuse_conn_limit_read(file, buf, len, ppos, val);
120}
121
122static ssize_t fuse_conn_max_background_write(struct file *file,
123 const char __user *buf,
124 size_t count, loff_t *ppos)
125{
126 unsigned val;
127 ssize_t ret;
128
129 ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
130 max_user_bgreq);
131 if (ret > 0) {
132 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
133 if (fc) {
134 fc->max_background = val;
135 fuse_conn_put(fc);
136 }
137 }
138
139 return ret;
140}
141
142static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
143 char __user *buf, size_t len,
144 loff_t *ppos)
145{
146 struct fuse_conn *fc;
147 unsigned val;
148
149 fc = fuse_ctl_file_conn_get(file);
150 if (!fc)
151 return 0;
152
153 val = fc->congestion_threshold;
154 fuse_conn_put(fc);
155
156 return fuse_conn_limit_read(file, buf, len, ppos, val);
157}
158
159static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
160 const char __user *buf,
161 size_t count, loff_t *ppos)
162{
163 unsigned val;
164 ssize_t ret;
165
166 ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
167 max_user_congthresh);
168 if (ret > 0) {
169 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
170 if (fc) {
171 fc->congestion_threshold = val;
172 fuse_conn_put(fc);
173 }
174 }
175
176 return ret;
177}
178
64static const struct file_operations fuse_ctl_abort_ops = { 179static const struct file_operations fuse_ctl_abort_ops = {
65 .open = nonseekable_open, 180 .open = nonseekable_open,
66 .write = fuse_conn_abort_write, 181 .write = fuse_conn_abort_write,
@@ -71,6 +186,18 @@ static const struct file_operations fuse_ctl_waiting_ops = {
71 .read = fuse_conn_waiting_read, 186 .read = fuse_conn_waiting_read,
72}; 187};
73 188
189static const struct file_operations fuse_conn_max_background_ops = {
190 .open = nonseekable_open,
191 .read = fuse_conn_max_background_read,
192 .write = fuse_conn_max_background_write,
193};
194
195static const struct file_operations fuse_conn_congestion_threshold_ops = {
196 .open = nonseekable_open,
197 .read = fuse_conn_congestion_threshold_read,
198 .write = fuse_conn_congestion_threshold_write,
199};
200
74static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, 201static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
75 struct fuse_conn *fc, 202 struct fuse_conn *fc,
76 const char *name, 203 const char *name,
@@ -127,9 +254,14 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
127 goto err; 254 goto err;
128 255
129 if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, 256 if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
130 NULL, &fuse_ctl_waiting_ops) || 257 NULL, &fuse_ctl_waiting_ops) ||
131 !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, 258 !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
132 NULL, &fuse_ctl_abort_ops)) 259 NULL, &fuse_ctl_abort_ops) ||
260 !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
261 1, NULL, &fuse_conn_max_background_ops) ||
262 !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
263 S_IFREG | 0600, 1, NULL,
264 &fuse_conn_congestion_threshold_ops))
133 goto err; 265 goto err;
134 266
135 return 0; 267 return 0;
@@ -156,7 +288,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
156 d_drop(dentry); 288 d_drop(dentry);
157 dput(dentry); 289 dput(dentry);
158 } 290 }
159 fuse_control_sb->s_root->d_inode->i_nlink--; 291 drop_nlink(fuse_control_sb->s_root->d_inode);
160} 292}
161 293
162static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) 294static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
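With the two new control files in place, the limits become tunable per connection at runtime. A user-space sketch, assuming the fusectl filesystem is mounted at /sys/fs/fuse/connections and that the caller already knows its connection directory (both are assumptions about a particular system):

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

/* Write 'val' into one of the per-connection control files. */
static int fuse_set_limit(const char *conn_dir, const char *file,
			  const char *val)
{
	char path[256];
	int fd, ret;

	snprintf(path, sizeof(path), "%s/%s", conn_dir, file);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ret = (write(fd, val, strlen(val)) < 0) ? -1 : 0;
	close(fd);
	return ret;
}

/* e.g. fuse_set_limit("/sys/fs/fuse/connections/20", "max_background", "64"); */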
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6484eb75acd6..51d9e33d634f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -250,7 +250,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
250 250
251static void flush_bg_queue(struct fuse_conn *fc) 251static void flush_bg_queue(struct fuse_conn *fc)
252{ 252{
253 while (fc->active_background < FUSE_MAX_BACKGROUND && 253 while (fc->active_background < fc->max_background &&
254 !list_empty(&fc->bg_queue)) { 254 !list_empty(&fc->bg_queue)) {
255 struct fuse_req *req; 255 struct fuse_req *req;
256 256
@@ -280,11 +280,11 @@ __releases(&fc->lock)
280 list_del(&req->intr_entry); 280 list_del(&req->intr_entry);
281 req->state = FUSE_REQ_FINISHED; 281 req->state = FUSE_REQ_FINISHED;
282 if (req->background) { 282 if (req->background) {
283 if (fc->num_background == FUSE_MAX_BACKGROUND) { 283 if (fc->num_background == fc->max_background) {
284 fc->blocked = 0; 284 fc->blocked = 0;
285 wake_up_all(&fc->blocked_waitq); 285 wake_up_all(&fc->blocked_waitq);
286 } 286 }
287 if (fc->num_background == FUSE_CONGESTION_THRESHOLD && 287 if (fc->num_background == fc->congestion_threshold &&
288 fc->connected && fc->bdi_initialized) { 288 fc->connected && fc->bdi_initialized) {
289 clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); 289 clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
290 clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); 290 clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
@@ -410,9 +410,9 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
410{ 410{
411 req->background = 1; 411 req->background = 1;
412 fc->num_background++; 412 fc->num_background++;
413 if (fc->num_background == FUSE_MAX_BACKGROUND) 413 if (fc->num_background == fc->max_background)
414 fc->blocked = 1; 414 fc->blocked = 1;
415 if (fc->num_background == FUSE_CONGESTION_THRESHOLD && 415 if (fc->num_background == fc->congestion_threshold &&
416 fc->bdi_initialized) { 416 fc->bdi_initialized) {
417 set_bdi_congested(&fc->bdi, BLK_RW_SYNC); 417 set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
418 set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); 418 set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 52b641fc0faf..fc9c79feb5f7 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -25,12 +25,6 @@
25/** Max number of pages that can be used in a single read request */ 25/** Max number of pages that can be used in a single read request */
26#define FUSE_MAX_PAGES_PER_REQ 32 26#define FUSE_MAX_PAGES_PER_REQ 32
27 27
28/** Maximum number of outstanding background requests */
29#define FUSE_MAX_BACKGROUND 12
30
31/** Congestion starts at 75% of maximum */
32#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100)
33
34/** Bias for fi->writectr, meaning new writepages must not be sent */ 28/** Bias for fi->writectr, meaning new writepages must not be sent */
35#define FUSE_NOWRITE INT_MIN 29#define FUSE_NOWRITE INT_MIN
36 30
@@ -38,7 +32,7 @@
38#define FUSE_NAME_MAX 1024 32#define FUSE_NAME_MAX 1024
39 33
40/** Number of dentries for each connection in the control filesystem */ 34/** Number of dentries for each connection in the control filesystem */
41#define FUSE_CTL_NUM_DENTRIES 3 35#define FUSE_CTL_NUM_DENTRIES 5
42 36
43/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem 37/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
44 module will check permissions based on the file mode. Otherwise no 38 module will check permissions based on the file mode. Otherwise no
@@ -55,6 +49,10 @@ extern struct list_head fuse_conn_list;
55/** Global mutex protecting fuse_conn_list and the control filesystem */ 49/** Global mutex protecting fuse_conn_list and the control filesystem */
56extern struct mutex fuse_mutex; 50extern struct mutex fuse_mutex;
57 51
52/** Module parameters */
53extern unsigned max_user_bgreq;
54extern unsigned max_user_congthresh;
55
58/** FUSE inode */ 56/** FUSE inode */
59struct fuse_inode { 57struct fuse_inode {
60 /** Inode data */ 58 /** Inode data */
@@ -349,6 +347,12 @@ struct fuse_conn {
349 /** rbtree of fuse_files waiting for poll events indexed by ph */ 347 /** rbtree of fuse_files waiting for poll events indexed by ph */
350 struct rb_root polled_files; 348 struct rb_root polled_files;
351 349
350 /** Maximum number of outstanding background requests */
351 unsigned max_background;
352
353 /** Number of background requests at which congestion starts */
354 unsigned congestion_threshold;
355
352 /** Number of requests currently in the background */ 356 /** Number of requests currently in the background */
353 unsigned num_background; 357 unsigned num_background;
354 358
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4567db6f9430..6da947daabda 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -14,6 +14,7 @@
14#include <linux/seq_file.h> 14#include <linux/seq_file.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/moduleparam.h>
17#include <linux/parser.h> 18#include <linux/parser.h>
18#include <linux/statfs.h> 19#include <linux/statfs.h>
19#include <linux/random.h> 20#include <linux/random.h>
@@ -28,10 +29,34 @@ static struct kmem_cache *fuse_inode_cachep;
28struct list_head fuse_conn_list; 29struct list_head fuse_conn_list;
29DEFINE_MUTEX(fuse_mutex); 30DEFINE_MUTEX(fuse_mutex);
30 31
32static int set_global_limit(const char *val, struct kernel_param *kp);
33
34unsigned max_user_bgreq;
35module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
36 &max_user_bgreq, 0644);
37__MODULE_PARM_TYPE(max_user_bgreq, "uint");
38MODULE_PARM_DESC(max_user_bgreq,
39 "Global limit for the maximum number of backgrounded requests an "
40 "unprivileged user can set");
41
42unsigned max_user_congthresh;
43module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
44 &max_user_congthresh, 0644);
45__MODULE_PARM_TYPE(max_user_congthresh, "uint");
46MODULE_PARM_DESC(max_user_congthresh,
47 "Global limit for the maximum congestion threshold an "
48 "unprivileged user can set");
49
31#define FUSE_SUPER_MAGIC 0x65735546 50#define FUSE_SUPER_MAGIC 0x65735546
32 51
33#define FUSE_DEFAULT_BLKSIZE 512 52#define FUSE_DEFAULT_BLKSIZE 512
34 53
54/** Maximum number of outstanding background requests */
55#define FUSE_DEFAULT_MAX_BACKGROUND 12
56
57/** Congestion starts at 75% of maximum */
58#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
59
35struct fuse_mount_data { 60struct fuse_mount_data {
36 int fd; 61 int fd;
37 unsigned rootmode; 62 unsigned rootmode;
@@ -517,6 +542,8 @@ void fuse_conn_init(struct fuse_conn *fc)
517 INIT_LIST_HEAD(&fc->bg_queue); 542 INIT_LIST_HEAD(&fc->bg_queue);
518 INIT_LIST_HEAD(&fc->entry); 543 INIT_LIST_HEAD(&fc->entry);
519 atomic_set(&fc->num_waiting, 0); 544 atomic_set(&fc->num_waiting, 0);
545 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
546 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
520 fc->khctr = 0; 547 fc->khctr = 0;
521 fc->polled_files = RB_ROOT; 548 fc->polled_files = RB_ROOT;
522 fc->reqctr = 0; 549 fc->reqctr = 0;
@@ -727,6 +754,54 @@ static const struct super_operations fuse_super_operations = {
727 .show_options = fuse_show_options, 754 .show_options = fuse_show_options,
728}; 755};
729 756
757static void sanitize_global_limit(unsigned *limit)
758{
759 if (*limit == 0)
760 *limit = ((num_physpages << PAGE_SHIFT) >> 13) /
761 sizeof(struct fuse_req);
762
763 if (*limit >= 1 << 16)
764 *limit = (1 << 16) - 1;
765}
766
767static int set_global_limit(const char *val, struct kernel_param *kp)
768{
769 int rv;
770
771 rv = param_set_uint(val, kp);
772 if (rv)
773 return rv;
774
775 sanitize_global_limit((unsigned *)kp->arg);
776
777 return 0;
778}
779
780static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
781{
782 int cap_sys_admin = capable(CAP_SYS_ADMIN);
783
784 if (arg->minor < 13)
785 return;
786
787 sanitize_global_limit(&max_user_bgreq);
788 sanitize_global_limit(&max_user_congthresh);
789
790 if (arg->max_background) {
791 fc->max_background = arg->max_background;
792
793 if (!cap_sys_admin && fc->max_background > max_user_bgreq)
794 fc->max_background = max_user_bgreq;
795 }
796 if (arg->congestion_threshold) {
797 fc->congestion_threshold = arg->congestion_threshold;
798
799 if (!cap_sys_admin &&
800 fc->congestion_threshold > max_user_congthresh)
801 fc->congestion_threshold = max_user_congthresh;
802 }
803}
804
730static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) 805static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
731{ 806{
732 struct fuse_init_out *arg = &req->misc.init_out; 807 struct fuse_init_out *arg = &req->misc.init_out;
@@ -736,6 +811,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
736 else { 811 else {
737 unsigned long ra_pages; 812 unsigned long ra_pages;
738 813
814 process_init_limits(fc, arg);
815
739 if (arg->minor >= 6) { 816 if (arg->minor >= 6) {
740 ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; 817 ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
741 if (arg->flags & FUSE_ASYNC_READ) 818 if (arg->flags & FUSE_ASYNC_READ)
@@ -894,6 +971,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
894 if (err) 971 if (err)
895 goto err_put_conn; 972 goto err_put_conn;
896 973
974 sb->s_bdi = &fc->bdi;
975
897 /* Handle umasking inside the fuse code */ 976 /* Handle umasking inside the fuse code */
898 if (sb->s_flags & MS_POSIXACL) 977 if (sb->s_flags & MS_POSIXACL)
899 fc->dont_mask = 1; 978 fc->dont_mask = 1;
@@ -1148,6 +1227,9 @@ static int __init fuse_init(void)
1148 if (res) 1227 if (res)
1149 goto err_sysfs_cleanup; 1228 goto err_sysfs_cleanup;
1150 1229
1230 sanitize_global_limit(&max_user_bgreq);
1231 sanitize_global_limit(&max_user_congthresh);
1232
1151 return 0; 1233 return 0;
1152 1234
1153 err_sysfs_cleanup: 1235 err_sysfs_cleanup:
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index c3ac18054057..247436c10deb 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/buffer_head.h> 13#include <linux/buffer_head.h>
14#include <linux/namei.h> 14#include <linux/namei.h>
15#include <linux/utsname.h>
16#include <linux/mm.h> 15#include <linux/mm.h>
17#include <linux/xattr.h> 16#include <linux/xattr.h>
18#include <linux/posix_acl.h> 17#include <linux/posix_acl.h>
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 28c590b7c9da..8f1cfb02a6cb 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -179,7 +179,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
179 * always aligned to a 64 bit boundary. 179 * always aligned to a 64 bit boundary.
180 * 180 *
181 * The size of the buffer is in bytes, but it is assumed that it is 181 * The size of the buffer is in bytes, but it is assumed that it is
182 * always ok to to read a complete multiple of 64 bits at the end 182 * always ok to read a complete multiple of 64 bits at the end
183 * of the block in case the end is not aligned to a natural boundary. 183 * of the block in case the end is not aligned to a natural boundary.
184 * 184 *
185 * Return: the block number (bitmap buffer scope) that was found 185 * Return: the block number (bitmap buffer scope) that was found
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a93b885311d8..133335479c24 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,12 +31,10 @@
31#include <linux/statfs.h> 31#include <linux/statfs.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/ima.h> 33#include <linux/ima.h>
34#include <linux/magic.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36 37
37/* some random number */
38#define HUGETLBFS_MAGIC 0x958458f6
39
40static const struct super_operations hugetlbfs_ops; 38static const struct super_operations hugetlbfs_ops;
41static const struct address_space_operations hugetlbfs_aops; 39static const struct address_space_operations hugetlbfs_aops;
42const struct file_operations hugetlbfs_file_operations; 40const struct file_operations hugetlbfs_file_operations;
@@ -507,6 +505,13 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
507 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 505 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
508 INIT_LIST_HEAD(&inode->i_mapping->private_list); 506 INIT_LIST_HEAD(&inode->i_mapping->private_list);
509 info = HUGETLBFS_I(inode); 507 info = HUGETLBFS_I(inode);
508 /*
509 * The policy is initialized here even if we are creating a
510 * private inode, because initialization simply creates an
511 * empty rb tree and calls spin_lock_init(); later, when we
512 * call mpol_free_shared_policy(), it will just return because
513 * the rb tree will still be empty.
514 */
510 mpol_shared_policy_init(&info->policy, NULL); 515 mpol_shared_policy_init(&info->policy, NULL);
511 switch (mode & S_IFMT) { 516 switch (mode & S_IFMT) {
512 default: 517 default:
@@ -937,7 +942,7 @@ static int can_do_hugetlb_shm(void)
937} 942}
938 943
939struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, 944struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
940 struct user_struct **user) 945 struct user_struct **user, int creat_flags)
941{ 946{
942 int error = -ENOMEM; 947 int error = -ENOMEM;
943 struct file *file; 948 struct file *file;
@@ -949,7 +954,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
949 if (!hugetlbfs_vfsmount) 954 if (!hugetlbfs_vfsmount)
950 return ERR_PTR(-ENOENT); 955 return ERR_PTR(-ENOENT);
951 956
952 if (!can_do_hugetlb_shm()) { 957 if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
953 *user = current_user(); 958 *user = current_user();
954 if (user_shm_lock(size, *user)) { 959 if (user_shm_lock(size, *user)) {
955 WARN_ONCE(1, 960 WARN_ONCE(1,
diff --git a/fs/inode.c b/fs/inode.c
index ae7b67e48661..76582b06ab97 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
 #include <linux/wait.h>
+#include <linux/rwsem.h>
 #include <linux/hash.h>
 #include <linux/swap.h>
 #include <linux/security.h>
@@ -87,14 +88,18 @@ static struct hlist_head *inode_hashtable __read_mostly;
 DEFINE_SPINLOCK(inode_lock);
 
 /*
- * iprune_mutex provides exclusion between the kswapd or try_to_free_pages
+ * iprune_sem provides exclusion between the kswapd or try_to_free_pages
  * icache shrinking path, and the umount path.  Without this exclusion,
  * by the time prune_icache calls iput for the inode whose pages it has
  * been invalidating, or by the time it calls clear_inode & destroy_inode
  * from its final dispose_list, the struct super_block they refer to
  * (for inode->i_sb->s_op) may already have been freed and reused.
+ *
+ * We make this an rwsem because the fastpath is icache shrinking. In
+ * some cases a filesystem may be doing a significant amount of work in
+ * its inode reclaim code, so this should improve parallelism.
  */
-static DEFINE_MUTEX(iprune_mutex);
+static DECLARE_RWSEM(iprune_sem);
 
 /*
 * Statistics gathering..
@@ -123,7 +128,7 @@ static void wake_up_inode(struct inode *inode)
 int inode_init_always(struct super_block *sb, struct inode *inode)
 {
 	static const struct address_space_operations empty_aops;
-	static struct inode_operations empty_iops;
+	static const struct inode_operations empty_iops;
 	static const struct file_operations empty_fops;
 	struct address_space *const mapping = &inode->i_data;
 
@@ -182,9 +187,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	if (sb->s_bdev) {
 		struct backing_dev_info *bdi;
 
-		bdi = sb->s_bdev->bd_inode_backing_dev_info;
-		if (!bdi)
-			bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+		bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
 		mapping->backing_dev_info = bdi;
 	}
 	inode->i_private = NULL;
@@ -383,7 +386,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 		/*
 		 * We can reschedule here without worrying about the list's
 		 * consistency because the per-sb list of inodes must not
-		 * change during umount anymore, and because iprune_mutex keeps
+		 * change during umount anymore, and because iprune_sem keeps
 		 * shrink_icache_memory() away.
 		 */
 		cond_resched_lock(&inode_lock);
@@ -422,7 +425,7 @@ int invalidate_inodes(struct super_block *sb)
 	int busy;
 	LIST_HEAD(throw_away);
 
-	mutex_lock(&iprune_mutex);
+	down_write(&iprune_sem);
 	spin_lock(&inode_lock);
 	inotify_unmount_inodes(&sb->s_inodes);
 	fsnotify_unmount_inodes(&sb->s_inodes);
@@ -430,7 +433,7 @@ int invalidate_inodes(struct super_block *sb)
 	spin_unlock(&inode_lock);
 
 	dispose_list(&throw_away);
-	mutex_unlock(&iprune_mutex);
+	up_write(&iprune_sem);
 
 	return busy;
 }
@@ -469,7 +472,7 @@ static void prune_icache(int nr_to_scan)
 	int nr_scanned;
 	unsigned long reap = 0;
 
-	mutex_lock(&iprune_mutex);
+	down_read(&iprune_sem);
 	spin_lock(&inode_lock);
 	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
 		struct inode *inode;
@@ -511,7 +514,7 @@ static void prune_icache(int nr_to_scan)
 	spin_unlock(&inode_lock);
 
 	dispose_list(&freeable);
-	mutex_unlock(&iprune_mutex);
+	up_read(&iprune_sem);
 }
 
 /*
@@ -697,13 +700,15 @@ void unlock_new_inode(struct inode *inode)
 	}
 #endif
 	/*
-	 * This is special!  We do not need the spinlock
-	 * when clearing I_LOCK, because we're guaranteed
-	 * that nobody else tries to do anything about the
-	 * state of the inode when it is locked, as we
-	 * just created it (so there can be no old holders
-	 * that haven't tested I_LOCK).
+	 * This is special!  We do not need the spinlock when clearing I_LOCK,
+	 * because we're guaranteed that nobody else tries to do anything about
+	 * the state of the inode when it is locked, as we just created it (so
+	 * there can be no old holders that haven't tested I_LOCK).
+	 * However we must emit the memory barrier so that other CPUs reliably
+	 * see the clearing of I_LOCK after the other inode initialisation has
+	 * completed.
	 */
+	smp_mb();
 	WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
 	inode->i_state &= ~(I_LOCK|I_NEW);
 	wake_up_inode(inode);
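The iprune conversion above is the standard mutex-to-rwsem pattern: the hot shrinker path becomes a shared (read) lock so multiple reclaim threads can run concurrently, while umount keeps full exclusion through the write side. A minimal sketch of the pattern with hypothetical names (not the actual fs/inode.c code):

#include <linux/rwsem.h>

static DECLARE_RWSEM(example_sem);	/* replaces a DEFINE_MUTEX() */

static void example_shrink(void)	/* hot path: may run in parallel */
{
	down_read(&example_sem);
	/* reclaim objects; excluded only against teardown */
	up_read(&example_sem);
}

static void example_umount(void)	/* cold path: excludes all shrinkers */
{
	down_write(&example_sem);
	/* tear down structures no shrinker may touch concurrently */
	up_write(&example_sem);
}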
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 61f32f3868cd..b0435dd0654d 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -456,7 +456,7 @@ int cleanup_journal_tail(journal_t *journal)
 {
 	transaction_t * transaction;
 	tid_t		first_tid;
-	unsigned long	blocknr, freed;
+	unsigned int	blocknr, freed;
 
 	if (is_journal_aborted(journal))
 		return 1;
@@ -502,8 +502,8 @@ int cleanup_journal_tail(journal_t *journal)
 	freed = freed + journal->j_last - journal->j_first;
 
 	jbd_debug(1,
-		  "Cleaning journal tail from %d to %d (offset %lu), "
-		  "freeing %lu\n",
+		  "Cleaning journal tail from %d to %d (offset %u), "
+		  "freeing %u\n",
 		  journal->j_tail_sequence, first_tid, blocknr, freed);
 
 	journal->j_free += freed;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 618e21c0b7a3..4bd882548c45 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal)
 	int bufs;
 	int flags;
 	int err;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	ktime_t start_time;
 	u64 commit_time;
 	char *tagp = NULL;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f96f85092d1c..bd3c073b485d 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -276,7 +276,7 @@ static void journal_kill_thread(journal_t *journal)
 int journal_write_metadata_buffer(transaction_t *transaction,
 				  struct journal_head *jh_in,
 				  struct journal_head **jh_out,
-				  unsigned long blocknr)
+				  unsigned int blocknr)
 {
 	int need_copy_out = 0;
 	int done_copy_out = 0;
@@ -567,9 +567,9 @@ int log_wait_commit(journal_t *journal, tid_t tid)
 * Log buffer allocation routines:
 */
 
-int journal_next_log_block(journal_t *journal, unsigned long *retp)
+int journal_next_log_block(journal_t *journal, unsigned int *retp)
 {
-	unsigned long blocknr;
+	unsigned int blocknr;
 
 	spin_lock(&journal->j_state_lock);
 	J_ASSERT(journal->j_free > 1);
@@ -590,11 +590,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp)
 * this is a no-op. If needed, we can use j_blk_offset - everything is
 * ready.
 */
-int journal_bmap(journal_t *journal, unsigned long blocknr,
-		 unsigned long *retp)
+int journal_bmap(journal_t *journal, unsigned int blocknr,
+		 unsigned int *retp)
 {
 	int err = 0;
-	unsigned long ret;
+	unsigned int ret;
 
 	if (journal->j_inode) {
 		ret = bmap(journal->j_inode, blocknr);
@@ -604,7 +604,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
 		char b[BDEVNAME_SIZE];
 
 		printk(KERN_ALERT "%s: journal block not found "
-			"at offset %lu on %s\n",
+			"at offset %u on %s\n",
 			__func__,
 			blocknr,
 			bdevname(journal->j_dev, b));
@@ -630,7 +630,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
 struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
 {
 	struct buffer_head *bh;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	int err;
 
 	err = journal_next_log_block(journal, &blocknr);
@@ -774,7 +774,7 @@ journal_t * journal_init_inode (struct inode *inode)
 	journal_t *journal = journal_init_common();
 	int err;
 	int n;
-	unsigned long blocknr;
+	unsigned int blocknr;
 
 	if (!journal)
 		return NULL;
@@ -846,12 +846,12 @@ static void journal_fail_superblock (journal_t *journal)
 static int journal_reset(journal_t *journal)
 {
 	journal_superblock_t *sb = journal->j_superblock;
-	unsigned long first, last;
+	unsigned int first, last;
 
 	first = be32_to_cpu(sb->s_first);
 	last = be32_to_cpu(sb->s_maxlen);
 	if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
-		printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n",
+		printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
 		       first, last);
 		journal_fail_superblock(journal);
 		return -EINVAL;
@@ -885,7 +885,7 @@ static int journal_reset(journal_t *journal)
 **/
 int journal_create(journal_t *journal)
 {
-	unsigned long blocknr;
+	unsigned int blocknr;
 	struct buffer_head *bh;
 	journal_superblock_t *sb;
 	int i, err;
@@ -969,14 +969,14 @@ void journal_update_superblock(journal_t *journal, int wait)
 	if (sb->s_start == 0 && journal->j_tail_sequence ==
 				journal->j_transaction_sequence) {
 		jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
-			"(start %ld, seq %d, errno %d)\n",
+			"(start %u, seq %d, errno %d)\n",
 			journal->j_tail, journal->j_tail_sequence,
 			journal->j_errno);
 		goto out;
 	}
 
 	spin_lock(&journal->j_state_lock);
-	jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+	jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
 		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
 
 	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1371,7 +1371,7 @@ int journal_flush(journal_t *journal)
 {
 	int err = 0;
 	transaction_t *transaction = NULL;
-	unsigned long old_tail;
+	unsigned int old_tail;
 
 	spin_lock(&journal->j_state_lock);
 
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index db5e982c5ddf..cb1a49ae605e 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
 {
 	int err;
 	unsigned int max, nbufs, next;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	struct buffer_head *bh;
 
 	struct buffer_head * bufs[MAXBUF];
@@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
 		 unsigned int offset)
 {
 	int err;
-	unsigned long blocknr;
+	unsigned int blocknr;
 	struct buffer_head *bh;
 
 	*bhp = NULL;
@@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal,
 			struct recovery_info *info, enum passtype pass)
 {
 	unsigned int		first_commit_ID, next_commit_ID;
-	unsigned long		next_log_block;
+	unsigned int		next_log_block;
 	int			err, success = 0;
 	journal_superblock_t *	sb;
 	journal_header_t *	tmp;
@@ -367,14 +367,14 @@ static int do_one_pass(journal_t *journal,
 		if (tid_geq(next_commit_ID, info->end_transaction))
 			break;
 
-		jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+		jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
 			  next_commit_ID, next_log_block, journal->j_last);
 
 		/* Skip over each chunk of the transaction looking
 		 * either the next descriptor block or the final commit
 		 * record. */
 
-		jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+		jbd_debug(3, "JBD: checking block %u\n", next_log_block);
 		err = jread(&bh, journal, next_log_block);
 		if (err)
 			goto failed;
@@ -429,7 +429,7 @@ static int do_one_pass(journal_t *journal,
 			tagp = &bh->b_data[sizeof(journal_header_t)];
 			while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
 			       <= journal->j_blocksize) {
-				unsigned long io_block;
+				unsigned int io_block;
 
 				tag = (journal_block_tag_t *) tagp;
 				flags = be32_to_cpu(tag->t_flags);
@@ -443,10 +443,10 @@ static int do_one_pass(journal_t *journal,
 					success = err;
 					printk (KERN_ERR
 						"JBD: IO error %d recovering "
-						"block %ld in log\n",
+						"block %u in log\n",
 						err, io_block);
 				} else {
-					unsigned long blocknr;
+					unsigned int blocknr;
 
 					J_ASSERT(obh != NULL);
 					blocknr = be32_to_cpu(tag->t_blocknr);
@@ -581,7 +581,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
 	max = be32_to_cpu(header->r_count);
 
 	while (offset < max) {
-		unsigned long blocknr;
+		unsigned int blocknr;
 		int err;
 
 		blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index da6cd9bdaabc..ad717328343a 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -101,7 +101,7 @@ struct jbd_revoke_record_s
 {
 	struct list_head  hash;
 	tid_t		  sequence;	/* Used for recovery only */
-	unsigned long	  blocknr;
+	unsigned int	  blocknr;
 };
 
 
@@ -126,7 +126,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
 /* Utility functions to maintain the revoke table */
 
 /* Borrowed from buffer.c: this is a tried and tested block hash function */
-static inline int hash(journal_t *journal, unsigned long block)
+static inline int hash(journal_t *journal, unsigned int block)
 {
 	struct jbd_revoke_table_s *table = journal->j_revoke;
 	int hash_shift = table->hash_shift;
@@ -136,7 +136,7 @@ static inline int hash(journal_t *journal, unsigned long block)
 		(block << (hash_shift - 12))) & (table->hash_size - 1);
 }
 
-static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
+static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
 			      tid_t seq)
 {
 	struct list_head *hash_list;
@@ -166,7 +166,7 @@ oom:
 /* Find a revoke record in the journal's hash table. */
 
 static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
-						      unsigned long blocknr)
+						      unsigned int blocknr)
 {
 	struct list_head *hash_list;
 	struct jbd_revoke_record_s *record;
@@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal)
 * by one.
 */
 
-int journal_revoke(handle_t *handle, unsigned long blocknr,
+int journal_revoke(handle_t *handle, unsigned int blocknr,
 		   struct buffer_head *bh_in)
 {
 	struct buffer_head *bh = NULL;
@@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
 		}
 	}
 
-	jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
+	jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
 	err = insert_revoke_hash(journal, blocknr,
 				 handle->h_transaction->t_tid);
 	BUFFER_TRACE(bh_in, "exit");
@@ -644,7 +644,7 @@ static void flush_descriptor(journal_t *journal,
 */
 
 int journal_set_revoke(journal_t *journal,
-		       unsigned long blocknr,
+		       unsigned int blocknr,
 		       tid_t sequence)
 {
 	struct jbd_revoke_record_s *record;
@@ -668,7 +668,7 @@ int journal_set_revoke(journal_t *journal,
 */
 
 int journal_test_revoke(journal_t *journal,
-			unsigned long blocknr,
+			unsigned int blocknr,
 			tid_t sequence)
 {
 	struct jbd_revoke_record_s *record;
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c03ac11f74be..006f9ad838a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -56,7 +56,8 @@ get_transaction(journal_t *journal, transaction_t *transaction)
 	spin_lock_init(&transaction->t_handle_lock);
 
 	/* Set up the commit timer for the new transaction. */
-	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+	journal->j_commit_timer.expires =
+		round_jiffies_up(transaction->t_expires);
 	add_timer(&journal->j_commit_timer);
 
 	J_ASSERT(journal->j_running_transaction == NULL);
@@ -228,6 +229,8 @@ repeat_locked:
 			  __log_space_left(journal));
 	spin_unlock(&transaction->t_handle_lock);
 	spin_unlock(&journal->j_state_lock);
+
+	lock_map_acquire(&handle->h_lockdep_map);
 out:
 	if (unlikely(new_transaction))		/* It's usually NULL */
 		kfree(new_transaction);
@@ -292,9 +295,6 @@ handle_t *journal_start(journal_t *journal, int nblocks)
 		handle = ERR_PTR(err);
 		goto out;
 	}
-
-	lock_map_acquire(&handle->h_lockdep_map);
-
 out:
 	return handle;
 }
@@ -416,6 +416,7 @@ int journal_restart(handle_t *handle, int nblocks)
 		__log_start_commit(journal, transaction->t_tid);
 	spin_unlock(&journal->j_state_lock);
 
+	lock_map_release(&handle->h_lockdep_map);
 	handle->h_buffer_credits = nblocks;
 	ret = start_this_handle(journal, handle);
 	return ret;
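Moving lock_map_acquire() from journal_start() into start_this_handle(), and pairing it with a lock_map_release() in journal_restart(), keeps the lockdep annotation balanced on every path that blocks for journal credits, including restarts. A rough sketch of the resulting discipline, assuming jbd's handle_t with its h_lockdep_map field:

/* Sketch only; simplified from the jbd pattern above. */
static int example_start_this_handle(handle_t *handle)
{
	/* ... wait for and reserve buffer credits ... */
	lock_map_acquire(&handle->h_lockdep_map);	/* annotate on every entry */
	return 0;
}

static int example_restart(handle_t *handle, int nblocks)
{
	/* ... return the old credits to the transaction ... */
	lock_map_release(&handle->h_lockdep_map);	/* balance before blocking again */
	handle->h_buffer_credits = nblocks;
	return example_start_this_handle(handle);	/* re-acquires the annotation */
}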
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7b4088b2364d..26d991ddc1e6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
+#include <linux/blkdev.h>
 #include <trace/events/jbd2.h>
 
 /*
@@ -133,8 +134,8 @@ static int journal_submit_commit_record(journal_t *journal,
 	bh->b_end_io = journal_end_buffer_io_sync;
 
 	if (journal->j_flags & JBD2_BARRIER &&
-		!JBD2_HAS_INCOMPAT_FEATURE(journal,
-					   JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
+				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 		set_buffer_ordered(bh);
 		barrier_done = 1;
 	}
@@ -220,7 +221,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping)
 		.nr_to_write = mapping->nrpages * 2,
 		.range_start = 0,
 		.range_end = i_size_read(mapping->host),
-		.for_writepages = 1,
 	};
 
 	ret = generic_writepages(mapping, &wbc);
@@ -707,11 +707,13 @@ start_journal_io:
 	/* Done it all: now write the commit record asynchronously. */
 
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 		err = journal_submit_commit_record(journal, commit_transaction,
 						   &cbh, crc32_sum);
 		if (err)
 			__jbd2_journal_abort_hard(journal);
+		if (journal->j_flags & JBD2_BARRIER)
+			blkdev_issue_flush(journal->j_dev, NULL);
 	}
 
 	/*
@@ -834,7 +836,7 @@ wait_for_iobuf:
 	jbd_debug(3, "JBD: commit phase 5\n");
 
 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 		err = journal_submit_commit_record(journal, commit_transaction,
 						&cbh, crc32_sum);
 		if (err)
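With an asynchronous commit record the commit block is written without a barrier, so the hunk above adds an explicit cache flush: after a power failure the commit record must not reach stable storage ahead of the journal blocks it covers. The call uses the two-argument blkdev_issue_flush() of this kernel series; the second parameter, when non-NULL, would receive the sector at which a failed flush stopped:

/* Force the device's volatile write cache to stable storage.     */
/* NULL: the caller does not care where a flush error occurred.   */
blkdev_issue_flush(journal->j_dev, NULL);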
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e378cb383979..53b86e16e5fe 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -768,7 +768,7 @@ static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
 {
 }
 
-static struct seq_operations jbd2_seq_history_ops = {
+static const struct seq_operations jbd2_seq_history_ops = {
 	.start  = jbd2_seq_history_start,
 	.next   = jbd2_seq_history_next,
 	.stop   = jbd2_seq_history_stop,
@@ -872,7 +872,7 @@ static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
 {
 }
 
-static struct seq_operations jbd2_seq_info_ops = {
+static const struct seq_operations jbd2_seq_info_ops = {
 	.start  = jbd2_seq_info_start,
 	.next   = jbd2_seq_info_next,
 	.stop   = jbd2_seq_info_stop,
@@ -1187,6 +1187,12 @@ static int journal_reset(journal_t *journal)
 
 	first = be32_to_cpu(sb->s_first);
 	last = be32_to_cpu(sb->s_maxlen);
+	if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
+		printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
+		       first, last);
+		journal_fail_superblock(journal);
+		return -EINVAL;
+	}
 
 	journal->j_first = first;
 	journal->j_last = last;
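The new sanity check mirrors the one JBD has long had in its own journal_reset() (see the fs/jbd/journal.c hunk earlier). The arithmetic: a journal spans blocks first..last inclusive, so it holds last + 1 - first usable blocks, and the test rejects journals with fewer than JBD2_MIN_JOURNAL_BLOCKS of them. A worked example, assuming the in-tree value JBD2_MIN_JOURNAL_BLOCKS == 1024:

/* first = 1, last = 1023  ->  1023 usable blocks               */
/* check: 1 + 1024 > 1024  ->  true, journal rejected (-EINVAL) */
/* first = 1, last = 1024  ->  1024 usable blocks, accepted     */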
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6213ac728f30..a0512700542f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	INIT_LIST_HEAD(&transaction->t_private_list);
 
 	/* Set up the commit timer for the new transaction. */
-	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
+	journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
 	add_timer(&journal->j_commit_timer);
 
 	J_ASSERT(journal->j_running_transaction == NULL);
@@ -238,6 +238,8 @@ repeat_locked:
 			  __jbd2_log_space_left(journal));
 	spin_unlock(&transaction->t_handle_lock);
 	spin_unlock(&journal->j_state_lock);
+
+	lock_map_acquire(&handle->h_lockdep_map);
 out:
 	if (unlikely(new_transaction))		/* It's usually NULL */
 		kfree(new_transaction);
@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 		handle = ERR_PTR(err);
 		goto out;
 	}
-
-	lock_map_acquire(&handle->h_lockdep_map);
 out:
 	return handle;
 }
@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
 		__jbd2_log_start_commit(journal, transaction->t_tid);
 	spin_unlock(&journal->j_state_lock);
 
+	lock_map_release(&handle->h_lockdep_map);
 	handle->h_buffer_credits = nblocks;
 	ret = start_this_handle(journal, handle);
 	return ret;
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index e9580104b6ba..3ff50da94789 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -15,6 +15,7 @@
 #include <linux/completion.h>
 #include <linux/sched.h>
 #include <linux/freezer.h>
+#include <linux/kthread.h>
 #include "nodelist.h"
 
 
@@ -31,7 +32,7 @@ void jffs2_garbage_collect_trigger(struct jffs2_sb_info *c)
 /* This must only ever be called when no GC thread is currently running */
 int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c)
 {
-	pid_t pid;
+	struct task_struct *tsk;
 	int ret = 0;
 
 	BUG_ON(c->gc_task);
@@ -39,15 +40,16 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c)
 	init_completion(&c->gc_thread_start);
 	init_completion(&c->gc_thread_exit);
 
-	pid = kernel_thread(jffs2_garbage_collect_thread, c, CLONE_FS|CLONE_FILES);
-	if (pid < 0) {
-		printk(KERN_WARNING "fork failed for JFFS2 garbage collect thread: %d\n", -pid);
+	tsk = kthread_run(jffs2_garbage_collect_thread, c, "jffs2_gcd_mtd%d", c->mtd->index);
+	if (IS_ERR(tsk)) {
+		printk(KERN_WARNING "fork failed for JFFS2 garbage collect thread: %ld\n", -PTR_ERR(tsk));
 		complete(&c->gc_thread_exit);
-		ret = pid;
+		ret = PTR_ERR(tsk);
 	} else {
 		/* Wait for it... */
-		D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", pid));
+		D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", tsk->pid));
 		wait_for_completion(&c->gc_thread_start);
+		ret = tsk->pid;
 	}
 
 	return ret;
@@ -71,7 +73,6 @@ static int jffs2_garbage_collect_thread(void *_c)
 {
 	struct jffs2_sb_info *c = _c;
 
-	daemonize("jffs2_gcd_mtd%d", c->mtd->index);
 	allow_signal(SIGKILL);
 	allow_signal(SIGSTOP);
 	allow_signal(SIGCONT);
@@ -107,6 +108,11 @@ static int jffs2_garbage_collect_thread(void *_c)
 		 * the GC thread get there first. */
 		schedule_timeout_interruptible(msecs_to_jiffies(50));
 
+		if (kthread_should_stop()) {
+			D1(printk(KERN_DEBUG "jffs2_garbage_collect_thread(): kthread_stop() called.\n"));
+			goto die;
+		}
+
 		/* Put_super will send a SIGKILL and then wait on the sem.
 		 */
 		while (signal_pending(current) || freezing(current)) {
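The conversion replaces the deprecated kernel_thread()/daemonize() pair with the kthread API: kthread_run() names the thread and returns a task_struct (or an ERR_PTR on failure), and the loop polls kthread_should_stop() so kthread_stop() can end it cleanly. A minimal self-contained sketch of that lifecycle (hypothetical worker, not the jffs2 GC thread):

#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static int example_thread(void *data)
{
	while (!kthread_should_stop()) {
		/* one unit of background work per iteration */
		msleep_interruptible(50);
	}
	return 0;	/* becomes the return value of kthread_stop() */
}

static struct task_struct *example_start(int index)
{
	struct task_struct *tsk;

	tsk = kthread_run(example_thread, NULL, "example_gcd_mtd%d", index);
	if (IS_ERR(tsk))
		return NULL;	/* PTR_ERR(tsk) holds the errno */
	return tsk;
}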
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 9eff2bdae8a7..c082868910f2 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -39,13 +39,13 @@ int __init jffs2_create_slab_caches(void)
 
 	raw_dirent_slab = kmem_cache_create("jffs2_raw_dirent",
 					    sizeof(struct jffs2_raw_dirent),
-					    0, 0, NULL);
+					    0, SLAB_HWCACHE_ALIGN, NULL);
 	if (!raw_dirent_slab)
 		goto err;
 
 	raw_inode_slab = kmem_cache_create("jffs2_raw_inode",
 					   sizeof(struct jffs2_raw_inode),
-					   0, 0, NULL);
+					   0, SLAB_HWCACHE_ALIGN, NULL);
 	if (!raw_inode_slab)
 		goto err;
 
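SLAB_HWCACHE_ALIGN asks the slab allocator to pad objects out to a hardware cache-line boundary, trading a little memory for the guarantee that two frequently written objects never share a line. The general call shape (generic example, not the jffs2 caches):

#include <linux/slab.h>

struct example_obj {
	unsigned int a, b;
};

static struct kmem_cache *example_cache;

static int __init example_cache_init(void)
{
	example_cache = kmem_cache_create("example_obj",
					  sizeof(struct example_obj),
					  0,			/* default base alignment */
					  SLAB_HWCACHE_ALIGN,	/* pad to a cache line */
					  NULL);		/* no constructor */
	return example_cache ? 0 : -ENOMEM;
}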
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0035c021395a..9a80e8e595d0 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -123,7 +123,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
 	return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino));
 }
 
-static struct export_operations jffs2_export_ops = {
+static const struct export_operations jffs2_export_ops = {
 	.get_parent = jffs2_get_parent,
 	.fh_to_dentry = jffs2_fh_to_dentry,
 	.fh_to_parent = jffs2_fh_to_parent,
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 1f3b0fc0d351..fc9032dc8862 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -166,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
 	 */
 	if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
 		continue;
-	if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
+	if (!rpc_cmp_addr(nlm_addr(block->b_host), addr))
 		continue;
 	if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
 		continue;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 4336adba952a..c81249fef11f 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -458,7 +458,7 @@ static void nlmclnt_locks_release_private(struct file_lock *fl)
 	nlm_put_lockowner(fl->fl_u.nfs_fl.owner);
 }
 
-static struct file_lock_operations nlmclnt_lock_ops = {
+static const struct file_lock_operations nlmclnt_lock_ops = {
 	.fl_copy_lock = nlmclnt_locks_copy_lock,
 	.fl_release_private = nlmclnt_locks_release_private,
 };
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 7cb076ac6b45..4600c2037b8b 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -111,7 +111,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 	 */
 	chain = &nlm_hosts[nlm_hash_address(ni->sap)];
 	hlist_for_each_entry(host, pos, chain, h_hash) {
-		if (!nlm_cmp_addr(nlm_addr(host), ni->sap))
+		if (!rpc_cmp_addr(nlm_addr(host), ni->sap))
 			continue;
 
 		/* See if we have an NSM handle for this client */
@@ -125,7 +125,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 		if (host->h_server != ni->server)
 			continue;
 		if (ni->server &&
-		    !nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
+		    !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap))
 			continue;
 
 		/* Move to head of hash chain. */
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 30c933188dd7..f956651d0f65 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -209,7 +209,7 @@ static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
 	struct nsm_handle *nsm;
 
 	list_for_each_entry(nsm, &nsm_handles, sm_link)
-		if (nlm_cmp_addr(nsm_addr(nsm), sap))
+		if (rpc_cmp_addr(nsm_addr(nsm), sap))
 			return nsm;
 	return NULL;
 }
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e577a78d7bac..d1001790fa9a 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -705,7 +705,7 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 	return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
 }
 
-struct lock_manager_operations nlmsvc_lock_operations = {
+const struct lock_manager_operations nlmsvc_lock_operations = {
 	.fl_compare_owner = nlmsvc_same_owner,
 	.fl_notify = nlmsvc_notify_blocked,
 	.fl_grant = nlmsvc_grant_deferred,
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 9e4d6aab611b..ad478da7ca63 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
 static int
 nlmsvc_match_ip(void *datap, struct nlm_host *host)
 {
-	return nlm_cmp_addr(nlm_srcaddr(host), datap);
+	return rpc_cmp_addr(nlm_srcaddr(host), datap);
 }
 
 /**
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 0336f2beacde..b583ab0a4cbb 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -8,7 +8,6 @@
 
 #include <linux/types.h>
 #include <linux/sched.h>
-#include <linux/utsname.h>
 #include <linux/nfs.h>
 
 #include <linux/sunrpc/xdr.h>
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index e1d528653192..ad9dbbc9145d 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -9,7 +9,6 @@
 
 #include <linux/types.h>
 #include <linux/sched.h>
-#include <linux/utsname.h>
 #include <linux/nfs.h>
 
 #include <linux/sunrpc/xdr.h>
diff --git a/fs/locks.c b/fs/locks.c
index 19ee18a6829b..a8794f233bc9 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -434,7 +434,7 @@ static int lease_mylease_callback(struct file_lock *fl, struct file_lock *try)
 	return fl->fl_file == try->fl_file;
 }
 
-static struct lock_manager_operations lease_manager_ops = {
+static const struct lock_manager_operations lease_manager_ops = {
 	.fl_break = lease_break_callback,
 	.fl_release_private = lease_release_private_callback,
 	.fl_mylease = lease_mylease_callback,
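The one-line const additions in the lockd and locks hunks (and the seq_operations and export_operations ones elsewhere in this series) are the same hardening applied repeatedly: an operations table that is only ever read belongs in .rodata, where a stray write faults instead of silently redirecting every caller. The pattern in miniature:

struct example_ops {
	int (*op)(void);
};

static int example_op(void) { return 0; }

/* Writable table: a wild store could repoint .op for all users. */
static struct example_ops ops_rw = { .op = example_op };

/* Read-only table: placed in .rodata by the compiler/linker.    */
static const struct example_ops ops_ro = { .op = example_op };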
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index d407e7a0b6fe..6198731d7fcd 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -308,14 +308,18 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
 	struct inode *inode = (struct inode*)mapping->host;
 	char *kaddr = page_address(page);
 	loff_t pos = page_offset(page) + (char*)de - kaddr;
-	unsigned len = minix_sb(inode->i_sb)->s_dirsize;
+	struct minix_sb_info *sbi = minix_sb(inode->i_sb);
+	unsigned len = sbi->s_dirsize;
 	int err;
 
 	lock_page(page);
 	err = __minix_write_begin(NULL, mapping, pos, len,
 					AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
 	if (err == 0) {
-		de->inode = 0;
+		if (sbi->s_version == MINIX_V3)
+			((minix3_dirent *) de)->inode = 0;
+		else
+			de->inode = 0;
 		err = dir_commit_chunk(page, pos, len);
 	} else {
 		unlock_page(page);
@@ -440,7 +444,10 @@ void minix_set_link(struct minix_dir_entry *de, struct page *page,
 	err = __minix_write_begin(NULL, mapping, pos, sbi->s_dirsize,
 					AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
 	if (err == 0) {
-		de->inode = inode->i_ino;
+		if (sbi->s_version == MINIX_V3)
+			((minix3_dirent *) de)->inode = inode->i_ino;
+		else
+			de->inode = inode->i_ino;
 		err = dir_commit_chunk(page, pos, sbi->s_dirsize);
 	} else {
 		unlock_page(page);
@@ -470,7 +477,14 @@ ino_t minix_inode_by_name(struct dentry *dentry)
 	ino_t res = 0;
 
 	if (de) {
-		res = de->inode;
+		struct address_space *mapping = page->mapping;
+		struct inode *inode = mapping->host;
+		struct minix_sb_info *sbi = minix_sb(inode->i_sb);
+
+		if (sbi->s_version == MINIX_V3)
+			res = ((minix3_dirent *) de)->inode;
+		else
+			res = de->inode;
 		dir_put_page(page);
 	}
 	return res;
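The three minix hunks all guard the same hazard: V1/V2 directory entries carry a 16-bit inode field, while V3 entries carry 32 bits, so reading or writing de->inode through the old struct on a V3 filesystem truncates inode numbers. A sketch of the two layouts and the dispatch (struct shapes as in the minix on-disk formats; the helper is illustrative):

#include <linux/types.h>

struct minix_dir_entry {		/* V1/V2: 16-bit inode numbers */
	__u16 inode;
	char name[0];
};

typedef struct minix3_dir_entry {	/* V3: 32-bit inode numbers */
	__u32 inode;
	char name[0];
} minix3_dirent;

/* Pick the layout from the superblock's version before dereferencing. */
static inline __u32 example_dirent_ino(void *de, int is_v3)
{
	return is_v3 ? ((minix3_dirent *)de)->inode
		     : ((struct minix_dir_entry *)de)->inode;
}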
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 9c590722d87e..b8b5b30d53f0 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1241,7 +1241,7 @@ ncp_date_unix2dos(int unix_date, __le16 *time, __le16 *date)
 		month = 2;
 	} else {
 		nl_day = (year & 3) || day <= 59 ? day : day - 1;
-		for (month = 0; month < 12; month++)
+		for (month = 1; month < 12; month++)
 			if (day_n[month] > nl_day)
 				break;
 	}
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index fa038df63ac8..53a7ed7eb9c6 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -442,7 +442,7 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp,
 	if (dentry) {
 		struct inode* s_inode = dentry->d_inode;
 
-		if (inode) {
+		if (s_inode) {
 			NCP_FINFO(s_inode)->volNumber = vnum;
 			NCP_FINFO(s_inode)->dirEntNum = de;
 			NCP_FINFO(s_inode)->DosDirNum = dosde;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index e5a2dac5f715..76b0aa0f73bf 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -222,7 +222,7 @@ static unsigned decode_sessionid(struct xdr_stream *xdr,
 
 	p = read_buf(xdr, len);
 	if (unlikely(p == NULL))
-		return htonl(NFS4ERR_RESOURCE);;
+		return htonl(NFS4ERR_RESOURCE);
 
 	memcpy(sid->data, p, len);
 	return 0;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e350bd6a2334..63976c0ccc25 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -648,8 +648,6 @@ static int nfs_start_lockd(struct nfs_server *server)
 		.hostname	= clp->cl_hostname,
 		.address	= (struct sockaddr *)&clp->cl_addr,
 		.addrlen	= clp->cl_addrlen,
-		.protocol	= server->flags & NFS_MOUNT_TCP ?
-						IPPROTO_TCP : IPPROTO_UDP,
 		.nfs_version	= clp->rpc_ops->version,
 		.noresvport	= server->flags & NFS_MOUNT_NORESVPORT ?
 					1 : 0,
@@ -660,6 +658,14 @@ static int nfs_start_lockd(struct nfs_server *server)
 	if (server->flags & NFS_MOUNT_NONLM)
 		return 0;
 
+	switch (clp->cl_proto) {
+		default:
+			nlm_init.protocol = IPPROTO_TCP;
+			break;
+		case XPRT_TRANSPORT_UDP:
+			nlm_init.protocol = IPPROTO_UDP;
+	}
+
 	host = nlmclnt_init(&nlm_init);
 	if (IS_ERR(host))
 		return PTR_ERR(host);
@@ -787,7 +793,7 @@ static int nfs_init_server(struct nfs_server *server,
 	dprintk("--> nfs_init_server()\n");
 
 #ifdef CONFIG_NFS_V3
-	if (data->flags & NFS_MOUNT_VER3)
+	if (data->version == 3)
 		cl_init.rpc_ops = &nfs_v3_clientops;
 #endif
 
@@ -933,10 +939,6 @@ static int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, str
 		goto out_error;
 
 	nfs_server_set_fsinfo(server, &fsinfo);
-	error = bdi_init(&server->backing_dev_info);
-	if (error)
-		goto out_error;
-
 
 	/* Get some general file system info */
 	if (server->namelen == 0) {
@@ -968,6 +970,7 @@ static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_serve
 	target->acdirmin = source->acdirmin;
 	target->acdirmax = source->acdirmax;
 	target->caps = source->caps;
+	target->options = source->options;
 }
 
 /*
@@ -995,6 +998,12 @@ static struct nfs_server *nfs_alloc_server(void)
 		return NULL;
 	}
 
+	if (bdi_init(&server->backing_dev_info)) {
+		nfs_free_iostats(server->io_stats);
+		kfree(server);
+		return NULL;
+	}
+
 	return server;
 }
 
@@ -1529,7 +1538,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
 static void nfs_server_list_stop(struct seq_file *p, void *v);
 static int nfs_server_list_show(struct seq_file *m, void *v);
 
-static struct seq_operations nfs_server_list_ops = {
+static const struct seq_operations nfs_server_list_ops = {
 	.start	= nfs_server_list_start,
 	.next	= nfs_server_list_next,
 	.stop	= nfs_server_list_stop,
@@ -1550,7 +1559,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos);
 static void nfs_volume_list_stop(struct seq_file *p, void *v);
 static int nfs_volume_list_show(struct seq_file *m, void *v);
 
-static struct seq_operations nfs_volume_list_ops = {
+static const struct seq_operations nfs_volume_list_ops = {
 	.start	= nfs_volume_list_start,
 	.next	= nfs_volume_list_next,
 	.stop	= nfs_volume_list_stop,
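Deriving the NLM protocol from the RPC transport the client actually negotiated (clp->cl_proto) rather than from the NFS_MOUNT_TCP flag keeps lockd consistent with mounts over transports that have no mount flag, such as RDMA, which the default branch folds into TCP. The mapping in isolation (a sketch; constants from the sunrpc headers of this series):

#include <linux/in.h>
#include <linux/sunrpc/xprtsock.h>

/* Anything that is not UDP (TCP, RDMA, ...) uses TCP for lockd. */
static int example_nlm_protocol(int xprt_proto)
{
	switch (xprt_proto) {
	case XPRT_TRANSPORT_UDP:
		return IPPROTO_UDP;
	default:
		return IPPROTO_TCP;
	}
}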
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 379be678cb7e..70fad69eb959 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -58,17 +58,34 @@ void nfs_fscache_release_client_cookie(struct nfs_client *clp)
 /*
  * Get the cache cookie for an NFS superblock.  We have to handle
  * uniquification here because the cache doesn't do it for us.
+ *
+ * The default uniquifier is just an empty string, but it may be overridden
+ * either by the 'fsc=xxx' option to mount, or by inheriting it from the parent
+ * superblock across an automount point of some nature.
  */
-void nfs_fscache_get_super_cookie(struct super_block *sb,
-				  struct nfs_parsed_mount_data *data)
+void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq,
+				  struct nfs_clone_mount *mntdata)
 {
 	struct nfs_fscache_key *key, *xkey;
 	struct nfs_server *nfss = NFS_SB(sb);
 	struct rb_node **p, *parent;
-	const char *uniq = data->fscache_uniq ?: "";
 	int diff, ulen;
 
-	ulen = strlen(uniq);
+	if (uniq) {
+		ulen = strlen(uniq);
+	} else if (mntdata) {
+		struct nfs_server *mnt_s = NFS_SB(mntdata->sb);
+		if (mnt_s->fscache_key) {
+			uniq = mnt_s->fscache_key->key.uniquifier;
+			ulen = mnt_s->fscache_key->key.uniq_len;
+		}
+	}
+
+	if (!uniq) {
+		uniq = "";
+		ulen = 1;
+	}
+
 	key = kzalloc(sizeof(*key) + ulen, GFP_KERNEL);
 	if (!key)
 		return;
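The reworked function gives the superblock cache key a three-level fallback: an explicit 'fsc=' uniquifier from mount wins, then the key inherited from the parent superblock when crossing an automount point, then the empty string (stored with length 1, as above). The selection logic in isolation (simplified; parameter names are illustrative):

/* explicit_uniq models the fsc= mount option, parent_uniq the key */
/* inherited from the parent superblock across an automount.       */
static const char *example_pick_uniq(const char *explicit_uniq,
				     const char *parent_uniq)
{
	if (explicit_uniq)
		return explicit_uniq;
	if (parent_uniq)
		return parent_uniq;
	return "";	/* default uniquifier */
}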
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 6e809bb0ff08..b9c572d0679f 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -74,7 +74,8 @@ extern void nfs_fscache_get_client_cookie(struct nfs_client *);
 extern void nfs_fscache_release_client_cookie(struct nfs_client *);
 
 extern void nfs_fscache_get_super_cookie(struct super_block *,
-					 struct nfs_parsed_mount_data *);
+					 const char *,
+					 struct nfs_clone_mount *);
 extern void nfs_fscache_release_super_cookie(struct super_block *);
 
 extern void nfs_fscache_init_inode_cookie(struct inode *);
@@ -173,7 +174,8 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
 
 static inline void nfs_fscache_get_super_cookie(
 	struct super_block *sb,
-	struct nfs_parsed_mount_data *data)
+	const char *uniq,
+	struct nfs_clone_mount *mntdata)
 {
 }
 static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index c862c9340f9a..5e078b222b4e 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -13,7 +13,6 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ee6a13f05443..3f8881d1a050 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -7,7 +7,6 @@
  */
 
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/sunrpc/clnt.h>
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 35869a4921f1..5fe5492fbd29 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -10,7 +10,6 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index be6544aef41f..ed7c269e2514 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -36,7 +36,6 @@
  */
 
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/string.h>
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 1434080aefeb..2ef4fecf3984 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -638,7 +638,7 @@ static void nfs4_fl_release_lock(struct file_lock *fl)
 	nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner);
 }
 
-static struct file_lock_operations nfs4_fl_lock_ops = {
+static const struct file_lock_operations nfs4_fl_lock_ops = {
 	.fl_copy_lock = nfs4_fl_copy_lock,
 	.fl_release_private = nfs4_fl_release_lock,
 };
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index cfc30d362f94..83ad47cbdd8a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -39,7 +39,6 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 7be72d90d49d..ef583854d8d0 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -32,7 +32,6 @@
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/mm.h>
-#include <linux/utsname.h>
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/in.h>
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 867f70504531..810770f96816 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -728,6 +728,27 @@ static void nfs_umount_begin(struct super_block *sb)
 	unlock_kernel();
 }
 
+static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(int flags)
+{
+	struct nfs_parsed_mount_data *data;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (data) {
+		data->flags		= flags;
+		data->rsize		= NFS_MAX_FILE_IO_SIZE;
+		data->wsize		= NFS_MAX_FILE_IO_SIZE;
+		data->acregmin		= NFS_DEF_ACREGMIN;
+		data->acregmax		= NFS_DEF_ACREGMAX;
+		data->acdirmin		= NFS_DEF_ACDIRMIN;
+		data->acdirmax		= NFS_DEF_ACDIRMAX;
+		data->nfs_server.port	= NFS_UNSPEC_PORT;
+		data->auth_flavors[0]	= RPC_AUTH_UNIX;
+		data->auth_flavor_len	= 1;
+		data->minorversion	= 0;
+	}
+	return data;
+}
+
 /*
  * Sanity-check a server address provided by the mount command.
  *
@@ -1430,10 +1451,13 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 	int status;
 
 	if (args->mount_server.version == 0) {
-		if (args->flags & NFS_MOUNT_VER3)
-			args->mount_server.version = NFS_MNT3_VERSION;
-		else
-			args->mount_server.version = NFS_MNT_VERSION;
+		switch (args->version) {
+		default:
+			args->mount_server.version = NFS_MNT3_VERSION;
+			break;
+		case 2:
+			args->mount_server.version = NFS_MNT_VERSION;
+		}
 	}
 	request.version = args->mount_server.version;
 
@@ -1634,20 +1658,6 @@ static int nfs_validate_mount_data(void *options,
 	if (data == NULL)
 		goto out_no_data;
 
-	args->flags		= (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
-	args->rsize		= NFS_MAX_FILE_IO_SIZE;
-	args->wsize		= NFS_MAX_FILE_IO_SIZE;
-	args->acregmin		= NFS_DEF_ACREGMIN;
-	args->acregmax		= NFS_DEF_ACREGMAX;
-	args->acdirmin		= NFS_DEF_ACDIRMIN;
-	args->acdirmax		= NFS_DEF_ACDIRMAX;
-	args->mount_server.port	= NFS_UNSPEC_PORT;
-	args->nfs_server.port	= NFS_UNSPEC_PORT;
-	args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
-	args->auth_flavors[0]	= RPC_AUTH_UNIX;
-	args->auth_flavor_len	= 1;
-	args->minorversion	= 0;
-
 	switch (data->version) {
 	case 1:
 		data->namlen = 0;
@@ -1778,7 +1788,7 @@ static int nfs_validate_mount_data(void *options,
 	}
 
 #ifndef CONFIG_NFS_V3
-	if (args->flags & NFS_MOUNT_VER3)
+	if (args->version == 3)
 		goto out_v3_not_compiled;
 #endif /* !CONFIG_NFS_V3 */
 
@@ -1918,6 +1928,8 @@ static inline void nfs_initialise_sb(struct super_block *sb)
 	if (server->flags & NFS_MOUNT_NOAC)
 		sb->s_flags |= MS_SYNCHRONOUS;
 
+	sb->s_bdi = &server->backing_dev_info;
+
 	nfs_super_set_maxbytes(sb, server->maxfilesize);
 }
 
@@ -1934,7 +1946,7 @@ static void nfs_fill_super(struct super_block *sb,
 	if (data->bsize)
 		sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
 
-	if (server->flags & NFS_MOUNT_VER3) {
+	if (server->nfs_client->rpc_ops->version == 3) {
 		/* The VFS shouldn't apply the umask to mode bits. We will do
 		 * so ourselves when necessary.
 		 */
@@ -1958,7 +1970,7 @@ static void nfs_clone_super(struct super_block *sb,
 	sb->s_blocksize = old_sb->s_blocksize;
 	sb->s_maxbytes = old_sb->s_maxbytes;
 
-	if (server->flags & NFS_MOUNT_VER3) {
+	if (server->nfs_client->rpc_ops->version == 3) {
 		/* The VFS shouldn't apply the umask to mode bits. We will do
 		 * so ourselves when necessary.
 		 */
@@ -2092,7 +2104,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	};
 	int error = -ENOMEM;
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	data = nfs_alloc_parsed_mount_data(NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
 	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
 	if (data == NULL || mntfh == NULL)
 		goto out_free_fh;
@@ -2142,7 +2154,8 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs_fill_super(s, data);
-		nfs_fscache_get_super_cookie(s, data);
+		nfs_fscache_get_super_cookie(
+			s, data ? data->fscache_uniq : NULL, NULL);
 	}
 
 	mntroot = nfs_get_root(s, mntfh);
@@ -2188,8 +2201,8 @@ static void nfs_kill_super(struct super_block *s)
 {
 	struct nfs_server *server = NFS_SB(s);
 
-	bdi_unregister(&server->backing_dev_info);
 	kill_anon_super(s);
+	bdi_unregister(&server->backing_dev_info);
 	nfs_fscache_release_super_cookie(s);
 	nfs_free_server(server);
 }
@@ -2243,6 +2256,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs_clone_super(s, data->sb);
+		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
 	mntroot = nfs_get_root(s, data->fh);
@@ -2360,18 +2374,7 @@ static int nfs4_validate_mount_data(void *options,
 	if (data == NULL)
 		goto out_no_data;
 
-	args->rsize		= NFS_MAX_FILE_IO_SIZE;
-	args->wsize		= NFS_MAX_FILE_IO_SIZE;
-	args->acregmin		= NFS_DEF_ACREGMIN;
-	args->acregmax		= NFS_DEF_ACREGMAX;
-	args->acdirmin		= NFS_DEF_ACDIRMIN;
-	args->acdirmax		= NFS_DEF_ACDIRMAX;
-	args->nfs_server.port	= NFS_UNSPEC_PORT;
-	args->auth_flavors[0]	= RPC_AUTH_UNIX;
-	args->auth_flavor_len	= 1;
 	args->version		= 4;
-	args->minorversion	= 0;
-
 	switch (data->version) {
 	case 1:
 		if (data->host_addrlen > sizeof(args->nfs_server.address))
@@ -2506,7 +2509,8 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs4_fill_super(s);
-		nfs_fscache_get_super_cookie(s, data);
+		nfs_fscache_get_super_cookie(
+			s, data ? data->fscache_uniq : NULL, NULL);
 	}
 
 	mntroot = nfs4_get_root(s, mntfh);
@@ -2654,7 +2658,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	struct nfs_parsed_mount_data *data;
 	int error = -ENOMEM;
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	data = nfs_alloc_parsed_mount_data(0);
 	if (data == NULL)
 		goto out_free_data;
 
@@ -2739,6 +2743,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
 	if (!s->s_root) {
 		/* initial superblock/root creation */
 		nfs4_clone_super(s, data->sb);
+		nfs_fscache_get_super_cookie(s, NULL, data);
 	}
 
 	mntroot = nfs4_get_root(s, data->fh);
@@ -2820,6 +2825,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2820 if (!s->s_root) { 2825 if (!s->s_root) {
2821 /* initial superblock/root creation */ 2826 /* initial superblock/root creation */
2822 nfs4_fill_super(s); 2827 nfs4_fill_super(s);
2828 nfs_fscache_get_super_cookie(s, NULL, data);
2823 } 2829 }
2824 2830
2825 mntroot = nfs4_get_root(s, &mntfh); 2831 mntroot = nfs4_get_root(s, &mntfh);
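The kzalloc() call sites in nfs_get_sb() and nfs4_get_sb() now go through nfs_alloc_parsed_mount_data(), which centralizes the per-protocol defaults deleted from the two validate routines. The helper itself is defined earlier in super.c and falls outside these hunks; a sketch reconstructed from the deleted initializations (the parameter name and field order are guesses) would look like:

	static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int flags)
	{
		struct nfs_parsed_mount_data *data;

		data = kzalloc(sizeof(*data), GFP_KERNEL);
		if (data) {
			/* defaults formerly open-coded in each validate routine */
			data->flags		= flags;	/* NFS_MOUNT_VER3 | NFS_MOUNT_TCP, or 0 for NFSv4 */
			data->rsize		= NFS_MAX_FILE_IO_SIZE;
			data->wsize		= NFS_MAX_FILE_IO_SIZE;
			data->acregmin		= NFS_DEF_ACREGMIN;
			data->acregmax		= NFS_DEF_ACREGMAX;
			data->acdirmin		= NFS_DEF_ACDIRMIN;
			data->acdirmax		= NFS_DEF_ACDIRMAX;
			data->mount_server.port	= NFS_UNSPEC_PORT;
			data->nfs_server.port	= NFS_UNSPEC_PORT;
			data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
			data->auth_flavors[0]	= RPC_AUTH_UNIX;
			data->auth_flavor_len	= 1;
			data->minorversion	= 0;
		}
		return data;
	}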
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 120acadc6a84..53eb26c16b50 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1490,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1490 .nr_to_write = LONG_MAX, 1490 .nr_to_write = LONG_MAX,
1491 .range_start = 0, 1491 .range_start = 0,
1492 .range_end = LLONG_MAX, 1492 .range_end = LLONG_MAX,
1493 .for_writepages = 1,
1494 }; 1493 };
1495 1494
1496 return __nfs_write_mapping(mapping, &wbc, how); 1495 return __nfs_write_mapping(mapping, &wbc, how);
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index d9462643155c..c1c9e035d4a4 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1341,6 +1341,8 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
1341 if (rv) 1341 if (rv)
1342 goto out; 1342 goto out;
1343 rv = check_nfsd_access(exp, rqstp); 1343 rv = check_nfsd_access(exp, rqstp);
1344 if (rv)
1345 fh_put(fhp);
1344out: 1346out:
1345 exp_put(exp); 1347 exp_put(exp);
1346 return rv; 1348 return rv;
@@ -1515,7 +1517,7 @@ static int e_show(struct seq_file *m, void *p)
1515 return svc_export_show(m, &svc_export_cache, cp); 1517 return svc_export_show(m, &svc_export_cache, cp);
1516} 1518}
1517 1519
1518struct seq_operations nfs_exports_op = { 1520const struct seq_operations nfs_exports_op = {
1519 .start = e_start, 1521 .start = e_start,
1520 .next = e_next, 1522 .next = e_next,
1521 .stop = e_stop, 1523 .stop = e_stop,
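The exp_pseudoroot() fix above plugs a file handle leak: fh_compose() has already taken references into fhp by the time check_nfsd_access() runs, so a failed access check must drop them before returning. The caller-side contract this restores is inferred from the fix rather than quoted from any documentation:

	/*
	 * Assumed calling convention after the fix:
	 *
	 *	err = exp_pseudoroot(rqstp, &fh);
	 *	if (err)
	 *		return err;	-- fh holds no references, no fh_put() needed
	 *	...use fh...
	 *	fh_put(&fh);		-- the caller releases the handle only on success
	 */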
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 01d4ec1c88e0..edf926e1062f 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -814,17 +814,6 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
814 return p; 814 return p;
815} 815}
816 816
817static __be32 *
818encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p,
819 struct svc_fh *fhp)
820{
821 p = encode_post_op_attr(cd->rqstp, p, fhp);
822 *p++ = xdr_one; /* yes, a file handle follows */
823 p = encode_fh(p, fhp);
824 fh_put(fhp);
825 return p;
826}
827
828static int 817static int
829compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, 818compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
830 const char *name, int namlen) 819 const char *name, int namlen)
@@ -836,29 +825,54 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
836 dparent = cd->fh.fh_dentry; 825 dparent = cd->fh.fh_dentry;
837 exp = cd->fh.fh_export; 826 exp = cd->fh.fh_export;
838 827
839 fh_init(fhp, NFS3_FHSIZE);
840 if (isdotent(name, namlen)) { 828 if (isdotent(name, namlen)) {
841 if (namlen == 2) { 829 if (namlen == 2) {
842 dchild = dget_parent(dparent); 830 dchild = dget_parent(dparent);
843 if (dchild == dparent) { 831 if (dchild == dparent) {
844 /* filesystem root - cannot return filehandle for ".." */ 832 /* filesystem root - cannot return filehandle for ".." */
845 dput(dchild); 833 dput(dchild);
846 return 1; 834 return -ENOENT;
847 } 835 }
848 } else 836 } else
849 dchild = dget(dparent); 837 dchild = dget(dparent);
850 } else 838 } else
851 dchild = lookup_one_len(name, dparent, namlen); 839 dchild = lookup_one_len(name, dparent, namlen);
852 if (IS_ERR(dchild)) 840 if (IS_ERR(dchild))
853 return 1; 841 return -ENOENT;
854 if (d_mountpoint(dchild) || 842 rv = -ENOENT;
855 fh_compose(fhp, exp, dchild, &cd->fh) != 0 || 843 if (d_mountpoint(dchild))
856 !dchild->d_inode) 844 goto out;
857 rv = 1; 845 rv = fh_compose(fhp, exp, dchild, &cd->fh);
846 if (rv)
847 goto out;
848 if (!dchild->d_inode)
849 goto out;
850 rv = 0;
851out:
858 dput(dchild); 852 dput(dchild);
859 return rv; 853 return rv;
860} 854}
861 855
856__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
857{
858 struct svc_fh fh;
859 int err;
860
861 fh_init(&fh, NFS3_FHSIZE);
862 err = compose_entry_fh(cd, &fh, name, namlen);
863 if (err) {
864 *p++ = 0;
865 *p++ = 0;
866 goto out;
867 }
868 p = encode_post_op_attr(cd->rqstp, p, &fh);
869 *p++ = xdr_one; /* yes, a file handle follows */
870 p = encode_fh(p, &fh);
871out:
872 fh_put(&fh);
873 return p;
874}
875
862/* 876/*
863 * Encode a directory entry. This one works for both normal readdir 877 * Encode a directory entry. This one works for both normal readdir
864 * and readdirplus. 878 * and readdirplus.
@@ -929,16 +943,8 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
929 943
930 p = encode_entry_baggage(cd, p, name, namlen, ino); 944 p = encode_entry_baggage(cd, p, name, namlen, ino);
931 945
932 /* throw in readdirplus baggage */ 946 if (plus)
933 if (plus) { 947 p = encode_entryplus_baggage(cd, p, name, namlen);
934 struct svc_fh fh;
935
936 if (compose_entry_fh(cd, &fh, name, namlen) > 0) {
937 *p++ = 0;
938 *p++ = 0;
939 } else
940 p = encode_entryplus_baggage(cd, p, &fh);
941 }
942 num_entry_words = p - cd->buffer; 948 num_entry_words = p - cd->buffer;
943 } else if (cd->rqstp->rq_respages[pn+1] != NULL) { 949 } else if (cd->rqstp->rq_respages[pn+1] != NULL) {
944 /* temporarily encode entry into next page, then move back to 950 /* temporarily encode entry into next page, then move back to
@@ -951,17 +957,8 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
951 957
952 p1 = encode_entry_baggage(cd, p1, name, namlen, ino); 958 p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
953 959
954 /* throw in readdirplus baggage */ 960 if (plus)
 955 if (plus) { 961 p1 = encode_entryplus_baggage(cd, p1, name, namlen);
956 struct svc_fh fh;
957
958 if (compose_entry_fh(cd, &fh, name, namlen) > 0) {
959 /* zero out the filehandle */
960 *p1++ = 0;
961 *p1++ = 0;
962 } else
963 p1 = encode_entryplus_baggage(cd, p1, &fh);
964 }
965 962
966 /* determine entry word length and lengths to go in pages */ 963 /* determine entry word length and lengths to go in pages */
967 num_entry_words = p1 - tmp; 964 num_entry_words = p1 - tmp;
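A note on the two zero words the new encode_entryplus_baggage() emits when compose_entry_fh() fails: they are the XDR discriminants for the optional entryplus3 fields (RFC 1813), so a failed lookup quietly degrades that entry to plain readdir data instead of failing the whole reply:

	/*
	 * READDIRPLUS baggage on the wire:
	 *
	 *	success:  post_op_attr { TRUE,  fattr3 ... }   -- encode_post_op_attr()
	 *	          post_op_fh3  { TRUE,  nfs_fh3 ... }   -- xdr_one + encode_fh()
	 *
	 *	failure:  post_op_attr { FALSE }                -- *p++ = 0
	 *	          post_op_fh3  { FALSE }                -- *p++ = 0
	 */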
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 54b8b4140c8f..725d02f210e2 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -321,7 +321,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
321 deny = ~pas.group & pas.other; 321 deny = ~pas.group & pas.other;
322 if (deny) { 322 if (deny) {
323 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; 323 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
324 ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; 324 ace->flag = eflag;
325 ace->access_mask = deny_mask_from_posix(deny, flags); 325 ace->access_mask = deny_mask_from_posix(deny, flags);
326 ace->whotype = NFS4_ACL_WHO_GROUP; 326 ace->whotype = NFS4_ACL_WHO_GROUP;
327 ace++; 327 ace++;
@@ -335,7 +335,7 @@ _posix_to_nfsv4_one(struct posix_acl *pacl, struct nfs4_acl *acl,
335 if (deny) { 335 if (deny) {
336 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE; 336 ace->type = NFS4_ACE_ACCESS_DENIED_ACE_TYPE;
337 ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP; 337 ace->flag = eflag | NFS4_ACE_IDENTIFIER_GROUP;
338 ace->access_mask = mask_from_posix(deny, flags); 338 ace->access_mask = deny_mask_from_posix(deny, flags);
339 ace->whotype = NFS4_ACL_WHO_NAMED; 339 ace->whotype = NFS4_ACL_WHO_NAMED;
340 ace->who = pa->e_id; 340 ace->who = pa->e_id;
341 ace++; 341 ace++;
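Both hunks correct the ACCESS_DENIED ACEs built by _posix_to_nfsv4_one(): the owning-group deny ACE loses a stray NFS4_ACE_IDENTIFIER_GROUP flag, and the named-entry deny ACE now uses deny_mask_from_posix() instead of the allow-side mask builder. A toy model of the deny computation feeding the first hunk (simplified rwx bits; the real fields hold NFSv4 access-mask bits):

	/* Bits the world may use that the owning group may not must be
	 * denied to GROUP@ before the trailing EVERYONE@ allow ACE. */
	static unsigned int group_deny_bits(unsigned int group, unsigned int other)
	{
		return ~group & other;	/* mirrors: deny = ~pas.group & pas.other */
	}

	/* mode 0646 (rw-r--rw-): group_deny_bits(04, 06) == 02, so GROUP@
	 * gets an ACCESS_DENIED ACE for the write bit. */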
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3fd23f7aceca..24e8d78f8dde 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -43,25 +43,30 @@
43#include <linux/sunrpc/xdr.h> 43#include <linux/sunrpc/xdr.h>
44#include <linux/sunrpc/svc.h> 44#include <linux/sunrpc/svc.h>
45#include <linux/sunrpc/clnt.h> 45#include <linux/sunrpc/clnt.h>
46#include <linux/sunrpc/svcsock.h>
46#include <linux/nfsd/nfsd.h> 47#include <linux/nfsd/nfsd.h>
47#include <linux/nfsd/state.h> 48#include <linux/nfsd/state.h>
48#include <linux/sunrpc/sched.h> 49#include <linux/sunrpc/sched.h>
49#include <linux/nfs4.h> 50#include <linux/nfs4.h>
51#include <linux/sunrpc/xprtsock.h>
50 52
51#define NFSDDBG_FACILITY NFSDDBG_PROC 53#define NFSDDBG_FACILITY NFSDDBG_PROC
52 54
53#define NFSPROC4_CB_NULL 0 55#define NFSPROC4_CB_NULL 0
54#define NFSPROC4_CB_COMPOUND 1 56#define NFSPROC4_CB_COMPOUND 1
57#define NFS4_STATEID_SIZE 16
55 58
56/* Index of predefined Linux callback client operations */ 59/* Index of predefined Linux callback client operations */
57 60
58enum { 61enum {
59 NFSPROC4_CLNT_CB_NULL = 0, 62 NFSPROC4_CLNT_CB_NULL = 0,
60 NFSPROC4_CLNT_CB_RECALL, 63 NFSPROC4_CLNT_CB_RECALL,
64 NFSPROC4_CLNT_CB_SEQUENCE,
61}; 65};
62 66
63enum nfs_cb_opnum4 { 67enum nfs_cb_opnum4 {
64 OP_CB_RECALL = 4, 68 OP_CB_RECALL = 4,
69 OP_CB_SEQUENCE = 11,
65}; 70};
66 71
67#define NFS4_MAXTAGLEN 20 72#define NFS4_MAXTAGLEN 20
@@ -70,17 +75,29 @@ enum nfs_cb_opnum4 {
70#define NFS4_dec_cb_null_sz 0 75#define NFS4_dec_cb_null_sz 0
71#define cb_compound_enc_hdr_sz 4 76#define cb_compound_enc_hdr_sz 4
72#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) 77#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2))
78#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2)
79#define cb_sequence_enc_sz (sessionid_sz + 4 + \
80 1 /* no referring calls list yet */)
81#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4)
82
73#define op_enc_sz 1 83#define op_enc_sz 1
74#define op_dec_sz 2 84#define op_dec_sz 2
75#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2)) 85#define enc_nfs4_fh_sz (1 + (NFS4_FHSIZE >> 2))
76#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2) 86#define enc_stateid_sz (NFS4_STATEID_SIZE >> 2)
77#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \ 87#define NFS4_enc_cb_recall_sz (cb_compound_enc_hdr_sz + \
88 cb_sequence_enc_sz + \
78 1 + enc_stateid_sz + \ 89 1 + enc_stateid_sz + \
79 enc_nfs4_fh_sz) 90 enc_nfs4_fh_sz)
80 91
81#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ 92#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
93 cb_sequence_dec_sz + \
82 op_dec_sz) 94 op_dec_sz)
83 95
96struct nfs4_rpc_args {
97 void *args_op;
98 struct nfsd4_cb_sequence args_seq;
99};
100
84/* 101/*
85* Generic encode routines from fs/nfs/nfs4xdr.c 102* Generic encode routines from fs/nfs/nfs4xdr.c
86*/ 103*/
@@ -137,11 +154,13 @@ xdr_error: \
137} while (0) 154} while (0)
138 155
139struct nfs4_cb_compound_hdr { 156struct nfs4_cb_compound_hdr {
140 int status; 157 /* args */
141 u32 ident; 158 u32 ident; /* minorversion 0 only */
142 u32 nops; 159 u32 nops;
143 __be32 *nops_p; 160 __be32 *nops_p;
144 u32 minorversion; 161 u32 minorversion;
162 /* res */
163 int status;
145 u32 taglen; 164 u32 taglen;
146 char *tag; 165 char *tag;
147}; 166};
@@ -238,6 +257,27 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
238 hdr->nops++; 257 hdr->nops++;
239} 258}
240 259
260static void
261encode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *args,
262 struct nfs4_cb_compound_hdr *hdr)
263{
264 __be32 *p;
265
266 if (hdr->minorversion == 0)
267 return;
268
269 RESERVE_SPACE(1 + NFS4_MAX_SESSIONID_LEN + 20);
270
271 WRITE32(OP_CB_SEQUENCE);
272 WRITEMEM(args->cbs_clp->cl_sessionid.data, NFS4_MAX_SESSIONID_LEN);
273 WRITE32(args->cbs_clp->cl_cb_seq_nr);
274 WRITE32(0); /* slotid, always 0 */
275 WRITE32(0); /* highest slotid always 0 */
276 WRITE32(0); /* cachethis always 0 */
277 WRITE32(0); /* FIXME: support referring_call_lists */
278 hdr->nops++;
279}
280
241static int 281static int
242nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p) 282nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
243{ 283{
@@ -249,15 +289,19 @@ nfs4_xdr_enc_cb_null(struct rpc_rqst *req, __be32 *p)
249} 289}
250 290
251static int 291static int
252nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, struct nfs4_delegation *args) 292nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
293 struct nfs4_rpc_args *rpc_args)
253{ 294{
254 struct xdr_stream xdr; 295 struct xdr_stream xdr;
296 struct nfs4_delegation *args = rpc_args->args_op;
255 struct nfs4_cb_compound_hdr hdr = { 297 struct nfs4_cb_compound_hdr hdr = {
256 .ident = args->dl_ident, 298 .ident = args->dl_ident,
299 .minorversion = rpc_args->args_seq.cbs_minorversion,
257 }; 300 };
258 301
259 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 302 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
260 encode_cb_compound_hdr(&xdr, &hdr); 303 encode_cb_compound_hdr(&xdr, &hdr);
304 encode_cb_sequence(&xdr, &rpc_args->args_seq, &hdr);
261 encode_cb_recall(&xdr, args, &hdr); 305 encode_cb_recall(&xdr, args, &hdr);
262 encode_cb_nops(&hdr); 306 encode_cb_nops(&hdr);
263 return 0; 307 return 0;
@@ -299,6 +343,57 @@ decode_cb_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
299 return 0; 343 return 0;
300} 344}
301 345
346/*
 347 * Our current back channel implementation supports a single backchannel
348 * with a single slot.
349 */
350static int
351decode_cb_sequence(struct xdr_stream *xdr, struct nfsd4_cb_sequence *res,
352 struct rpc_rqst *rqstp)
353{
354 struct nfs4_sessionid id;
355 int status;
356 u32 dummy;
357 __be32 *p;
358
359 if (res->cbs_minorversion == 0)
360 return 0;
361
362 status = decode_cb_op_hdr(xdr, OP_CB_SEQUENCE);
363 if (status)
364 return status;
365
366 /*
367 * If the server returns different values for sessionID, slotID or
368 * sequence number, the server is looney tunes.
369 */
370 status = -ESERVERFAULT;
371
372 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
373 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
374 p += XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN);
375 if (memcmp(id.data, res->cbs_clp->cl_sessionid.data,
376 NFS4_MAX_SESSIONID_LEN)) {
377 dprintk("%s Invalid session id\n", __func__);
378 goto out;
379 }
380 READ32(dummy);
381 if (dummy != res->cbs_clp->cl_cb_seq_nr) {
382 dprintk("%s Invalid sequence number\n", __func__);
383 goto out;
384 }
385 READ32(dummy); /* slotid must be 0 */
386 if (dummy != 0) {
387 dprintk("%s Invalid slotid\n", __func__);
388 goto out;
389 }
390 /* FIXME: process highest slotid and target highest slotid */
391 status = 0;
392out:
393 return status;
394}
395
396
302static int 397static int
303nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p) 398nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
304{ 399{
@@ -306,7 +401,8 @@ nfs4_xdr_dec_cb_null(struct rpc_rqst *req, __be32 *p)
306} 401}
307 402
308static int 403static int
309nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p) 404nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p,
405 struct nfsd4_cb_sequence *seq)
310{ 406{
311 struct xdr_stream xdr; 407 struct xdr_stream xdr;
312 struct nfs4_cb_compound_hdr hdr; 408 struct nfs4_cb_compound_hdr hdr;
@@ -316,6 +412,11 @@ nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, __be32 *p)
316 status = decode_cb_compound_hdr(&xdr, &hdr); 412 status = decode_cb_compound_hdr(&xdr, &hdr);
317 if (status) 413 if (status)
318 goto out; 414 goto out;
415 if (seq) {
416 status = decode_cb_sequence(&xdr, seq, rqstp);
417 if (status)
418 goto out;
419 }
319 status = decode_cb_op_hdr(&xdr, OP_CB_RECALL); 420 status = decode_cb_op_hdr(&xdr, OP_CB_RECALL);
320out: 421out:
321 return status; 422 return status;
@@ -377,16 +478,15 @@ static int max_cb_time(void)
377 478
378int setup_callback_client(struct nfs4_client *clp) 479int setup_callback_client(struct nfs4_client *clp)
379{ 480{
380 struct sockaddr_in addr;
381 struct nfs4_cb_conn *cb = &clp->cl_cb_conn; 481 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
382 struct rpc_timeout timeparms = { 482 struct rpc_timeout timeparms = {
383 .to_initval = max_cb_time(), 483 .to_initval = max_cb_time(),
384 .to_retries = 0, 484 .to_retries = 0,
385 }; 485 };
386 struct rpc_create_args args = { 486 struct rpc_create_args args = {
387 .protocol = IPPROTO_TCP, 487 .protocol = XPRT_TRANSPORT_TCP,
388 .address = (struct sockaddr *)&addr, 488 .address = (struct sockaddr *) &cb->cb_addr,
389 .addrsize = sizeof(addr), 489 .addrsize = cb->cb_addrlen,
390 .timeout = &timeparms, 490 .timeout = &timeparms,
391 .program = &cb_program, 491 .program = &cb_program,
392 .prognumber = cb->cb_prog, 492 .prognumber = cb->cb_prog,
@@ -399,13 +499,10 @@ int setup_callback_client(struct nfs4_client *clp)
399 499
400 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) 500 if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
401 return -EINVAL; 501 return -EINVAL;
402 502 if (cb->cb_minorversion) {
403 /* Initialize address */ 503 args.bc_xprt = clp->cl_cb_xprt;
404 memset(&addr, 0, sizeof(addr)); 504 args.protocol = XPRT_TRANSPORT_BC_TCP;
405 addr.sin_family = AF_INET; 505 }
406 addr.sin_port = htons(cb->cb_port);
407 addr.sin_addr.s_addr = htonl(cb->cb_addr);
408
409 /* Create RPC client */ 506 /* Create RPC client */
410 client = rpc_create(&args); 507 client = rpc_create(&args);
411 if (IS_ERR(client)) { 508 if (IS_ERR(client)) {
@@ -439,42 +536,29 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
439 .rpc_call_done = nfsd4_cb_probe_done, 536 .rpc_call_done = nfsd4_cb_probe_done,
440}; 537};
441 538
442static struct rpc_cred *lookup_cb_cred(struct nfs4_cb_conn *cb) 539static struct rpc_cred *callback_cred;
443{
444 struct auth_cred acred = {
445 .machine_cred = 1
446 };
447 540
448 /* 541int set_callback_cred(void)
449 * Note in the gss case this doesn't actually have to wait for a 542{
450 * gss upcall (or any calls to the client); this just creates a 543 callback_cred = rpc_lookup_machine_cred();
451 * non-uptodate cred which the rpc state machine will fill in with 544 if (!callback_cred)
452 * a refresh_upcall later. 545 return -ENOMEM;
453 */ 546 return 0;
454 return rpcauth_lookup_credcache(cb->cb_client->cl_auth, &acred,
455 RPCAUTH_LOOKUP_NEW);
456} 547}
457 548
549
458void do_probe_callback(struct nfs4_client *clp) 550void do_probe_callback(struct nfs4_client *clp)
459{ 551{
460 struct nfs4_cb_conn *cb = &clp->cl_cb_conn; 552 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
461 struct rpc_message msg = { 553 struct rpc_message msg = {
462 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], 554 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
463 .rpc_argp = clp, 555 .rpc_argp = clp,
556 .rpc_cred = callback_cred
464 }; 557 };
465 struct rpc_cred *cred;
466 int status; 558 int status;
467 559
468 cred = lookup_cb_cred(cb);
469 if (IS_ERR(cred)) {
470 status = PTR_ERR(cred);
471 goto out;
472 }
473 cb->cb_cred = cred;
474 msg.rpc_cred = cb->cb_cred;
475 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT, 560 status = rpc_call_async(cb->cb_client, &msg, RPC_TASK_SOFT,
476 &nfsd4_cb_probe_ops, (void *)clp); 561 &nfsd4_cb_probe_ops, (void *)clp);
477out:
478 if (status) { 562 if (status) {
479 warn_no_callback_path(clp, status); 563 warn_no_callback_path(clp, status);
480 put_nfs4_client(clp); 564 put_nfs4_client(clp);
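With lookup_cb_cred() gone, every callback shares the single machine credential created by set_callback_cred(). That function must run once before any callback is sent; its call site is outside this diff, so the hookup below is an assumption about the rest of the series:

	/* Presumed one-time initialization, e.g. from nfsd startup code: */
	int nfsd4_cb_cred_init_sketch(void)
	{
		int status = set_callback_cred();

		if (status)
			return status;	/* -ENOMEM: no machine cred available */
		/* from here on, msg.rpc_cred = callback_cred for every callback */
		return 0;
	}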
@@ -503,11 +587,95 @@ nfsd4_probe_callback(struct nfs4_client *clp)
503 do_probe_callback(clp); 587 do_probe_callback(clp);
504} 588}
505 589
590/*
591 * There's currently a single callback channel slot.
 592 * If the slot is available, then mark it busy. Otherwise, put the
 593 * thread to sleep on the callback RPC wait queue.
594 */
595static int nfsd41_cb_setup_sequence(struct nfs4_client *clp,
596 struct rpc_task *task)
597{
598 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
599 u32 *ptr = (u32 *)clp->cl_sessionid.data;
600 int status = 0;
601
602 dprintk("%s: %u:%u:%u:%u\n", __func__,
603 ptr[0], ptr[1], ptr[2], ptr[3]);
604
605 if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) {
606 rpc_sleep_on(&clp->cl_cb_waitq, task, NULL);
607 dprintk("%s slot is busy\n", __func__);
608 status = -EAGAIN;
609 goto out;
610 }
611
612 /*
613 * We'll need the clp during XDR encoding and decoding,
614 * and the sequence during decoding to verify the reply
615 */
616 args->args_seq.cbs_clp = clp;
617 task->tk_msg.rpc_resp = &args->args_seq;
618
619out:
620 dprintk("%s status=%d\n", __func__, status);
621 return status;
622}
623
624/*
625 * TODO: cb_sequence should support referring call lists, cachethis, multiple
626 * slots, and mark callback channel down on communication errors.
627 */
628static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
629{
630 struct nfs4_delegation *dp = calldata;
631 struct nfs4_client *clp = dp->dl_client;
632 struct nfs4_rpc_args *args = task->tk_msg.rpc_argp;
633 u32 minorversion = clp->cl_cb_conn.cb_minorversion;
634 int status = 0;
635
636 args->args_seq.cbs_minorversion = minorversion;
637 if (minorversion) {
638 status = nfsd41_cb_setup_sequence(clp, task);
639 if (status) {
640 if (status != -EAGAIN) {
641 /* terminate rpc task */
642 task->tk_status = status;
643 task->tk_action = NULL;
644 }
645 return;
646 }
647 }
648 rpc_call_start(task);
649}
650
651static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
652{
653 struct nfs4_delegation *dp = calldata;
654 struct nfs4_client *clp = dp->dl_client;
655
656 dprintk("%s: minorversion=%d\n", __func__,
657 clp->cl_cb_conn.cb_minorversion);
658
659 if (clp->cl_cb_conn.cb_minorversion) {
660 /* No need for lock, access serialized in nfsd4_cb_prepare */
661 ++clp->cl_cb_seq_nr;
662 clear_bit(0, &clp->cl_cb_slot_busy);
663 rpc_wake_up_next(&clp->cl_cb_waitq);
664 dprintk("%s: freed slot, new seqid=%d\n", __func__,
665 clp->cl_cb_seq_nr);
666
667 /* We're done looking into the sequence information */
668 task->tk_msg.rpc_resp = NULL;
669 }
670}
671
506static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) 672static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
507{ 673{
508 struct nfs4_delegation *dp = calldata; 674 struct nfs4_delegation *dp = calldata;
509 struct nfs4_client *clp = dp->dl_client; 675 struct nfs4_client *clp = dp->dl_client;
510 676
677 nfsd4_cb_done(task, calldata);
678
511 switch (task->tk_status) { 679 switch (task->tk_status) {
512 case -EIO: 680 case -EIO:
513 /* Network partition? */ 681 /* Network partition? */
@@ -520,16 +688,19 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
520 break; 688 break;
521 default: 689 default:
522 /* success, or error we can't handle */ 690 /* success, or error we can't handle */
523 return; 691 goto done;
524 } 692 }
525 if (dp->dl_retries--) { 693 if (dp->dl_retries--) {
526 rpc_delay(task, 2*HZ); 694 rpc_delay(task, 2*HZ);
527 task->tk_status = 0; 695 task->tk_status = 0;
528 rpc_restart_call(task); 696 rpc_restart_call(task);
697 return;
529 } else { 698 } else {
530 atomic_set(&clp->cl_cb_conn.cb_set, 0); 699 atomic_set(&clp->cl_cb_conn.cb_set, 0);
531 warn_no_callback_path(clp, task->tk_status); 700 warn_no_callback_path(clp, task->tk_status);
532 } 701 }
702done:
703 kfree(task->tk_msg.rpc_argp);
533} 704}
534 705
535static void nfsd4_cb_recall_release(void *calldata) 706static void nfsd4_cb_recall_release(void *calldata)
@@ -542,6 +713,7 @@ static void nfsd4_cb_recall_release(void *calldata)
542} 713}
543 714
544static const struct rpc_call_ops nfsd4_cb_recall_ops = { 715static const struct rpc_call_ops nfsd4_cb_recall_ops = {
716 .rpc_call_prepare = nfsd4_cb_prepare,
545 .rpc_call_done = nfsd4_cb_recall_done, 717 .rpc_call_done = nfsd4_cb_recall_done,
546 .rpc_release = nfsd4_cb_recall_release, 718 .rpc_release = nfsd4_cb_recall_release,
547}; 719};
@@ -554,17 +726,24 @@ nfsd4_cb_recall(struct nfs4_delegation *dp)
554{ 726{
555 struct nfs4_client *clp = dp->dl_client; 727 struct nfs4_client *clp = dp->dl_client;
556 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client; 728 struct rpc_clnt *clnt = clp->cl_cb_conn.cb_client;
729 struct nfs4_rpc_args *args;
557 struct rpc_message msg = { 730 struct rpc_message msg = {
558 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], 731 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
559 .rpc_argp = dp, 732 .rpc_cred = callback_cred
560 .rpc_cred = clp->cl_cb_conn.cb_cred
561 }; 733 };
562 int status; 734 int status = -ENOMEM;
563 735
736 args = kzalloc(sizeof(*args), GFP_KERNEL);
737 if (!args)
738 goto out;
739 args->args_op = dp;
740 msg.rpc_argp = args;
564 dp->dl_retries = 1; 741 dp->dl_retries = 1;
565 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, 742 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT,
566 &nfsd4_cb_recall_ops, dp); 743 &nfsd4_cb_recall_ops, dp);
744out:
567 if (status) { 745 if (status) {
746 kfree(args);
568 put_nfs4_client(clp); 747 put_nfs4_client(clp);
569 nfs4_put_delegation(dp); 748 nfs4_put_delegation(dp);
570 } 749 }
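As a cross-check of the enlarged request-size macros near the top of this file, substituting the constants visible above (NFS4_MAX_SESSIONID_LEN = 16, NFS4_STATEID_SIZE = 16) plus NFS4_FHSIZE = 128 from this kernel's headers gives, in 4-byte XDR words:

	enum {
		chk_sessionid_sz	= 16 >> 2,			/*  4 */
		chk_cb_sequence_enc_sz	= chk_sessionid_sz + 4 + 1,	/*  9 */
		chk_enc_stateid_sz	= 16 >> 2,			/*  4 */
		chk_enc_nfs4_fh_sz	= 1 + (128 >> 2),		/* 33 */
		chk_enc_cb_recall_sz	= 4 /* compound hdr */
					+ chk_cb_sequence_enc_sz
					+ 1 + chk_enc_stateid_sz
					+ chk_enc_nfs4_fh_sz,		/* 51 words, 204 bytes */
	};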
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index cdfa86fa1471..ba2c199592fd 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -38,7 +38,6 @@
38#include <linux/init.h> 38#include <linux/init.h>
39 39
40#include <linux/mm.h> 40#include <linux/mm.h>
41#include <linux/utsname.h>
42#include <linux/errno.h> 41#include <linux/errno.h>
43#include <linux/string.h> 42#include <linux/string.h>
44#include <linux/sunrpc/clnt.h> 43#include <linux/sunrpc/clnt.h>
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 7c8801769a3c..bebc0c2e1b0a 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -68,7 +68,6 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
68 u32 *bmval, u32 *writable) 68 u32 *bmval, u32 *writable)
69{ 69{
70 struct dentry *dentry = cstate->current_fh.fh_dentry; 70 struct dentry *dentry = cstate->current_fh.fh_dentry;
71 struct svc_export *exp = cstate->current_fh.fh_export;
72 71
73 /* 72 /*
74 * Check about attributes are supported by the NFSv4 server or not. 73 * Check about attributes are supported by the NFSv4 server or not.
@@ -80,17 +79,13 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
80 return nfserr_attrnotsupp; 79 return nfserr_attrnotsupp;
81 80
82 /* 81 /*
83 * Check FATTR4_WORD0_ACL & FATTR4_WORD0_FS_LOCATIONS can be supported 82 * Check FATTR4_WORD0_ACL can be supported
84 * in current environment or not. 83 * in current environment or not.
85 */ 84 */
86 if (bmval[0] & FATTR4_WORD0_ACL) { 85 if (bmval[0] & FATTR4_WORD0_ACL) {
87 if (!IS_POSIXACL(dentry->d_inode)) 86 if (!IS_POSIXACL(dentry->d_inode))
88 return nfserr_attrnotsupp; 87 return nfserr_attrnotsupp;
89 } 88 }
90 if (bmval[0] & FATTR4_WORD0_FS_LOCATIONS) {
91 if (exp->ex_fslocs.locations == NULL)
92 return nfserr_attrnotsupp;
93 }
94 89
95 /* 90 /*
96 * According to spec, read-only attributes return ERR_INVAL. 91 * According to spec, read-only attributes return ERR_INVAL.
@@ -123,6 +118,35 @@ nfsd4_check_open_attributes(struct svc_rqst *rqstp,
123 return status; 118 return status;
124} 119}
125 120
121static int
122is_create_with_attrs(struct nfsd4_open *open)
123{
124 return open->op_create == NFS4_OPEN_CREATE
125 && (open->op_createmode == NFS4_CREATE_UNCHECKED
126 || open->op_createmode == NFS4_CREATE_GUARDED
127 || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1);
128}
129
130/*
 131 * If an error occurs when setting the ACL, just clear the ACL bit
132 * in the returned attr bitmap.
133 */
134static void
135do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
136 struct nfs4_acl *acl, u32 *bmval)
137{
138 __be32 status;
139
140 status = nfsd4_set_nfs4_acl(rqstp, fhp, acl);
141 if (status)
142 /*
143 * We should probably fail the whole open at this point,
144 * but we've already created the file, so it's too late;
 145 * so this seems the least of evils:
146 */
147 bmval[0] &= ~FATTR4_WORD0_ACL;
148}
149
126static inline void 150static inline void
127fh_dup2(struct svc_fh *dst, struct svc_fh *src) 151fh_dup2(struct svc_fh *dst, struct svc_fh *src)
128{ 152{
@@ -206,6 +230,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
206 if (status) 230 if (status)
207 goto out; 231 goto out;
208 232
233 if (is_create_with_attrs(open) && open->op_acl != NULL)
234 do_set_nfs4_acl(rqstp, &resfh, open->op_acl, open->op_bmval);
235
209 set_change_info(&open->op_cinfo, current_fh); 236 set_change_info(&open->op_cinfo, current_fh);
210 fh_dup2(current_fh, &resfh); 237 fh_dup2(current_fh, &resfh);
211 238
@@ -536,12 +563,17 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
536 status = nfserr_badtype; 563 status = nfserr_badtype;
537 } 564 }
538 565
539 if (!status) { 566 if (status)
540 fh_unlock(&cstate->current_fh); 567 goto out;
541 set_change_info(&create->cr_cinfo, &cstate->current_fh); 568
542 fh_dup2(&cstate->current_fh, &resfh); 569 if (create->cr_acl != NULL)
543 } 570 do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
571 create->cr_bmval);
544 572
573 fh_unlock(&cstate->current_fh);
574 set_change_info(&create->cr_cinfo, &cstate->current_fh);
575 fh_dup2(&cstate->current_fh, &resfh);
576out:
545 fh_put(&resfh); 577 fh_put(&resfh);
546 return status; 578 return status;
547} 579}
@@ -947,34 +979,6 @@ static struct nfsd4_operation nfsd4_ops[];
947static const char *nfsd4_op_name(unsigned opnum); 979static const char *nfsd4_op_name(unsigned opnum);
948 980
949/* 981/*
950 * This is a replay of a compound for which no cache entry pages
951 * were used. Encode the sequence operation, and if cachethis is FALSE
952 * encode the uncache rep error on the next operation.
953 */
954static __be32
955nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
956 struct nfsd4_compoundres *resp)
957{
958 struct nfsd4_op *op;
959
960 dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
961 resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
962
963 /* Encode the replayed sequence operation */
964 BUG_ON(resp->opcnt != 1);
965 op = &args->ops[resp->opcnt - 1];
966 nfsd4_encode_operation(resp, op);
967
968 /*return nfserr_retry_uncached_rep in next operation. */
969 if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
970 op = &args->ops[resp->opcnt++];
971 op->status = nfserr_retry_uncached_rep;
972 nfsd4_encode_operation(resp, op);
973 }
974 return op->status;
975}
976
977/*
978 * Enforce NFSv4.1 COMPOUND ordering rules. 982 * Enforce NFSv4.1 COMPOUND ordering rules.
979 * 983 *
980 * TODO: 984 * TODO:
@@ -1083,13 +1087,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1083 BUG_ON(op->status == nfs_ok); 1087 BUG_ON(op->status == nfs_ok);
1084 1088
1085encode_op: 1089encode_op:
1086 /* Only from SEQUENCE or CREATE_SESSION */ 1090 /* Only from SEQUENCE */
1087 if (resp->cstate.status == nfserr_replay_cache) { 1091 if (resp->cstate.status == nfserr_replay_cache) {
1088 dprintk("%s NFS4.1 replay from cache\n", __func__); 1092 dprintk("%s NFS4.1 replay from cache\n", __func__);
1089 if (nfsd4_not_cached(resp)) 1093 status = op->status;
1090 status = nfsd4_enc_uncached_replay(args, resp);
1091 else
1092 status = op->status;
1093 goto out; 1094 goto out;
1094 } 1095 }
1095 if (op->status == nfserr_replay_me) { 1096 if (op->status == nfserr_replay_me) {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 980a216a48c8..2153f9bdbebd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -55,6 +55,7 @@
55#include <linux/lockd/bind.h> 55#include <linux/lockd/bind.h>
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/sunrpc/svcauth_gss.h> 57#include <linux/sunrpc/svcauth_gss.h>
58#include <linux/sunrpc/clnt.h>
58 59
59#define NFSDDBG_FACILITY NFSDDBG_PROC 60#define NFSDDBG_FACILITY NFSDDBG_PROC
60 61
@@ -413,36 +414,65 @@ gen_sessionid(struct nfsd4_session *ses)
413} 414}
414 415
415/* 416/*
 416 * Give the client the number of slots it requests bound by 417 * The protocol defines ca_maxresponsesize_cached to include the size of
417 * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages. 418 * the rpc header, but all we need to cache is the data starting after
419 * the end of the initial SEQUENCE operation--the rest we regenerate
 420 * each time. Therefore we can advertise a ca_maxresponsesize_cached
421 * value that is the number of bytes in our cache plus a few additional
422 * bytes. In order to stay on the safe side, and not promise more than
423 * we can cache, those additional bytes must be the minimum possible: 24
424 * bytes of rpc header (xid through accept state, with AUTH_NULL
425 * verifier), 12 for the compound header (with zero-length tag), and 44
426 * for the SEQUENCE op response:
427 */
428#define NFSD_MIN_HDR_SEQ_SZ (24 + 12 + 44)
429
430/*
431 * Give the client the number of ca_maxresponsesize_cached slots it
432 * requests, of size bounded by NFSD_SLOT_CACHE_SIZE,
433 * NFSD_MAX_MEM_PER_SESSION, and nfsd_drc_max_mem. Do not allow more
434 * than NFSD_MAX_SLOTS_PER_SESSION.
418 * 435 *
419 * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we 436 * If we run out of reserved DRC memory we should (up to a point)
420 * should (up to a point) re-negotiate active sessions and reduce their 437 * re-negotiate active sessions and reduce their slot usage to make
 421 * slot usage to make room for new connections. For now we just fail the 438 * room for new connections. For now we just fail the create session.
422 * create session.
423 */ 439 */
424static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan) 440static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
425{ 441{
426 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT; 442 int mem, size = fchan->maxresp_cached;
427 443
428 if (fchan->maxreqs < 1) 444 if (fchan->maxreqs < 1)
429 return nfserr_inval; 445 return nfserr_inval;
430 else if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
431 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
432 446
433 spin_lock(&nfsd_serv->sv_lock); 447 if (size < NFSD_MIN_HDR_SEQ_SZ)
434 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages) 448 size = NFSD_MIN_HDR_SEQ_SZ;
435 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used; 449 size -= NFSD_MIN_HDR_SEQ_SZ;
436 nfsd_serv->sv_drc_pages_used += np; 450 if (size > NFSD_SLOT_CACHE_SIZE)
437 spin_unlock(&nfsd_serv->sv_lock); 451 size = NFSD_SLOT_CACHE_SIZE;
452
453 /* bound the maxreqs by NFSD_MAX_MEM_PER_SESSION */
454 mem = fchan->maxreqs * size;
455 if (mem > NFSD_MAX_MEM_PER_SESSION) {
456 fchan->maxreqs = NFSD_MAX_MEM_PER_SESSION / size;
457 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
458 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
459 mem = fchan->maxreqs * size;
460 }
438 461
439 if (np <= 0) { 462 spin_lock(&nfsd_drc_lock);
 440 status = nfserr_resource; 463 /* bound the total session drc memory usage */
441 fchan->maxreqs = 0; 464 if (mem + nfsd_drc_mem_used > nfsd_drc_max_mem) {
442 } else 465 fchan->maxreqs = (nfsd_drc_max_mem - nfsd_drc_mem_used) / size;
443 fchan->maxreqs = np / NFSD_PAGES_PER_SLOT; 466 mem = fchan->maxreqs * size;
467 }
468 nfsd_drc_mem_used += mem;
469 spin_unlock(&nfsd_drc_lock);
444 470
445 return status; 471 if (fchan->maxreqs == 0)
472 return nfserr_serverfault;
473
474 fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ;
475 return 0;
446} 476}
447 477
448/* 478/*
@@ -466,36 +496,41 @@ static int init_forechannel_attrs(struct svc_rqst *rqstp,
466 fchan->maxresp_sz = maxcount; 496 fchan->maxresp_sz = maxcount;
467 session_fchan->maxresp_sz = fchan->maxresp_sz; 497 session_fchan->maxresp_sz = fchan->maxresp_sz;
468 498
469 /* Set the max response cached size our default which is
470 * a multiple of PAGE_SIZE and small */
471 session_fchan->maxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
472 fchan->maxresp_cached = session_fchan->maxresp_cached;
473
474 /* Use the client's maxops if possible */ 499 /* Use the client's maxops if possible */
475 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND) 500 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
476 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND; 501 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
477 session_fchan->maxops = fchan->maxops; 502 session_fchan->maxops = fchan->maxops;
478 503
479 /* try to use the client requested number of slots */
480 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
481 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
482
483 /* FIXME: Error means no more DRC pages so the server should 504 /* FIXME: Error means no more DRC pages so the server should
484 * recover pages from existing sessions. For now fail session 505 * recover pages from existing sessions. For now fail session
485 * creation. 506 * creation.
486 */ 507 */
487 status = set_forechannel_maxreqs(fchan); 508 status = set_forechannel_drc_size(fchan);
488 509
510 session_fchan->maxresp_cached = fchan->maxresp_cached;
489 session_fchan->maxreqs = fchan->maxreqs; 511 session_fchan->maxreqs = fchan->maxreqs;
512
513 dprintk("%s status %d\n", __func__, status);
490 return status; 514 return status;
491} 515}
492 516
517static void
518free_session_slots(struct nfsd4_session *ses)
519{
520 int i;
521
522 for (i = 0; i < ses->se_fchannel.maxreqs; i++)
523 kfree(ses->se_slots[i]);
524}
525
493static int 526static int
494alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, 527alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
495 struct nfsd4_create_session *cses) 528 struct nfsd4_create_session *cses)
496{ 529{
497 struct nfsd4_session *new, tmp; 530 struct nfsd4_session *new, tmp;
498 int idx, status = nfserr_resource, slotsize; 531 struct nfsd4_slot *sp;
532 int idx, slotsize, cachesize, i;
533 int status;
499 534
500 memset(&tmp, 0, sizeof(tmp)); 535 memset(&tmp, 0, sizeof(tmp));
501 536
@@ -506,14 +541,27 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
506 if (status) 541 if (status)
507 goto out; 542 goto out;
508 543
509 /* allocate struct nfsd4_session and slot table in one piece */ 544 BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
510 slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot); 545 + sizeof(struct nfsd4_session) > PAGE_SIZE);
546
547 status = nfserr_serverfault;
548 /* allocate struct nfsd4_session and slot table pointers in one piece */
549 slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
511 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); 550 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
512 if (!new) 551 if (!new)
513 goto out; 552 goto out;
514 553
515 memcpy(new, &tmp, sizeof(*new)); 554 memcpy(new, &tmp, sizeof(*new));
516 555
556 /* allocate each struct nfsd4_slot and data cache in one piece */
557 cachesize = new->se_fchannel.maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
558 for (i = 0; i < new->se_fchannel.maxreqs; i++) {
559 sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL);
560 if (!sp)
561 goto out_free;
562 new->se_slots[i] = sp;
563 }
564
517 new->se_client = clp; 565 new->se_client = clp;
518 gen_sessionid(new); 566 gen_sessionid(new);
519 idx = hash_sessionid(&new->se_sessionid); 567 idx = hash_sessionid(&new->se_sessionid);
@@ -530,6 +578,10 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
530 status = nfs_ok; 578 status = nfs_ok;
531out: 579out:
532 return status; 580 return status;
581out_free:
582 free_session_slots(new);
583 kfree(new);
584 goto out;
533} 585}
534 586
535/* caller must hold sessionid_lock */ 587/* caller must hold sessionid_lock */
@@ -572,19 +624,16 @@ release_session(struct nfsd4_session *ses)
572 nfsd4_put_session(ses); 624 nfsd4_put_session(ses);
573} 625}
574 626
575static void nfsd4_release_respages(struct page **respages, short resused);
576
577void 627void
578free_session(struct kref *kref) 628free_session(struct kref *kref)
579{ 629{
580 struct nfsd4_session *ses; 630 struct nfsd4_session *ses;
581 int i;
582 631
583 ses = container_of(kref, struct nfsd4_session, se_ref); 632 ses = container_of(kref, struct nfsd4_session, se_ref);
584 for (i = 0; i < ses->se_fchannel.maxreqs; i++) { 633 spin_lock(&nfsd_drc_lock);
585 struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry; 634 nfsd_drc_mem_used -= ses->se_fchannel.maxreqs * NFSD_SLOT_CACHE_SIZE;
586 nfsd4_release_respages(e->ce_respages, e->ce_resused); 635 spin_unlock(&nfsd_drc_lock);
587 } 636 free_session_slots(ses);
588 kfree(ses); 637 kfree(ses);
589} 638}
590 639
@@ -647,18 +696,14 @@ shutdown_callback_client(struct nfs4_client *clp)
647 clp->cl_cb_conn.cb_client = NULL; 696 clp->cl_cb_conn.cb_client = NULL;
648 rpc_shutdown_client(clnt); 697 rpc_shutdown_client(clnt);
649 } 698 }
650 if (clp->cl_cb_conn.cb_cred) {
651 put_rpccred(clp->cl_cb_conn.cb_cred);
652 clp->cl_cb_conn.cb_cred = NULL;
653 }
654} 699}
655 700
656static inline void 701static inline void
657free_client(struct nfs4_client *clp) 702free_client(struct nfs4_client *clp)
658{ 703{
659 shutdown_callback_client(clp); 704 shutdown_callback_client(clp);
660 nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages, 705 if (clp->cl_cb_xprt)
661 clp->cl_slot.sl_cache_entry.ce_resused); 706 svc_xprt_put(clp->cl_cb_xprt);
662 if (clp->cl_cred.cr_group_info) 707 if (clp->cl_cred.cr_group_info)
663 put_group_info(clp->cl_cred.cr_group_info); 708 put_group_info(clp->cl_cred.cr_group_info);
664 kfree(clp->cl_principal); 709 kfree(clp->cl_principal);
@@ -714,25 +759,6 @@ expire_client(struct nfs4_client *clp)
714 put_nfs4_client(clp); 759 put_nfs4_client(clp);
715} 760}
716 761
717static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
718{
719 struct nfs4_client *clp;
720
721 clp = alloc_client(name);
722 if (clp == NULL)
723 return NULL;
724 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
725 atomic_set(&clp->cl_count, 1);
726 atomic_set(&clp->cl_cb_conn.cb_set, 0);
727 INIT_LIST_HEAD(&clp->cl_idhash);
728 INIT_LIST_HEAD(&clp->cl_strhash);
729 INIT_LIST_HEAD(&clp->cl_openowners);
730 INIT_LIST_HEAD(&clp->cl_delegations);
731 INIT_LIST_HEAD(&clp->cl_sessions);
732 INIT_LIST_HEAD(&clp->cl_lru);
733 return clp;
734}
735
736static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) 762static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
737{ 763{
738 memcpy(target->cl_verifier.data, source->data, 764 memcpy(target->cl_verifier.data, source->data,
@@ -795,6 +821,46 @@ static void gen_confirm(struct nfs4_client *clp)
795 *p++ = i++; 821 *p++ = i++;
796} 822}
797 823
824static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
825 struct svc_rqst *rqstp, nfs4_verifier *verf)
826{
827 struct nfs4_client *clp;
828 struct sockaddr *sa = svc_addr(rqstp);
829 char *princ;
830
831 clp = alloc_client(name);
832 if (clp == NULL)
833 return NULL;
834
835 princ = svc_gss_principal(rqstp);
836 if (princ) {
837 clp->cl_principal = kstrdup(princ, GFP_KERNEL);
838 if (clp->cl_principal == NULL) {
839 free_client(clp);
840 return NULL;
841 }
842 }
843
844 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
845 atomic_set(&clp->cl_count, 1);
846 atomic_set(&clp->cl_cb_conn.cb_set, 0);
847 INIT_LIST_HEAD(&clp->cl_idhash);
848 INIT_LIST_HEAD(&clp->cl_strhash);
849 INIT_LIST_HEAD(&clp->cl_openowners);
850 INIT_LIST_HEAD(&clp->cl_delegations);
851 INIT_LIST_HEAD(&clp->cl_sessions);
852 INIT_LIST_HEAD(&clp->cl_lru);
853 clear_bit(0, &clp->cl_cb_slot_busy);
854 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
855 copy_verf(clp, verf);
856 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
857 clp->cl_flavor = rqstp->rq_flavor;
858 copy_cred(&clp->cl_cred, &rqstp->rq_cred);
859 gen_confirm(clp);
860
861 return clp;
862}
863
798static int check_name(struct xdr_netobj name) 864static int check_name(struct xdr_netobj name)
799{ 865{
800 if (name.len == 0) 866 if (name.len == 0)
@@ -902,93 +968,40 @@ find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
902 return NULL; 968 return NULL;
903} 969}
904 970
905/* a helper function for parse_callback */
906static int
907parse_octet(unsigned int *lenp, char **addrp)
908{
909 unsigned int len = *lenp;
910 char *p = *addrp;
911 int n = -1;
912 char c;
913
914 for (;;) {
915 if (!len)
916 break;
917 len--;
918 c = *p++;
919 if (c == '.')
920 break;
921 if ((c < '0') || (c > '9')) {
922 n = -1;
923 break;
924 }
925 if (n < 0)
926 n = 0;
927 n = (n * 10) + (c - '0');
928 if (n > 255) {
929 n = -1;
930 break;
931 }
932 }
933 *lenp = len;
934 *addrp = p;
935 return n;
936}
937
938/* parse and set the setclientid ipv4 callback address */
939static int
940parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigned short *cbportp)
941{
942 int temp = 0;
943 u32 cbaddr = 0;
944 u16 cbport = 0;
945 u32 addrlen = addr_len;
946 char *addr = addr_val;
947 int i, shift;
948
949 /* ipaddress */
950 shift = 24;
951 for(i = 4; i > 0 ; i--) {
952 if ((temp = parse_octet(&addrlen, &addr)) < 0) {
953 return 0;
954 }
955 cbaddr |= (temp << shift);
956 if (shift > 0)
957 shift -= 8;
958 }
959 *cbaddrp = cbaddr;
960
961 /* port */
962 shift = 8;
963 for(i = 2; i > 0 ; i--) {
964 if ((temp = parse_octet(&addrlen, &addr)) < 0) {
965 return 0;
966 }
967 cbport |= (temp << shift);
968 if (shift > 0)
969 shift -= 8;
970 }
971 *cbportp = cbport;
972 return 1;
973}
974
975static void 971static void
976gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) 972gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, u32 scopeid)
977{ 973{
978 struct nfs4_cb_conn *cb = &clp->cl_cb_conn; 974 struct nfs4_cb_conn *cb = &clp->cl_cb_conn;
979 975 unsigned short expected_family;
980 /* Currently, we only support tcp for the callback channel */ 976
981 if ((se->se_callback_netid_len != 3) || memcmp((char *)se->se_callback_netid_val, "tcp", 3)) 977 /* Currently, we only support tcp and tcp6 for the callback channel */
978 if (se->se_callback_netid_len == 3 &&
979 !memcmp(se->se_callback_netid_val, "tcp", 3))
980 expected_family = AF_INET;
981 else if (se->se_callback_netid_len == 4 &&
982 !memcmp(se->se_callback_netid_val, "tcp6", 4))
983 expected_family = AF_INET6;
984 else
982 goto out_err; 985 goto out_err;
983 986
984 if ( !(parse_ipv4(se->se_callback_addr_len, se->se_callback_addr_val, 987 cb->cb_addrlen = rpc_uaddr2sockaddr(se->se_callback_addr_val,
985 &cb->cb_addr, &cb->cb_port))) 988 se->se_callback_addr_len,
989 (struct sockaddr *) &cb->cb_addr,
990 sizeof(cb->cb_addr));
991
992 if (!cb->cb_addrlen || cb->cb_addr.ss_family != expected_family)
986 goto out_err; 993 goto out_err;
994
995 if (cb->cb_addr.ss_family == AF_INET6)
996 ((struct sockaddr_in6 *) &cb->cb_addr)->sin6_scope_id = scopeid;
997
987 cb->cb_minorversion = 0; 998 cb->cb_minorversion = 0;
988 cb->cb_prog = se->se_callback_prog; 999 cb->cb_prog = se->se_callback_prog;
989 cb->cb_ident = se->se_callback_ident; 1000 cb->cb_ident = se->se_callback_ident;
990 return; 1001 return;
991out_err: 1002out_err:
1003 cb->cb_addr.ss_family = AF_UNSPEC;
1004 cb->cb_addrlen = 0;
992 dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) " 1005 dprintk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
993 "will not receive delegations\n", 1006 "will not receive delegations\n",
994 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id); 1007 clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
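gen_callback() now parses the client's callback address with rpc_uaddr2sockaddr() rather than the hand-rolled octet parser deleted above. A universal address (the rpcbind convention) appends the port as two extra decimal octets, illustrated with made-up values:

	/*
	 *	netid "tcp",  r_addr "192.0.2.10.3.255"
	 *	  -> AF_INET,  addr 192.0.2.10, port = 3 * 256 + 255 = 1023
	 *
	 *	netid "tcp6", r_addr "::1.8.1"
	 *	  -> AF_INET6, addr ::1, port = 8 * 256 + 1 = 2049
	 *
	 * rpc_uaddr2sockaddr() returns the sockaddr length, or 0 on a parse
	 * failure, which is what the !cb->cb_addrlen check above catches.
	 */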
@@ -996,175 +1009,87 @@ out_err:
996 return; 1009 return;
997} 1010}
998 1011
999void
1000nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
1001{
1002 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1003
1004 resp->cstate.statp = statp;
1005}
1006
1007/* 1012/*
1008 * Dereference the result pages. 1013 * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size.
1009 */ 1014 */
1010static void 1015void
1011nfsd4_release_respages(struct page **respages, short resused) 1016nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
1012{ 1017{
1013 int i; 1018 struct nfsd4_slot *slot = resp->cstate.slot;
1019 unsigned int base;
1014 1020
1015 dprintk("--> %s\n", __func__); 1021 dprintk("--> %s slot %p\n", __func__, slot);
1016 for (i = 0; i < resused; i++) {
1017 if (!respages[i])
1018 continue;
1019 put_page(respages[i]);
1020 respages[i] = NULL;
1021 }
1022}
1023 1022
1024static void 1023 slot->sl_opcnt = resp->opcnt;
1025nfsd4_copy_pages(struct page **topages, struct page **frompages, short count) 1024 slot->sl_status = resp->cstate.status;
1026{
1027 int i;
1028 1025
1029 for (i = 0; i < count; i++) { 1026 if (nfsd4_not_cached(resp)) {
1030 topages[i] = frompages[i]; 1027 slot->sl_datalen = 0;
1031 if (!topages[i]) 1028 return;
1032 continue;
1033 get_page(topages[i]);
1034 } 1029 }
1030 slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap;
1031 base = (char *)resp->cstate.datap -
1032 (char *)resp->xbuf->head[0].iov_base;
1033 if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data,
1034 slot->sl_datalen))
 1035 WARN(1, "%s: sessions DRC could not cache compound\n", __func__);
1036 return;
1035} 1037}
1036 1038
1037/* 1039/*
1038 * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous 1040 * Encode the replay sequence operation from the slot values.
1039 * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total 1041 * If cachethis is FALSE encode the uncached rep error on the next
1040 * length of the XDR response is less than se_fmaxresp_cached 1042 * operation which sets resp->p and increments resp->opcnt for
1041 * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages is used for a 1043 * nfs4svc_encode_compoundres.
1042 * of the reply (e.g. readdir).
1043 * 1044 *
1044 * Store the base and length of the rq_req.head[0] page
1045 * of the NFSv4.1 data, just past the rpc header.
1046 */ 1045 */
1047void 1046static __be32
1048nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) 1047nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
1048 struct nfsd4_compoundres *resp)
1049{ 1049{
1050 struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; 1050 struct nfsd4_op *op;
1051 struct svc_rqst *rqstp = resp->rqstp; 1051 struct nfsd4_slot *slot = resp->cstate.slot;
1052 struct nfsd4_compoundargs *args = rqstp->rq_argp;
1053 struct nfsd4_op *op = &args->ops[resp->opcnt];
1054 struct kvec *resv = &rqstp->rq_res.head[0];
1055
1056 dprintk("--> %s entry %p\n", __func__, entry);
1057
1058 /* Don't cache a failed OP_SEQUENCE. */
1059 if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
1060 return;
1061 1052
1062 nfsd4_release_respages(entry->ce_respages, entry->ce_resused); 1053 dprintk("--> %s resp->opcnt %d cachethis %u \n", __func__,
1063 entry->ce_opcnt = resp->opcnt; 1054 resp->opcnt, resp->cstate.slot->sl_cachethis);
1064 entry->ce_status = resp->cstate.status;
1065 1055
1066 /* 1056 /* Encode the replayed sequence operation */
1067 * Don't need a page to cache just the sequence operation - the slot 1057 op = &args->ops[resp->opcnt - 1];
1068 * does this for us! 1058 nfsd4_encode_operation(resp, op);
1069 */
1070 1059
1071 if (nfsd4_not_cached(resp)) { 1060 /* Return nfserr_retry_uncached_rep in next operation. */
1072 entry->ce_resused = 0; 1061 if (args->opcnt > 1 && slot->sl_cachethis == 0) {
1073 entry->ce_rpchdrlen = 0; 1062 op = &args->ops[resp->opcnt++];
1074 dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__, 1063 op->status = nfserr_retry_uncached_rep;
1075 resp->cstate.slot->sl_cache_entry.ce_cachethis); 1064 nfsd4_encode_operation(resp, op);
1076 return;
1077 }
1078 entry->ce_resused = rqstp->rq_resused;
1079 if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
1080 entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
1081 nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
1082 entry->ce_resused);
1083 entry->ce_datav.iov_base = resp->cstate.statp;
1084 entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
1085 (char *)page_address(rqstp->rq_respages[0]));
1086 /* Current request rpc header length*/
1087 entry->ce_rpchdrlen = (char *)resp->cstate.statp -
1088 (char *)page_address(rqstp->rq_respages[0]);
1089}
1090
1091/*
1092 * We keep the rpc header, but take the nfs reply from the replycache.
1093 */
1094static int
1095nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
1096 struct nfsd4_cache_entry *entry)
1097{
1098 struct svc_rqst *rqstp = resp->rqstp;
1099 struct kvec *resv = &resp->rqstp->rq_res.head[0];
1100 int len;
1101
1102 /* Current request rpc header length*/
1103 len = (char *)resp->cstate.statp -
1104 (char *)page_address(rqstp->rq_respages[0]);
1105 if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
1106 dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
1107 entry->ce_datav.iov_len);
1108 return 0;
1109 } 1065 }
1110 /* copy the cached reply nfsd data past the current rpc header */ 1066 return op->status;
1111 memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
1112 entry->ce_datav.iov_len);
1113 resv->iov_len = len + entry->ce_datav.iov_len;
1114 return 1;
1115} 1067}
1116 1068
1117/* 1069/*
1118 * Keep the first page of the replay. Copy the NFSv4.1 data from the first 1070 * The sequence operation is not cached because we can use the slot and
1119 * cached page. Replace any futher replay pages from the cache. 1071 * session values.
1120 */ 1072 */
1121__be32 1073__be32
1122nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, 1074nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
1123 struct nfsd4_sequence *seq) 1075 struct nfsd4_sequence *seq)
1124{ 1076{
1125 struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry; 1077 struct nfsd4_slot *slot = resp->cstate.slot;
1126 __be32 status; 1078 __be32 status;
1127 1079
1128 dprintk("--> %s entry %p\n", __func__, entry); 1080 dprintk("--> %s slot %p\n", __func__, slot);
1129
1130 /*
1131 * If this is just the sequence operation, we did not keep
1132 * a page in the cache entry because we can just use the
1133 * slot info stored in struct nfsd4_sequence that was checked
1134 * against the slot in nfsd4_sequence().
1135 *
1136 * This occurs when seq->cachethis is FALSE, or when the client
1137 * session inactivity timer fires and a solo sequence operation
1138 * is sent (lease renewal).
1139 */
1140 if (seq && nfsd4_not_cached(resp)) {
1141 seq->maxslots = resp->cstate.session->se_fchannel.maxreqs;
1142 return nfs_ok;
1143 }
1144
1145 if (!nfsd41_copy_replay_data(resp, entry)) {
1146 /*
1147 * Not enough room to use the replay rpc header, send the
1148 * cached header. Release all the allocated result pages.
1149 */
1150 svc_free_res_pages(resp->rqstp);
1151 nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
1152 entry->ce_resused);
1153 } else {
1154 /* Release all but the first allocated result page */
1155 1081
1156 resp->rqstp->rq_resused--; 1082 /* Either returns 0 or nfserr_retry_uncached */
1157 svc_free_res_pages(resp->rqstp); 1083 status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp);
1084 if (status == nfserr_retry_uncached_rep)
1085 return status;
1158 1086
1159 nfsd4_copy_pages(&resp->rqstp->rq_respages[1], 1087 /* The sequence operation has been encoded, cstate->datap set. */
1160 &entry->ce_respages[1], 1088 memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen);
1161 entry->ce_resused - 1);
1162 }
1163 1089
1164 resp->rqstp->rq_resused = entry->ce_resused; 1090 resp->opcnt = slot->sl_opcnt;
1165 resp->opcnt = entry->ce_opcnt; 1091 resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen);
1166 resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen; 1092 status = slot->sl_status;
1167 status = entry->ce_status;
1168 1093
1169 return status; 1094 return status;
1170} 1095}
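
The replay path above now works on a flat per-slot buffer: nfsd4_store_cache_entry() saves the encoded reply bytes into the slot, and a detected retransmission copies them straight back out (the memcpy of sl_data/sl_datalen), with the SEQUENCE op itself re-encoded from slot and session state rather than cached. A minimal userspace sketch of that model, assuming illustrative names and a fixed-size array in place of sl_data:

#include <string.h>

struct drc_slot {
        unsigned int seqid;
        int opcnt;
        int status;
        size_t datalen;
        char data[1024];                /* stands in for sl_data */
};

static int slot_store(struct drc_slot *s, const char *reply, size_t len,
                      int opcnt, int status)
{
        if (len > sizeof(s->data))
                return -1;              /* too large: reply stays uncached */
        memcpy(s->data, reply, len);    /* cache the encoded reply bytes */
        s->datalen = len;
        s->opcnt = opcnt;
        s->status = status;
        return 0;
}

static int slot_replay(const struct drc_slot *s, char *reply, size_t *lenp)
{
        memcpy(reply, s->data, s->datalen);     /* replay cached bytes */
        *lenp = s->datalen;
        return s->status;
}

This is what lets the left column delete all the page-copying machinery (nfsd4_copy_pages, rq_resused bookkeeping): the cache is just bytes in the slot.
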
@@ -1194,13 +1119,15 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1194 int status; 1119 int status;
1195 unsigned int strhashval; 1120 unsigned int strhashval;
1196 char dname[HEXDIR_LEN]; 1121 char dname[HEXDIR_LEN];
1122 char addr_str[INET6_ADDRSTRLEN];
1197 nfs4_verifier verf = exid->verifier; 1123 nfs4_verifier verf = exid->verifier;
1198 u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; 1124 struct sockaddr *sa = svc_addr(rqstp);
1199 1125
1126 rpc_ntop(sa, addr_str, sizeof(addr_str));
1200 dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " 1127 dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
1201 " ip_addr=%u flags %x, spa_how %d\n", 1128 "ip_addr=%s flags %x, spa_how %d\n",
1202 __func__, rqstp, exid, exid->clname.len, exid->clname.data, 1129 __func__, rqstp, exid, exid->clname.len, exid->clname.data,
1203 ip_addr, exid->flags, exid->spa_how); 1130 addr_str, exid->flags, exid->spa_how);
1204 1131
1205 if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A)) 1132 if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
1206 return nfserr_inval; 1133 return nfserr_inval;
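
Switching the dprintk from a raw u32 ip_addr to rpc_ntop() lets the same log line cover IPv4 and IPv6 callers. A userspace analogue of the idea, with inet_ntop standing in for the kernel's rpc_ntop:

#include <arpa/inet.h>
#include <stdio.h>

int main(void)
{
        struct sockaddr_in sa = { .sin_family = AF_INET };
        char addr_str[INET6_ADDRSTRLEN];

        inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);
        /* One buffer size and one call handle both address families. */
        inet_ntop(sa.sin_family, &sa.sin_addr, addr_str, sizeof(addr_str));
        printf("ip_addr=%s\n", addr_str);
        return 0;
}
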
@@ -1281,28 +1208,23 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1281 1208
1282out_new: 1209out_new:
1283 /* Normal case */ 1210 /* Normal case */
1284 new = create_client(exid->clname, dname); 1211 new = create_client(exid->clname, dname, rqstp, &verf);
1285 if (new == NULL) { 1212 if (new == NULL) {
1286 status = nfserr_resource; 1213 status = nfserr_serverfault;
1287 goto out; 1214 goto out;
1288 } 1215 }
1289 1216
1290 copy_verf(new, &verf);
1291 copy_cred(&new->cl_cred, &rqstp->rq_cred);
1292 new->cl_addr = ip_addr;
1293 gen_clid(new); 1217 gen_clid(new);
1294 gen_confirm(new);
1295 add_to_unconfirmed(new, strhashval); 1218 add_to_unconfirmed(new, strhashval);
1296out_copy: 1219out_copy:
1297 exid->clientid.cl_boot = new->cl_clientid.cl_boot; 1220 exid->clientid.cl_boot = new->cl_clientid.cl_boot;
1298 exid->clientid.cl_id = new->cl_clientid.cl_id; 1221 exid->clientid.cl_id = new->cl_clientid.cl_id;
1299 1222
1300 new->cl_slot.sl_seqid = 0;
1301 exid->seqid = 1; 1223 exid->seqid = 1;
1302 nfsd4_set_ex_flags(new, exid); 1224 nfsd4_set_ex_flags(new, exid);
1303 1225
1304 dprintk("nfsd4_exchange_id seqid %d flags %x\n", 1226 dprintk("nfsd4_exchange_id seqid %d flags %x\n",
1305 new->cl_slot.sl_seqid, new->cl_exchange_flags); 1227 new->cl_cs_slot.sl_seqid, new->cl_exchange_flags);
1306 status = nfs_ok; 1228 status = nfs_ok;
1307 1229
1308out: 1230out:
@@ -1313,40 +1235,60 @@ error:
1313} 1235}
1314 1236
1315static int 1237static int
1316check_slot_seqid(u32 seqid, struct nfsd4_slot *slot) 1238check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
1317{ 1239{
1318 dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid, 1240 dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid,
1319 slot->sl_seqid); 1241 slot_seqid);
1320 1242
1321 /* The slot is in use, and no response has been sent. */ 1243 /* The slot is in use, and no response has been sent. */
1322 if (slot->sl_inuse) { 1244 if (slot_inuse) {
1323 if (seqid == slot->sl_seqid) 1245 if (seqid == slot_seqid)
1324 return nfserr_jukebox; 1246 return nfserr_jukebox;
1325 else 1247 else
1326 return nfserr_seq_misordered; 1248 return nfserr_seq_misordered;
1327 } 1249 }
1328 /* Normal */ 1250 /* Normal */
1329 if (likely(seqid == slot->sl_seqid + 1)) 1251 if (likely(seqid == slot_seqid + 1))
1330 return nfs_ok; 1252 return nfs_ok;
1331 /* Replay */ 1253 /* Replay */
1332 if (seqid == slot->sl_seqid) 1254 if (seqid == slot_seqid)
1333 return nfserr_replay_cache; 1255 return nfserr_replay_cache;
1334 /* Wraparound */ 1256 /* Wraparound */
1335 if (seqid == 1 && (slot->sl_seqid + 1) == 0) 1257 if (seqid == 1 && (slot_seqid + 1) == 0)
1336 return nfs_ok; 1258 return nfs_ok;
1337 /* Misordered replay or misordered new request */ 1259 /* Misordered replay or misordered new request */
1338 return nfserr_seq_misordered; 1260 return nfserr_seq_misordered;
1339} 1261}
1340 1262
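
check_slot_seqid() now takes plain values instead of a struct nfsd4_slot, so session slots and the per-client create_session slot can share it; create_session passes 0 for slot_inuse because clientid slots are never held busy. The decision table, restated as a compilable sketch (the enum names are stand-ins for the nfserr codes):

enum seq_res { SEQ_OK, SEQ_JUKEBOX, SEQ_MISORDERED, SEQ_REPLAY };

static enum seq_res slot_seqid_check(unsigned int seqid,
                                     unsigned int slot_seqid, int inuse)
{
        if (inuse)                      /* request still being processed */
                return seqid == slot_seqid ? SEQ_JUKEBOX : SEQ_MISORDERED;
        if (seqid == slot_seqid + 1)    /* normal progression */
                return SEQ_OK;
        if (seqid == slot_seqid)        /* retransmission: replay cache */
                return SEQ_REPLAY;
        if (seqid == 1 && slot_seqid + 1 == 0)
                return SEQ_OK;          /* 32-bit wraparound */
        return SEQ_MISORDERED;
}
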
1263/*
1264 * Cache the create session result into the create session single DRC
1265 * slot cache by saving the xdr structure. sl_seqid has been set.
1266 * Do this for solo or embedded create session operations.
1267 */
1268static void
1269nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses,
1270 struct nfsd4_clid_slot *slot, int nfserr)
1271{
1272 slot->sl_status = nfserr;
1273 memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses));
1274}
1275
1276static __be32
1277nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
1278 struct nfsd4_clid_slot *slot)
1279{
1280 memcpy(cr_ses, &slot->sl_cr_ses, sizeof(*cr_ses));
1281 return slot->sl_status;
1282}
1283
1341__be32 1284__be32
1342nfsd4_create_session(struct svc_rqst *rqstp, 1285nfsd4_create_session(struct svc_rqst *rqstp,
1343 struct nfsd4_compound_state *cstate, 1286 struct nfsd4_compound_state *cstate,
1344 struct nfsd4_create_session *cr_ses) 1287 struct nfsd4_create_session *cr_ses)
1345{ 1288{
1346 u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr; 1289 struct sockaddr *sa = svc_addr(rqstp);
1347 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1348 struct nfs4_client *conf, *unconf; 1290 struct nfs4_client *conf, *unconf;
1349 struct nfsd4_slot *slot = NULL; 1291 struct nfsd4_clid_slot *cs_slot = NULL;
1350 int status = 0; 1292 int status = 0;
1351 1293
1352 nfs4_lock_state(); 1294 nfs4_lock_state();
@@ -1354,40 +1296,38 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1354 conf = find_confirmed_client(&cr_ses->clientid); 1296 conf = find_confirmed_client(&cr_ses->clientid);
1355 1297
1356 if (conf) { 1298 if (conf) {
1357 slot = &conf->cl_slot; 1299 cs_slot = &conf->cl_cs_slot;
1358 status = check_slot_seqid(cr_ses->seqid, slot); 1300 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1359 if (status == nfserr_replay_cache) { 1301 if (status == nfserr_replay_cache) {
1360 dprintk("Got a create_session replay! seqid= %d\n", 1302 dprintk("Got a create_session replay! seqid= %d\n",
1361 slot->sl_seqid); 1303 cs_slot->sl_seqid);
1362 cstate->slot = slot;
1363 cstate->status = status;
1364 /* Return the cached reply status */ 1304 /* Return the cached reply status */
1365 status = nfsd4_replay_cache_entry(resp, NULL); 1305 status = nfsd4_replay_create_session(cr_ses, cs_slot);
1366 goto out; 1306 goto out;
1367 } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) { 1307 } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
1368 status = nfserr_seq_misordered; 1308 status = nfserr_seq_misordered;
1369 dprintk("Sequence misordered!\n"); 1309 dprintk("Sequence misordered!\n");
1370 dprintk("Expected seqid= %d but got seqid= %d\n", 1310 dprintk("Expected seqid= %d but got seqid= %d\n",
1371 slot->sl_seqid, cr_ses->seqid); 1311 cs_slot->sl_seqid, cr_ses->seqid);
1372 goto out; 1312 goto out;
1373 } 1313 }
1374 conf->cl_slot.sl_seqid++; 1314 cs_slot->sl_seqid++;
1375 } else if (unconf) { 1315 } else if (unconf) {
1376 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || 1316 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
1377 (ip_addr != unconf->cl_addr)) { 1317 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
1378 status = nfserr_clid_inuse; 1318 status = nfserr_clid_inuse;
1379 goto out; 1319 goto out;
1380 } 1320 }
1381 1321
1382 slot = &unconf->cl_slot; 1322 cs_slot = &unconf->cl_cs_slot;
1383 status = check_slot_seqid(cr_ses->seqid, slot); 1323 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
1384 if (status) { 1324 if (status) {
1385 /* an unconfirmed replay returns misordered */ 1325 /* an unconfirmed replay returns misordered */
1386 status = nfserr_seq_misordered; 1326 status = nfserr_seq_misordered;
1387 goto out; 1327 goto out_cache;
1388 } 1328 }
1389 1329
1390 slot->sl_seqid++; /* from 0 to 1 */ 1330 cs_slot->sl_seqid++; /* from 0 to 1 */
1391 move_to_confirmed(unconf); 1331 move_to_confirmed(unconf);
1392 1332
1393 /* 1333 /*
@@ -1396,6 +1336,19 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1396 cr_ses->flags &= ~SESSION4_PERSIST; 1336 cr_ses->flags &= ~SESSION4_PERSIST;
1397 cr_ses->flags &= ~SESSION4_RDMA; 1337 cr_ses->flags &= ~SESSION4_RDMA;
1398 1338
1339 if (cr_ses->flags & SESSION4_BACK_CHAN) {
1340 unconf->cl_cb_xprt = rqstp->rq_xprt;
1341 svc_xprt_get(unconf->cl_cb_xprt);
1342 rpc_copy_addr(
1343 (struct sockaddr *)&unconf->cl_cb_conn.cb_addr,
1344 sa);
1345 unconf->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
1346 unconf->cl_cb_conn.cb_minorversion =
1347 cstate->minorversion;
1348 unconf->cl_cb_conn.cb_prog = cr_ses->callback_prog;
1349 unconf->cl_cb_seq_nr = 1;
1350 nfsd4_probe_callback(unconf);
1351 }
1399 conf = unconf; 1352 conf = unconf;
1400 } else { 1353 } else {
1401 status = nfserr_stale_clientid; 1354 status = nfserr_stale_clientid;
@@ -1408,12 +1361,11 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1408 1361
1409 memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data, 1362 memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
1410 NFS4_MAX_SESSIONID_LEN); 1363 NFS4_MAX_SESSIONID_LEN);
1411 cr_ses->seqid = slot->sl_seqid; 1364 cr_ses->seqid = cs_slot->sl_seqid;
1412 1365
1413 slot->sl_inuse = true; 1366out_cache:
1414 cstate->slot = slot; 1367 /* cache solo and embedded create sessions under the state lock */
1415 /* Ensure a page is used for the cache */ 1368 nfsd4_cache_create_session(cr_ses, cs_slot, status);
1416 slot->sl_cache_entry.ce_cachethis = 1;
1417out: 1369out:
1418 nfs4_unlock_state(); 1370 nfs4_unlock_state();
1419 dprintk("%s returns %d\n", __func__, ntohl(status)); 1371 dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1478,18 +1430,23 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1478 if (seq->slotid >= session->se_fchannel.maxreqs) 1430 if (seq->slotid >= session->se_fchannel.maxreqs)
1479 goto out; 1431 goto out;
1480 1432
1481 slot = &session->se_slots[seq->slotid]; 1433 slot = session->se_slots[seq->slotid];
1482 dprintk("%s: slotid %d\n", __func__, seq->slotid); 1434 dprintk("%s: slotid %d\n", __func__, seq->slotid);
1483 1435
1484 status = check_slot_seqid(seq->seqid, slot); 1436 /* We do not negotiate the number of slots yet, so set the
1437 * maxslots to the session maxreqs which is used to encode
1438 * sr_highest_slotid and the sr_target_slot id to maxslots */
1439 seq->maxslots = session->se_fchannel.maxreqs;
1440
1441 status = check_slot_seqid(seq->seqid, slot->sl_seqid, slot->sl_inuse);
1485 if (status == nfserr_replay_cache) { 1442 if (status == nfserr_replay_cache) {
1486 cstate->slot = slot; 1443 cstate->slot = slot;
1487 cstate->session = session; 1444 cstate->session = session;
1488 /* Return the cached reply status and set cstate->status 1445 /* Return the cached reply status and set cstate->status
1489 * for nfsd4_svc_encode_compoundres processing */ 1446 * for nfsd4_proc_compound processing */
1490 status = nfsd4_replay_cache_entry(resp, seq); 1447 status = nfsd4_replay_cache_entry(resp, seq);
1491 cstate->status = nfserr_replay_cache; 1448 cstate->status = nfserr_replay_cache;
1492 goto replay_cache; 1449 goto out;
1493 } 1450 }
1494 if (status) 1451 if (status)
1495 goto out; 1452 goto out;
@@ -1497,23 +1454,23 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1497 /* Success! bump slot seqid */ 1454 /* Success! bump slot seqid */
1498 slot->sl_inuse = true; 1455 slot->sl_inuse = true;
1499 slot->sl_seqid = seq->seqid; 1456 slot->sl_seqid = seq->seqid;
1500 slot->sl_cache_entry.ce_cachethis = seq->cachethis; 1457 slot->sl_cachethis = seq->cachethis;
1501 /* Always set the cache entry cachethis for solo sequence */
1502 if (nfsd4_is_solo_sequence(resp))
1503 slot->sl_cache_entry.ce_cachethis = 1;
1504 1458
1505 cstate->slot = slot; 1459 cstate->slot = slot;
1506 cstate->session = session; 1460 cstate->session = session;
1507 1461
1508replay_cache: 1462 /* Hold a session reference until done processing the compound:
1509 /* Renew the clientid on success and on replay.
1510 * Hold a session reference until done processing the compound:
1511 * nfsd4_put_session called only if the cstate slot is set. 1463 * nfsd4_put_session called only if the cstate slot is set.
1512 */ 1464 */
1513 renew_client(session->se_client);
1514 nfsd4_get_session(session); 1465 nfsd4_get_session(session);
1515out: 1466out:
1516 spin_unlock(&sessionid_lock); 1467 spin_unlock(&sessionid_lock);
1468 /* Renew the clientid on success and on replay */
1469 if (cstate->session) {
1470 nfs4_lock_state();
1471 renew_client(session->se_client);
1472 nfs4_unlock_state();
1473 }
1517 dprintk("%s: return %d\n", __func__, ntohl(status)); 1474 dprintk("%s: return %d\n", __func__, ntohl(status));
1518 return status; 1475 return status;
1519} 1476}
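
Note the ordering change at the end of nfsd4_sequence(): renew_client() moves below spin_unlock() and gains nfs4_lock_state()/nfs4_unlock_state(), since the state lock is a sleeping lock that must not be taken while the sessionid spinlock is held. The shape of the fix, sketched with pthread primitives as stand-ins:

#include <pthread.h>

static pthread_spinlock_t session_lock;
static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;

static void handle_sequence(int session_found)
{
        pthread_spin_lock(&session_lock);
        /* ... look up slot and session, take references ... */
        pthread_spin_unlock(&session_lock);

        /* Sleeping lock taken only after the spinlock is dropped,
         * mirroring the renew_client() move above. */
        if (session_found) {
                pthread_mutex_lock(&state_lock);
                /* renew the client lease */
                pthread_mutex_unlock(&state_lock);
        }
}
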
@@ -1522,7 +1479,7 @@ __be32
1522nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1479nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1523 struct nfsd4_setclientid *setclid) 1480 struct nfsd4_setclientid *setclid)
1524{ 1481{
1525 struct sockaddr_in *sin = svc_addr_in(rqstp); 1482 struct sockaddr *sa = svc_addr(rqstp);
1526 struct xdr_netobj clname = { 1483 struct xdr_netobj clname = {
1527 .len = setclid->se_namelen, 1484 .len = setclid->se_namelen,
1528 .data = setclid->se_name, 1485 .data = setclid->se_name,
@@ -1531,7 +1488,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1531 unsigned int strhashval; 1488 unsigned int strhashval;
1532 struct nfs4_client *conf, *unconf, *new; 1489 struct nfs4_client *conf, *unconf, *new;
1533 __be32 status; 1490 __be32 status;
1534 char *princ;
1535 char dname[HEXDIR_LEN]; 1491 char dname[HEXDIR_LEN];
1536 1492
1537 if (!check_name(clname)) 1493 if (!check_name(clname))
@@ -1554,8 +1510,11 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1554 /* RFC 3530 14.2.33 CASE 0: */ 1510 /* RFC 3530 14.2.33 CASE 0: */
1555 status = nfserr_clid_inuse; 1511 status = nfserr_clid_inuse;
1556 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) { 1512 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
1557 dprintk("NFSD: setclientid: string in use by client" 1513 char addr_str[INET6_ADDRSTRLEN];
1558 " at %pI4\n", &conf->cl_addr); 1514 rpc_ntop((struct sockaddr *) &conf->cl_addr, addr_str,
1515 sizeof(addr_str));
1516 dprintk("NFSD: setclientid: string in use by client "
1517 "at %s\n", addr_str);
1559 goto out; 1518 goto out;
1560 } 1519 }
1561 } 1520 }
@@ -1573,7 +1532,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1573 */ 1532 */
1574 if (unconf) 1533 if (unconf)
1575 expire_client(unconf); 1534 expire_client(unconf);
1576 new = create_client(clname, dname); 1535 new = create_client(clname, dname, rqstp, &clverifier);
1577 if (new == NULL) 1536 if (new == NULL)
1578 goto out; 1537 goto out;
1579 gen_clid(new); 1538 gen_clid(new);
@@ -1590,7 +1549,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1590 */ 1549 */
1591 expire_client(unconf); 1550 expire_client(unconf);
1592 } 1551 }
1593 new = create_client(clname, dname); 1552 new = create_client(clname, dname, rqstp, &clverifier);
1594 if (new == NULL) 1553 if (new == NULL)
1595 goto out; 1554 goto out;
1596 copy_clid(new, conf); 1555 copy_clid(new, conf);
@@ -1600,7 +1559,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1600 * probable client reboot; state will be removed if 1559 * probable client reboot; state will be removed if
1601 * confirmed. 1560 * confirmed.
1602 */ 1561 */
1603 new = create_client(clname, dname); 1562 new = create_client(clname, dname, rqstp, &clverifier);
1604 if (new == NULL) 1563 if (new == NULL)
1605 goto out; 1564 goto out;
1606 gen_clid(new); 1565 gen_clid(new);
@@ -1611,25 +1570,12 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1611 * confirmed. 1570 * confirmed.
1612 */ 1571 */
1613 expire_client(unconf); 1572 expire_client(unconf);
1614 new = create_client(clname, dname); 1573 new = create_client(clname, dname, rqstp, &clverifier);
1615 if (new == NULL) 1574 if (new == NULL)
1616 goto out; 1575 goto out;
1617 gen_clid(new); 1576 gen_clid(new);
1618 } 1577 }
1619 copy_verf(new, &clverifier); 1578 gen_callback(new, setclid, rpc_get_scope_id(sa));
1620 new->cl_addr = sin->sin_addr.s_addr;
1621 new->cl_flavor = rqstp->rq_flavor;
1622 princ = svc_gss_principal(rqstp);
1623 if (princ) {
1624 new->cl_principal = kstrdup(princ, GFP_KERNEL);
1625 if (new->cl_principal == NULL) {
1626 free_client(new);
1627 goto out;
1628 }
1629 }
1630 copy_cred(&new->cl_cred, &rqstp->rq_cred);
1631 gen_confirm(new);
1632 gen_callback(new, setclid);
1633 add_to_unconfirmed(new, strhashval); 1579 add_to_unconfirmed(new, strhashval);
1634 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; 1580 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
1635 setclid->se_clientid.cl_id = new->cl_clientid.cl_id; 1581 setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
@@ -1651,7 +1597,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1651 struct nfsd4_compound_state *cstate, 1597 struct nfsd4_compound_state *cstate,
1652 struct nfsd4_setclientid_confirm *setclientid_confirm) 1598 struct nfsd4_setclientid_confirm *setclientid_confirm)
1653{ 1599{
1654 struct sockaddr_in *sin = svc_addr_in(rqstp); 1600 struct sockaddr *sa = svc_addr(rqstp);
1655 struct nfs4_client *conf, *unconf; 1601 struct nfs4_client *conf, *unconf;
1656 nfs4_verifier confirm = setclientid_confirm->sc_confirm; 1602 nfs4_verifier confirm = setclientid_confirm->sc_confirm;
1657 clientid_t * clid = &setclientid_confirm->sc_clientid; 1603 clientid_t * clid = &setclientid_confirm->sc_clientid;
@@ -1670,9 +1616,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
1670 unconf = find_unconfirmed_client(clid); 1616 unconf = find_unconfirmed_client(clid);
1671 1617
1672 status = nfserr_clid_inuse; 1618 status = nfserr_clid_inuse;
1673 if (conf && conf->cl_addr != sin->sin_addr.s_addr) 1619 if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa))
1674 goto out; 1620 goto out;
1675 if (unconf && unconf->cl_addr != sin->sin_addr.s_addr) 1621 if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa))
1676 goto out; 1622 goto out;
1677 1623
1678 /* 1624 /*
@@ -2163,7 +2109,7 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
2163 return -EAGAIN; 2109 return -EAGAIN;
2164} 2110}
2165 2111
2166static struct lock_manager_operations nfsd_lease_mng_ops = { 2112static const struct lock_manager_operations nfsd_lease_mng_ops = {
2167 .fl_break = nfsd_break_deleg_cb, 2113 .fl_break = nfsd_break_deleg_cb,
2168 .fl_release_private = nfsd_release_deleg_cb, 2114 .fl_release_private = nfsd_release_deleg_cb,
2169 .fl_copy_lock = nfsd_copy_lock_deleg_cb, 2115 .fl_copy_lock = nfsd_copy_lock_deleg_cb,
@@ -3368,7 +3314,7 @@ nfs4_transform_lock_offset(struct file_lock *lock)
3368 3314
3369/* Hack!: For now, we're defining this just so we can use a pointer to it 3315/* Hack!: For now, we're defining this just so we can use a pointer to it
3370 * as a unique cookie to identify our (NFSv4's) posix locks. */ 3316 * as a unique cookie to identify our (NFSv4's) posix locks. */
3371static struct lock_manager_operations nfsd_posix_mng_ops = { 3317static const struct lock_manager_operations nfsd_posix_mng_ops = {
3372}; 3318};
3373 3319
3374static inline void 3320static inline void
@@ -4072,7 +4018,7 @@ set_max_delegations(void)
4072 4018
4073/* initialization to perform when the nfsd service is started: */ 4019/* initialization to perform when the nfsd service is started: */
4074 4020
4075static void 4021static int
4076__nfs4_state_start(void) 4022__nfs4_state_start(void)
4077{ 4023{
4078 unsigned long grace_time; 4024 unsigned long grace_time;
@@ -4084,19 +4030,26 @@ __nfs4_state_start(void)
4084 printk(KERN_INFO "NFSD: starting %ld-second grace period\n", 4030 printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
4085 grace_time/HZ); 4031 grace_time/HZ);
4086 laundry_wq = create_singlethread_workqueue("nfsd4"); 4032 laundry_wq = create_singlethread_workqueue("nfsd4");
4033 if (laundry_wq == NULL)
4034 return -ENOMEM;
4087 queue_delayed_work(laundry_wq, &laundromat_work, grace_time); 4035 queue_delayed_work(laundry_wq, &laundromat_work, grace_time);
4088 set_max_delegations(); 4036 set_max_delegations();
4037 return set_callback_cred();
4089} 4038}
4090 4039
4091void 4040int
4092nfs4_state_start(void) 4041nfs4_state_start(void)
4093{ 4042{
4043 int ret;
4044
4094 if (nfs4_init) 4045 if (nfs4_init)
4095 return; 4046 return 0;
4096 nfsd4_load_reboot_recovery_data(); 4047 nfsd4_load_reboot_recovery_data();
4097 __nfs4_state_start(); 4048 ret = __nfs4_state_start();
4049 if (ret)
4050 return ret;
4098 nfs4_init = 1; 4051 nfs4_init = 1;
4099 return; 4052 return 0;
4100} 4053}
4101 4054
4102time_t 4055time_t
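
nfs4_state_start() changes from void to int so that a failed workqueue allocation or callback-credential setup aborts nfsd startup; nfsd_svc() in fs/nfsd/nfssvc.c below now checks the result. The changed contract, sketched with illustrative names and -ENOMEM as a literal:

#include <stdlib.h>

static int nfs4_up;

static int state_start_inner(void)
{
        void *wq = malloc(64);          /* the workqueue allocation */
        if (wq == NULL)
                return -12;             /* -ENOMEM, now reported */
        /* ... queue laundromat work, set delegation limits ... */
        return 0;
}

static int state_start(void)
{
        int ret;

        if (nfs4_up)
                return 0;               /* already running: success */
        ret = state_start_inner();
        if (ret)
                return ret;             /* caller can abort startup */
        nfs4_up = 1;
        return 0;
}
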
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2dcc7feaa6ff..0fbd50cee1f6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1599,7 +1599,8 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
1599static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat) 1599static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *stat)
1600{ 1600{
1601 struct svc_fh tmp_fh; 1601 struct svc_fh tmp_fh;
1602 char *path, *rootpath; 1602 char *path = NULL, *rootpath;
1603 size_t rootlen;
1603 1604
1604 fh_init(&tmp_fh, NFS4_FHSIZE); 1605 fh_init(&tmp_fh, NFS4_FHSIZE);
1605 *stat = exp_pseudoroot(rqstp, &tmp_fh); 1606 *stat = exp_pseudoroot(rqstp, &tmp_fh);
@@ -1609,14 +1610,18 @@ static char *nfsd4_path(struct svc_rqst *rqstp, struct svc_export *exp, __be32 *
1609 1610
1610 path = exp->ex_pathname; 1611 path = exp->ex_pathname;
1611 1612
1612 if (strncmp(path, rootpath, strlen(rootpath))) { 1613 rootlen = strlen(rootpath);
1614 if (strncmp(path, rootpath, rootlen)) {
1613 dprintk("nfsd: fs_locations failed;" 1615 dprintk("nfsd: fs_locations failed;"
1614 "%s is not contained in %s\n", path, rootpath); 1616 "%s is not contained in %s\n", path, rootpath);
1615 *stat = nfserr_notsupp; 1617 *stat = nfserr_notsupp;
1616 return NULL; 1618 path = NULL;
1619 goto out;
1617 } 1620 }
1618 1621 path += rootlen;
1619 return path + strlen(rootpath); 1622out:
1623 fh_put(&tmp_fh);
1624 return path;
1620} 1625}
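
Two fixes land in nfsd4_path(): the pseudoroot file handle is now released on every exit (the old early return skipped fh_put), and the root prefix length is computed once. The containment test itself is just a prefix check plus a pointer advance; a standalone sketch:

#include <stdio.h>
#include <string.h>

/* Returns the remainder of path after the root prefix, or NULL if
 * path is not contained in root, as in the fixed nfsd4_path(). */
static const char *subpath(const char *path, const char *root)
{
        size_t rootlen = strlen(root);

        if (strncmp(path, root, rootlen) != 0)
                return NULL;
        return path + rootlen;
}

int main(void)
{
        printf("%s\n", subpath("/exports/data", "/exports")); /* /data */
        return 0;
}
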
1621 1626
1622/* 1627/*
@@ -1793,11 +1798,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1793 goto out_nfserr; 1798 goto out_nfserr;
1794 } 1799 }
1795 } 1800 }
1796 if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
1797 if (exp->ex_fslocs.locations == NULL) {
1798 bmval0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1799 }
1800 }
1801 if ((buflen -= 16) < 0) 1801 if ((buflen -= 16) < 0)
1802 goto out_resource; 1802 goto out_resource;
1803 1803
@@ -1825,8 +1825,6 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1825 goto out_resource; 1825 goto out_resource;
1826 if (!aclsupport) 1826 if (!aclsupport)
1827 word0 &= ~FATTR4_WORD0_ACL; 1827 word0 &= ~FATTR4_WORD0_ACL;
1828 if (!exp->ex_fslocs.locations)
1829 word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1830 if (!word2) { 1828 if (!word2) {
1831 WRITE32(2); 1829 WRITE32(2);
1832 WRITE32(word0); 1830 WRITE32(word0);
@@ -3064,6 +3062,7 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3064 WRITE32(0); 3062 WRITE32(0);
3065 3063
3066 ADJUST_ARGS(); 3064 ADJUST_ARGS();
3065 resp->cstate.datap = p; /* DRC cache data pointer */
3067 return 0; 3066 return 0;
3068} 3067}
3069 3068
@@ -3166,7 +3165,7 @@ static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
3166 return status; 3165 return status;
3167 3166
3168 session = resp->cstate.session; 3167 session = resp->cstate.session;
3169 if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0) 3168 if (session == NULL || slot->sl_cachethis == 0)
3170 return status; 3169 return status;
3171 3170
3172 if (resp->opcnt >= args->opcnt) 3171 if (resp->opcnt >= args->opcnt)
@@ -3291,6 +3290,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
3291 /* 3290 /*
3292 * All that remains is to write the tag and operation count... 3291 * All that remains is to write the tag and operation count...
3293 */ 3292 */
3293 struct nfsd4_compound_state *cs = &resp->cstate;
3294 struct kvec *iov; 3294 struct kvec *iov;
3295 p = resp->tagp; 3295 p = resp->tagp;
3296 *p++ = htonl(resp->taglen); 3296 *p++ = htonl(resp->taglen);
@@ -3304,17 +3304,11 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
3304 iov = &rqstp->rq_res.head[0]; 3304 iov = &rqstp->rq_res.head[0];
3305 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; 3305 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
3306 BUG_ON(iov->iov_len > PAGE_SIZE); 3306 BUG_ON(iov->iov_len > PAGE_SIZE);
3307 if (nfsd4_has_session(&resp->cstate)) { 3307 if (nfsd4_has_session(cs) && cs->status != nfserr_replay_cache) {
3308 if (resp->cstate.status == nfserr_replay_cache && 3308 nfsd4_store_cache_entry(resp);
3309 !nfsd4_not_cached(resp)) { 3309 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
3310 iov->iov_len = resp->cstate.iovlen; 3310 resp->cstate.slot->sl_inuse = false;
3311 } else { 3311 nfsd4_put_session(resp->cstate.session);
3312 nfsd4_store_cache_entry(resp);
3313 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
3314 resp->cstate.slot->sl_inuse = 0;
3315 }
3316 if (resp->cstate.session)
3317 nfsd4_put_session(resp->cstate.session);
3318 } 3312 }
3319 return 1; 3313 return 1;
3320} 3314}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7e906c5b7671..00388d2a3c99 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -174,12 +174,13 @@ static const struct file_operations exports_operations = {
174}; 174};
175 175
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); 176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
177 178
178static struct file_operations pool_stats_operations = { 179static struct file_operations pool_stats_operations = {
179 .open = nfsd_pool_stats_open, 180 .open = nfsd_pool_stats_open,
180 .read = seq_read, 181 .read = seq_read,
181 .llseek = seq_lseek, 182 .llseek = seq_lseek,
182 .release = seq_release, 183 .release = nfsd_pool_stats_release,
183 .owner = THIS_MODULE, 184 .owner = THIS_MODULE,
184}; 185};
185 186
@@ -776,10 +777,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
776 size -= len; 777 size -= len;
777 mesg += len; 778 mesg += len;
778 } 779 }
779 780 rv = mesg - buf;
780 mutex_unlock(&nfsd_mutex);
781 return (mesg-buf);
782
783out_free: 781out_free:
784 kfree(nthreads); 782 kfree(nthreads);
785 mutex_unlock(&nfsd_mutex); 783 mutex_unlock(&nfsd_mutex);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 8847f3fbfc1e..01965b2f3a76 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -397,44 +397,51 @@ static inline void _fh_update_old(struct dentry *dentry,
397 fh->ofh_dirino = 0; 397 fh->ofh_dirino = 0;
398} 398}
399 399
400__be32 400static bool is_root_export(struct svc_export *exp)
401fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
402 struct svc_fh *ref_fh)
403{ 401{
404 /* ref_fh is a reference file handle. 402 return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root;
405 * if it is non-null and for the same filesystem, then we should compose 403}
406 * a filehandle which is of the same version, where possible.
407 * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
408 * Then create a 32byte filehandle using nfs_fhbase_old
409 *
410 */
411 404
412 u8 version; 405static struct super_block *exp_sb(struct svc_export *exp)
413 u8 fsid_type = 0; 406{
414 struct inode * inode = dentry->d_inode; 407 return exp->ex_path.dentry->d_inode->i_sb;
415 struct dentry *parent = dentry->d_parent; 408}
416 __u32 *datap;
417 dev_t ex_dev = exp->ex_path.dentry->d_inode->i_sb->s_dev;
418 int root_export = (exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root);
419 409
420 dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n", 410static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp)
421 MAJOR(ex_dev), MINOR(ex_dev), 411{
422 (long) exp->ex_path.dentry->d_inode->i_ino, 412 switch (fsid_type) {
423 parent->d_name.name, dentry->d_name.name, 413 case FSID_DEV:
424 (inode ? inode->i_ino : 0)); 414 if (!old_valid_dev(exp_sb(exp)->s_dev))
415 return 0;
416 /* FALL THROUGH */
417 case FSID_MAJOR_MINOR:
418 case FSID_ENCODE_DEV:
419 return exp_sb(exp)->s_type->fs_flags & FS_REQUIRES_DEV;
420 case FSID_NUM:
421 return exp->ex_flags & NFSEXP_FSID;
422 case FSID_UUID8:
423 case FSID_UUID16:
424 if (!is_root_export(exp))
425 return 0;
426 /* fall through */
427 case FSID_UUID4_INUM:
428 case FSID_UUID16_INUM:
429 return exp->ex_uuid != NULL;
430 }
431 return 1;
432}
425 433
426 /* Choose filehandle version and fsid type based on 434
427 * the reference filehandle (if it is in the same export) 435static void set_version_and_fsid_type(struct svc_fh *fhp, struct svc_export *exp, struct svc_fh *ref_fh)
428 * or the export options. 436{
429 */ 437 u8 version;
430 retry: 438 u8 fsid_type;
439retry:
431 version = 1; 440 version = 1;
432 if (ref_fh && ref_fh->fh_export == exp) { 441 if (ref_fh && ref_fh->fh_export == exp) {
433 version = ref_fh->fh_handle.fh_version; 442 version = ref_fh->fh_handle.fh_version;
434 fsid_type = ref_fh->fh_handle.fh_fsid_type; 443 fsid_type = ref_fh->fh_handle.fh_fsid_type;
435 444
436 if (ref_fh == fhp)
437 fh_put(ref_fh);
438 ref_fh = NULL; 445 ref_fh = NULL;
439 446
440 switch (version) { 447 switch (version) {
@@ -447,58 +454,66 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
447 goto retry; 454 goto retry;
448 } 455 }
449 456
450 /* Need to check that this type works for this 457 /*
451 * export point. As the fsid -> filesystem mapping 458 * As the fsid -> filesystem mapping was guided by
452 * was guided by user-space, there is no guarantee 459 * user-space, there is no guarantee that the filesystem
453 * that the filesystem actually supports that fsid 460 * actually supports that fsid type. If it doesn't we
454 * type. If it doesn't we loop around again without 461 * loop around again without ref_fh set.
455 * ref_fh set.
456 */ 462 */
457 switch(fsid_type) { 463 if (!fsid_type_ok_for_exp(fsid_type, exp))
458 case FSID_DEV: 464 goto retry;
459 if (!old_valid_dev(ex_dev))
460 goto retry;
461 /* FALL THROUGH */
462 case FSID_MAJOR_MINOR:
463 case FSID_ENCODE_DEV:
464 if (!(exp->ex_path.dentry->d_inode->i_sb->s_type->fs_flags
465 & FS_REQUIRES_DEV))
466 goto retry;
467 break;
468 case FSID_NUM:
469 if (! (exp->ex_flags & NFSEXP_FSID))
470 goto retry;
471 break;
472 case FSID_UUID8:
473 case FSID_UUID16:
474 if (!root_export)
475 goto retry;
476 /* fall through */
477 case FSID_UUID4_INUM:
478 case FSID_UUID16_INUM:
479 if (exp->ex_uuid == NULL)
480 goto retry;
481 break;
482 }
483 } else if (exp->ex_flags & NFSEXP_FSID) { 465 } else if (exp->ex_flags & NFSEXP_FSID) {
484 fsid_type = FSID_NUM; 466 fsid_type = FSID_NUM;
485 } else if (exp->ex_uuid) { 467 } else if (exp->ex_uuid) {
486 if (fhp->fh_maxsize >= 64) { 468 if (fhp->fh_maxsize >= 64) {
487 if (root_export) 469 if (is_root_export(exp))
488 fsid_type = FSID_UUID16; 470 fsid_type = FSID_UUID16;
489 else 471 else
490 fsid_type = FSID_UUID16_INUM; 472 fsid_type = FSID_UUID16_INUM;
491 } else { 473 } else {
492 if (root_export) 474 if (is_root_export(exp))
493 fsid_type = FSID_UUID8; 475 fsid_type = FSID_UUID8;
494 else 476 else
495 fsid_type = FSID_UUID4_INUM; 477 fsid_type = FSID_UUID4_INUM;
496 } 478 }
497 } else if (!old_valid_dev(ex_dev)) 479 } else if (!old_valid_dev(exp_sb(exp)->s_dev))
498 /* for newer device numbers, we must use a newer fsid format */ 480 /* for newer device numbers, we must use a newer fsid format */
499 fsid_type = FSID_ENCODE_DEV; 481 fsid_type = FSID_ENCODE_DEV;
500 else 482 else
501 fsid_type = FSID_DEV; 483 fsid_type = FSID_DEV;
484 fhp->fh_handle.fh_version = version;
485 if (version)
486 fhp->fh_handle.fh_fsid_type = fsid_type;
487}
488
489__be32
490fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
491 struct svc_fh *ref_fh)
492{
493 /* ref_fh is a reference file handle.
494 * if it is non-null and for the same filesystem, then we should compose
495 * a filehandle which is of the same version, where possible.
496 * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
497 * Then create a 32byte filehandle using nfs_fhbase_old
498 *
499 */
500
501 struct inode * inode = dentry->d_inode;
502 struct dentry *parent = dentry->d_parent;
503 __u32 *datap;
504 dev_t ex_dev = exp_sb(exp)->s_dev;
505
506 dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %s/%s, ino=%ld)\n",
507 MAJOR(ex_dev), MINOR(ex_dev),
508 (long) exp->ex_path.dentry->d_inode->i_ino,
509 parent->d_name.name, dentry->d_name.name,
510 (inode ? inode->i_ino : 0));
511
512 /* Choose filehandle version and fsid type based on
513 * the reference filehandle (if it is in the same export)
514 * or the export options.
515 */
516 set_version_and_fsid_type(fhp, exp, ref_fh);
502 517
503 if (ref_fh == fhp) 518 if (ref_fh == fhp)
504 fh_put(ref_fh); 519 fh_put(ref_fh);
@@ -516,7 +531,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
516 fhp->fh_export = exp; 531 fhp->fh_export = exp;
517 cache_get(&exp->h); 532 cache_get(&exp->h);
518 533
519 if (version == 0xca) { 534 if (fhp->fh_handle.fh_version == 0xca) {
520 /* old style filehandle please */ 535 /* old style filehandle please */
521 memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE); 536 memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE);
522 fhp->fh_handle.fh_size = NFS_FHSIZE; 537 fhp->fh_handle.fh_size = NFS_FHSIZE;
@@ -530,22 +545,22 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
530 _fh_update_old(dentry, exp, &fhp->fh_handle); 545 _fh_update_old(dentry, exp, &fhp->fh_handle);
531 } else { 546 } else {
532 int len; 547 int len;
533 fhp->fh_handle.fh_version = 1;
534 fhp->fh_handle.fh_auth_type = 0; 548 fhp->fh_handle.fh_auth_type = 0;
535 datap = fhp->fh_handle.fh_auth+0; 549 datap = fhp->fh_handle.fh_auth+0;
536 fhp->fh_handle.fh_fsid_type = fsid_type; 550 mk_fsid(fhp->fh_handle.fh_fsid_type, datap, ex_dev,
537 mk_fsid(fsid_type, datap, ex_dev,
538 exp->ex_path.dentry->d_inode->i_ino, 551 exp->ex_path.dentry->d_inode->i_ino,
539 exp->ex_fsid, exp->ex_uuid); 552 exp->ex_fsid, exp->ex_uuid);
540 553
541 len = key_len(fsid_type); 554 len = key_len(fhp->fh_handle.fh_fsid_type);
542 datap += len/4; 555 datap += len/4;
543 fhp->fh_handle.fh_size = 4 + len; 556 fhp->fh_handle.fh_size = 4 + len;
544 557
545 if (inode) 558 if (inode)
546 _fh_update(fhp, exp, dentry); 559 _fh_update(fhp, exp, dentry);
547 if (fhp->fh_handle.fh_fileid_type == 255) 560 if (fhp->fh_handle.fh_fileid_type == 255) {
561 fh_put(fhp);
548 return nfserr_opnotsupp; 562 return nfserr_opnotsupp;
563 }
549 } 564 }
550 565
551 return 0; 566 return 0;
@@ -639,8 +654,7 @@ enum fsid_source fsid_source(struct svc_fh *fhp)
639 case FSID_DEV: 654 case FSID_DEV:
640 case FSID_ENCODE_DEV: 655 case FSID_ENCODE_DEV:
641 case FSID_MAJOR_MINOR: 656 case FSID_MAJOR_MINOR:
642 if (fhp->fh_export->ex_path.dentry->d_inode->i_sb->s_type->fs_flags 657 if (exp_sb(fhp->fh_export)->s_type->fs_flags & FS_REQUIRES_DEV)
643 & FS_REQUIRES_DEV)
644 return FSIDSOURCE_DEV; 658 return FSIDSOURCE_DEV;
645 break; 659 break;
646 case FSID_NUM: 660 case FSID_NUM:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 24d58adfe5fd..67ea83eedd43 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -34,6 +34,7 @@
34#include <linux/nfsd/syscall.h> 34#include <linux/nfsd/syscall.h>
35#include <linux/lockd/bind.h> 35#include <linux/lockd/bind.h>
36#include <linux/nfsacl.h> 36#include <linux/nfsacl.h>
37#include <linux/seq_file.h>
37 38
38#define NFSDDBG_FACILITY NFSDDBG_SVC 39#define NFSDDBG_FACILITY NFSDDBG_SVC
39 40
@@ -66,6 +67,16 @@ struct timeval nfssvc_boot;
66DEFINE_MUTEX(nfsd_mutex); 67DEFINE_MUTEX(nfsd_mutex);
67struct svc_serv *nfsd_serv; 68struct svc_serv *nfsd_serv;
68 69
70/*
71 * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used.
72 * nfsd_drc_max_pages limits the total amount of memory available for
73 * version 4.1 DRC caches.
74 * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
75 */
76spinlock_t nfsd_drc_lock;
77unsigned int nfsd_drc_max_mem;
78unsigned int nfsd_drc_mem_used;
79
69#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) 80#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
70static struct svc_stat nfsd_acl_svcstats; 81static struct svc_stat nfsd_acl_svcstats;
71static struct svc_version * nfsd_acl_version[] = { 82static struct svc_version * nfsd_acl_version[] = {
@@ -235,13 +246,12 @@ void nfsd_reset_versions(void)
235 */ 246 */
236static void set_max_drc(void) 247static void set_max_drc(void)
237{ 248{
238 /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */ 249 #define NFSD_DRC_SIZE_SHIFT 10
239 #define NFSD_DRC_SIZE_SHIFT 7 250 nfsd_drc_max_mem = (nr_free_buffer_pages()
240 nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages() 251 >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
241 >> NFSD_DRC_SIZE_SHIFT; 252 nfsd_drc_mem_used = 0;
242 nfsd_serv->sv_drc_pages_used = 0; 253 spin_lock_init(&nfsd_drc_lock);
243 dprintk("%s svc_drc_max_pages %u\n", __func__, 254 dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem);
244 nfsd_serv->sv_drc_max_pages);
245} 255}
246 256
247int nfsd_create_serv(void) 257int nfsd_create_serv(void)
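
set_max_drc() now budgets NFSv4.1 DRC memory in bytes (nfsd_drc_max_mem) instead of pages, and the divisor grows from 2^7 to 2^10, so the cap falls from 1/128 to 1/1024 of free buffer pages. Worked example, assuming 4 KB pages:

#include <stdio.h>

#define NFSD_DRC_SIZE_SHIFT 10          /* cap = free pages / 1024 */

int main(void)
{
        unsigned long free_pages = 1UL << 20;   /* 4 GB worth of 4 KB pages */
        unsigned long page_size = 4096;
        unsigned long drc_max_mem =
                (free_pages >> NFSD_DRC_SIZE_SHIFT) * page_size;

        /* 1048576 pages / 1024 = 1024 pages -> 4194304 bytes (4 MB) */
        printf("%lu bytes\n", drc_max_mem);
        return 0;
}
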
@@ -401,7 +411,9 @@ nfsd_svc(unsigned short port, int nrservs)
401 error = nfsd_racache_init(2*nrservs); 411 error = nfsd_racache_init(2*nrservs);
402 if (error<0) 412 if (error<0)
403 goto out; 413 goto out;
404 nfs4_state_start(); 414 error = nfs4_state_start();
415 if (error)
416 goto out;
405 417
406 nfsd_reset_versions(); 418 nfsd_reset_versions();
407 419
@@ -569,10 +581,6 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
569 + rqstp->rq_res.head[0].iov_len; 581 + rqstp->rq_res.head[0].iov_len;
570 rqstp->rq_res.head[0].iov_len += sizeof(__be32); 582 rqstp->rq_res.head[0].iov_len += sizeof(__be32);
571 583
572 /* NFSv4.1 DRC requires statp */
573 if (rqstp->rq_vers == 4)
574 nfsd4_set_statp(rqstp, statp);
575
576 /* Now call the procedure handler, and encode NFS status. */ 584 /* Now call the procedure handler, and encode NFS status. */
577 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 585 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
578 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 586 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -607,7 +615,25 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
607 615
608int nfsd_pool_stats_open(struct inode *inode, struct file *file) 616int nfsd_pool_stats_open(struct inode *inode, struct file *file)
609{ 617{
610 if (nfsd_serv == NULL) 618 int ret;
619 mutex_lock(&nfsd_mutex);
620 if (nfsd_serv == NULL) {
621 mutex_unlock(&nfsd_mutex);
611 return -ENODEV; 622 return -ENODEV;
612 return svc_pool_stats_open(nfsd_serv, file); 623 }
624 /* bump up the pseudo refcount while traversing */
625 svc_get(nfsd_serv);
626 ret = svc_pool_stats_open(nfsd_serv, file);
627 mutex_unlock(&nfsd_mutex);
628 return ret;
629}
630
631int nfsd_pool_stats_release(struct inode *inode, struct file *file)
632{
633 int ret = seq_release(inode, file);
634 mutex_lock(&nfsd_mutex);
635 /* this function really, really should have been called svc_put() */
636 svc_destroy(nfsd_serv);
637 mutex_unlock(&nfsd_mutex);
638 return ret;
613} 639}
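
The pool-stats fix pairs every open with a matching release: svc_get() bumps the server reference under nfsd_mutex so nfsd_serv cannot be torn down while the seq_file is being read, and nfsd_pool_stats_release() drops it the same way. A self-contained sketch of the pairing, with a plain counter in place of svc_get()/svc_destroy():

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct serv { int refs; } *serv;

static int stats_open(void)
{
        pthread_mutex_lock(&lock);
        if (serv == NULL) {
                pthread_mutex_unlock(&lock);
                return -1;              /* -ENODEV in the patch */
        }
        serv->refs++;                   /* svc_get() */
        pthread_mutex_unlock(&lock);
        return 0;
}

static void stats_release(void)
{
        pthread_mutex_lock(&lock);      /* balanced with stats_open() */
        if (--serv->refs == 0) {        /* svc_destroy() */
                free(serv);
                serv = NULL;
        }
        pthread_mutex_unlock(&lock);
}
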
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 8fa09bfbcba7..a293f0273263 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -89,6 +89,12 @@ struct raparm_hbucket {
89#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) 89#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
90static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; 90static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE];
91 91
92static inline int
93nfsd_v4client(struct svc_rqst *rq)
94{
95 return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
96}
97
92/* 98/*
93 * Called from nfsd_lookup and encode_dirent. Check if we have crossed 99 * Called from nfsd_lookup and encode_dirent. Check if we have crossed
94 * a mount point. 100 * a mount point.
@@ -115,7 +121,8 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
115 path_put(&path); 121 path_put(&path);
116 goto out; 122 goto out;
117 } 123 }
118 if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { 124 if (nfsd_v4client(rqstp) ||
125 (exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
119 /* successfully crossed mount point */ 126 /* successfully crossed mount point */
120 /* 127 /*
121 * This is subtle: path.dentry is *not* on path.mnt 128 * This is subtle: path.dentry is *not* on path.mnt
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index c668bca579c1..6a2711f4c321 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -46,7 +46,7 @@ void nilfs_btnode_cache_init_once(struct address_space *btnc)
46 INIT_LIST_HEAD(&btnc->i_mmap_nonlinear); 46 INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
47} 47}
48 48
49static struct address_space_operations def_btnode_aops = { 49static const struct address_space_operations def_btnode_aops = {
50 .sync_page = block_sync_page, 50 .sync_page = block_sync_page,
51}; 51};
52 52
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 6bd84a0d8238..fc8278c77cdd 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -151,7 +151,7 @@ struct file_operations nilfs_file_operations = {
151 .splice_read = generic_file_splice_read, 151 .splice_read = generic_file_splice_read,
152}; 152};
153 153
154struct inode_operations nilfs_file_inode_operations = { 154const struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate, 155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr, 156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission, 157 .permission = nilfs_permission,
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 1b3c2bb20da9..e6de0a27ab5d 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -52,7 +52,7 @@
52#include "dat.h" 52#include "dat.h"
53#include "ifile.h" 53#include "ifile.h"
54 54
55static struct address_space_operations def_gcinode_aops = { 55static const struct address_space_operations def_gcinode_aops = {
56 .sync_page = block_sync_page, 56 .sync_page = block_sync_page,
57}; 57};
58 58
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 807e584b163d..2d2c501deb54 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -238,7 +238,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
238 return size; 238 return size;
239} 239}
240 240
241struct address_space_operations nilfs_aops = { 241const struct address_space_operations nilfs_aops = {
242 .writepage = nilfs_writepage, 242 .writepage = nilfs_writepage,
243 .readpage = nilfs_readpage, 243 .readpage = nilfs_readpage,
244 .sync_page = block_sync_page, 244 .sync_page = block_sync_page,
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 156bf6091a96..b18c4998f8d0 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -427,12 +427,12 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
427} 427}
428 428
429 429
430static struct address_space_operations def_mdt_aops = { 430static const struct address_space_operations def_mdt_aops = {
431 .writepage = nilfs_mdt_write_page, 431 .writepage = nilfs_mdt_write_page,
432 .sync_page = block_sync_page, 432 .sync_page = block_sync_page,
433}; 433};
434 434
435static struct inode_operations def_mdt_iops; 435static const struct inode_operations def_mdt_iops;
436static struct file_operations def_mdt_fops; 436static struct file_operations def_mdt_fops;
437 437
438/* 438/*
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index df70dadb336f..ed02e886fa79 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -448,7 +448,7 @@ out:
448 return err; 448 return err;
449} 449}
450 450
451struct inode_operations nilfs_dir_inode_operations = { 451const struct inode_operations nilfs_dir_inode_operations = {
452 .create = nilfs_create, 452 .create = nilfs_create,
453 .lookup = nilfs_lookup, 453 .lookup = nilfs_lookup,
454 .link = nilfs_link, 454 .link = nilfs_link,
@@ -462,12 +462,12 @@ struct inode_operations nilfs_dir_inode_operations = {
462 .permission = nilfs_permission, 462 .permission = nilfs_permission,
463}; 463};
464 464
465struct inode_operations nilfs_special_inode_operations = { 465const struct inode_operations nilfs_special_inode_operations = {
466 .setattr = nilfs_setattr, 466 .setattr = nilfs_setattr,
467 .permission = nilfs_permission, 467 .permission = nilfs_permission,
468}; 468};
469 469
470struct inode_operations nilfs_symlink_inode_operations = { 470const struct inode_operations nilfs_symlink_inode_operations = {
471 .readlink = generic_readlink, 471 .readlink = generic_readlink,
472 .follow_link = page_follow_link_light, 472 .follow_link = page_follow_link_light,
473 .put_link = page_put_link, 473 .put_link = page_put_link,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 724c63766e82..bad7368782d0 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -295,12 +295,12 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *);
295 * Inodes and files operations 295 * Inodes and files operations
296 */ 296 */
297extern struct file_operations nilfs_dir_operations; 297extern struct file_operations nilfs_dir_operations;
298extern struct inode_operations nilfs_file_inode_operations; 298extern const struct inode_operations nilfs_file_inode_operations;
299extern struct file_operations nilfs_file_operations; 299extern struct file_operations nilfs_file_operations;
300extern struct address_space_operations nilfs_aops; 300extern const struct address_space_operations nilfs_aops;
301extern struct inode_operations nilfs_dir_inode_operations; 301extern const struct inode_operations nilfs_dir_inode_operations;
302extern struct inode_operations nilfs_special_inode_operations; 302extern const struct inode_operations nilfs_special_inode_operations;
303extern struct inode_operations nilfs_symlink_inode_operations; 303extern const struct inode_operations nilfs_symlink_inode_operations;
304 304
305/* 305/*
306 * filesystem type 306 * filesystem type
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 55f3d6b60732..644e66727dd0 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -504,7 +504,7 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
504 return 0; 504 return 0;
505} 505}
506 506
507static struct super_operations nilfs_sops = { 507static const struct super_operations nilfs_sops = {
508 .alloc_inode = nilfs_alloc_inode, 508 .alloc_inode = nilfs_alloc_inode,
509 .destroy_inode = nilfs_destroy_inode, 509 .destroy_inode = nilfs_destroy_inode,
510 .dirty_inode = nilfs_dirty_inode, 510 .dirty_inode = nilfs_dirty_inode,
@@ -560,7 +560,7 @@ nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
560 nilfs_nfs_get_inode); 560 nilfs_nfs_get_inode);
561} 561}
562 562
563static struct export_operations nilfs_export_ops = { 563static const struct export_operations nilfs_export_ops = {
564 .fh_to_dentry = nilfs_fh_to_dentry, 564 .fh_to_dentry = nilfs_fh_to_dentry,
565 .fh_to_parent = nilfs_fh_to_parent, 565 .fh_to_parent = nilfs_fh_to_parent,
566 .get_parent = nilfs_get_parent, 566 .get_parent = nilfs_get_parent,
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index d4168e269c5d..ad391a8c3e7e 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -591,9 +591,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
591 591
592 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); 592 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
593 593
594 bdi = nilfs->ns_bdev->bd_inode_backing_dev_info; 594 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
595 if (!bdi)
596 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
597 nilfs->ns_bdi = bdi ? : &default_backing_dev_info; 595 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
598 596
599 /* Finding last segment */ 597 /* Finding last segment */
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 4350d4993b18..663c0e341f8b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2146,46 +2146,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2146} 2146}
2147 2147
2148/** 2148/**
2149 * ntfs_file_writev -
2150 *
2151 * Basically the same as generic_file_writev() except that it ends up calling
2152 * ntfs_file_aio_write_nolock() instead of __generic_file_aio_write_nolock().
2153 */
2154static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
2155 unsigned long nr_segs, loff_t *ppos)
2156{
2157 struct address_space *mapping = file->f_mapping;
2158 struct inode *inode = mapping->host;
2159 struct kiocb kiocb;
2160 ssize_t ret;
2161
2162 mutex_lock(&inode->i_mutex);
2163 init_sync_kiocb(&kiocb, file);
2164 ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
2165 if (ret == -EIOCBQUEUED)
2166 ret = wait_on_sync_kiocb(&kiocb);
2167 mutex_unlock(&inode->i_mutex);
2168 if (ret > 0) {
2169 int err = generic_write_sync(file, *ppos - ret, ret);
2170 if (err < 0)
2171 ret = err;
2172 }
2173 return ret;
2174}
2175
2176/**
2177 * ntfs_file_write - simple wrapper for ntfs_file_writev()
2178 */
2179static ssize_t ntfs_file_write(struct file *file, const char __user *buf,
2180 size_t count, loff_t *ppos)
2181{
2182 struct iovec local_iov = { .iov_base = (void __user *)buf,
2183 .iov_len = count };
2184
2185 return ntfs_file_writev(file, &local_iov, 1, ppos);
2186}
2187
2188/**
2189 * ntfs_file_fsync - sync a file to disk 2149 * ntfs_file_fsync - sync a file to disk
2190 * @filp: file to be synced 2150 * @filp: file to be synced
2191 * @dentry: dentry describing the file to sync 2151 * @dentry: dentry describing the file to sync
@@ -2247,7 +2207,7 @@ const struct file_operations ntfs_file_ops = {
2247 .read = do_sync_read, /* Read from file. */ 2207 .read = do_sync_read, /* Read from file. */
2248 .aio_read = generic_file_aio_read, /* Async read from file. */ 2208 .aio_read = generic_file_aio_read, /* Async read from file. */
2249#ifdef NTFS_RW 2209#ifdef NTFS_RW
2250 .write = ntfs_file_write, /* Write to file. */ 2210 .write = do_sync_write, /* Write to file. */
2251 .aio_write = ntfs_file_aio_write, /* Async write to file. */ 2211 .aio_write = ntfs_file_aio_write, /* Async write to file. */
2252 /*.release = ,*/ /* Last file is closed. See 2212 /*.release = ,*/ /* Last file is closed. See
2253 fs/ext2/file.c:: 2213 fs/ext2/file.c::
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 50931b1ce4b9..8b2549f672bf 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -829,7 +829,7 @@ enum {
829 /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the 829 /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
830 F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, 830 F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
831 F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask 831 F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
832 is used to to obtain all flags that are valid for setting. */ 832 is used to obtain all flags that are valid for setting. */
833 /* 833 /*
834 * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all 834 * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all
835 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION 835 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index cd0be3f5c3cd..a44b14cbceeb 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
47 return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); 47 return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
48 /* return (void *)__get_free_page(gfp_mask); */ 48 /* return (void *)__get_free_page(gfp_mask); */
49 } 49 }
50 if (likely(size >> PAGE_SHIFT < num_physpages)) 50 if (likely((size >> PAGE_SHIFT) < totalram_pages))
51 return __vmalloc(size, gfp_mask, PAGE_KERNEL); 51 return __vmalloc(size, gfp_mask, PAGE_KERNEL);
52 return NULL; 52 return NULL;
53} 53}
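
__ntfs_malloc() keeps its two-tier policy (kmalloc for requests up to a page, __vmalloc above that) but now bounds the vmalloc tier with totalram_pages rather than num_physpages, and the added parentheses make the shift explicit. The policy in userspace terms, with malloc standing in for both allocators:

#include <stdlib.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

static void *ntfs_malloc_sketch(size_t size, unsigned long totalram_pages)
{
        if (size <= PAGE_SIZE)
                return malloc(size);            /* kmalloc path */
        if ((size >> PAGE_SHIFT) < totalram_pages)
                return malloc(size);            /* __vmalloc path */
        return NULL;                            /* larger than RAM: refuse */
}
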
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 01596079dd63..31f25ce32c97 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -28,6 +28,7 @@ ocfs2-objs := \
28 locks.o \ 28 locks.o \
29 mmap.o \ 29 mmap.o \
30 namei.o \ 30 namei.o \
31 refcounttree.o \
31 resize.o \ 32 resize.o \
32 slot_map.o \ 33 slot_map.o \
33 suballoc.o \ 34 suballoc.o \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index ab513ddaeff2..38a42f5d59ff 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,10 +49,21 @@
49#include "super.h" 49#include "super.h"
50#include "uptodate.h" 50#include "uptodate.h"
51#include "xattr.h" 51#include "xattr.h"
52#include "refcounttree.h"
52 53
53#include "buffer_head_io.h" 54#include "buffer_head_io.h"
54 55
56enum ocfs2_contig_type {
57 CONTIG_NONE = 0,
58 CONTIG_LEFT,
59 CONTIG_RIGHT,
60 CONTIG_LEFTRIGHT,
61};
55 62
63static enum ocfs2_contig_type
64 ocfs2_extent_rec_contig(struct super_block *sb,
65 struct ocfs2_extent_rec *ext,
66 struct ocfs2_extent_rec *insert_rec);
56/* 67/*
57 * Operations for a specific extent tree type. 68 * Operations for a specific extent tree type.
58 * 69 *
@@ -79,18 +90,30 @@ struct ocfs2_extent_tree_operations {
79 * that value. new_clusters is the delta, and must be 90 * that value. new_clusters is the delta, and must be
80 * added to the total. Required. 91 * added to the total. Required.
81 */ 92 */
82 void (*eo_update_clusters)(struct inode *inode, 93 void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
83 struct ocfs2_extent_tree *et,
84 u32 new_clusters); 94 u32 new_clusters);
85 95
86 /* 96 /*
97 * If this extent tree is supported by an extent map, insert
98 * a record into the map.
99 */
100 void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
101 struct ocfs2_extent_rec *rec);
102
103 /*
104 * If this extent tree is supported by an extent map, truncate the
105 * map to clusters,
106 */
107 void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
108 u32 clusters);
109
110 /*
87 * If ->eo_insert_check() exists, it is called before rec is 111 * If ->eo_insert_check() exists, it is called before rec is
88 * inserted into the extent tree. It is optional. 112 * inserted into the extent tree. It is optional.
89 */ 113 */
90 int (*eo_insert_check)(struct inode *inode, 114 int (*eo_insert_check)(struct ocfs2_extent_tree *et,
91 struct ocfs2_extent_tree *et,
92 struct ocfs2_extent_rec *rec); 115 struct ocfs2_extent_rec *rec);
93 int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et); 116 int (*eo_sanity_check)(struct ocfs2_extent_tree *et);
94 117
95 /* 118 /*
96 * -------------------------------------------------------------- 119 * --------------------------------------------------------------
@@ -109,8 +132,17 @@ struct ocfs2_extent_tree_operations {
109 * it exists. If it does not, et->et_max_leaf_clusters is set 132 * it exists. If it does not, et->et_max_leaf_clusters is set
110 * to 0 (unlimited). Optional. 133 * to 0 (unlimited). Optional.
111 */ 134 */
112 void (*eo_fill_max_leaf_clusters)(struct inode *inode, 135 void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
113 struct ocfs2_extent_tree *et); 136
137 /*
138 * ->eo_extent_contig tests whether two ocfs2_extent_recs
139 * are contiguous. Optional. There is no need to set it if
140 * ocfs2_extent_rec is used as the tree leaf.
141 */
142 enum ocfs2_contig_type
143 (*eo_extent_contig)(struct ocfs2_extent_tree *et,
144 struct ocfs2_extent_rec *ext,
145 struct ocfs2_extent_rec *insert_rec);
114}; 146};
115 147
116 148
@@ -121,19 +153,22 @@ struct ocfs2_extent_tree_operations {
121static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); 153static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
122static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, 154static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
123 u64 blkno); 155 u64 blkno);
124static void ocfs2_dinode_update_clusters(struct inode *inode, 156static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
125 struct ocfs2_extent_tree *et,
126 u32 clusters); 157 u32 clusters);
127static int ocfs2_dinode_insert_check(struct inode *inode, 158static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
128 struct ocfs2_extent_tree *et, 159 struct ocfs2_extent_rec *rec);
160static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
161 u32 clusters);
162static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
129 struct ocfs2_extent_rec *rec); 163 struct ocfs2_extent_rec *rec);
130static int ocfs2_dinode_sanity_check(struct inode *inode, 164static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
131 struct ocfs2_extent_tree *et);
132static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); 165static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
133static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { 166static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
134 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, 167 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
135 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, 168 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
136 .eo_update_clusters = ocfs2_dinode_update_clusters, 169 .eo_update_clusters = ocfs2_dinode_update_clusters,
170 .eo_extent_map_insert = ocfs2_dinode_extent_map_insert,
171 .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate,
137 .eo_insert_check = ocfs2_dinode_insert_check, 172 .eo_insert_check = ocfs2_dinode_insert_check,
138 .eo_sanity_check = ocfs2_dinode_sanity_check, 173 .eo_sanity_check = ocfs2_dinode_sanity_check,
139 .eo_fill_root_el = ocfs2_dinode_fill_root_el, 174 .eo_fill_root_el = ocfs2_dinode_fill_root_el,
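
Only the dinode table gains the two extent-map hooks; the xattr, dx-root and refcount tables elsewhere in this patch leave them NULL, which the new ocfs2_et_extent_map_*() wrappers below treat as a no-op. A hedged usage fragment, assuming a dinode-backed tree already initialized in et:

	/* For a dinode tree this forwards to ocfs2_extent_map_trunc();
	 * for the other tree types the NULL hook makes it a no-op.
	 * Truncating to cluster 0 is assumed here to drop the whole map. */
	ocfs2_et_extent_map_truncate(&et, 0);
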
@@ -156,40 +191,53 @@ static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
156 return le64_to_cpu(di->i_last_eb_blk); 191 return le64_to_cpu(di->i_last_eb_blk);
157} 192}
158 193
159static void ocfs2_dinode_update_clusters(struct inode *inode, 194static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
160 struct ocfs2_extent_tree *et,
161 u32 clusters) 195 u32 clusters)
162{ 196{
197 struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
163 struct ocfs2_dinode *di = et->et_object; 198 struct ocfs2_dinode *di = et->et_object;
164 199
165 le32_add_cpu(&di->i_clusters, clusters); 200 le32_add_cpu(&di->i_clusters, clusters);
166 spin_lock(&OCFS2_I(inode)->ip_lock); 201 spin_lock(&oi->ip_lock);
167 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); 202 oi->ip_clusters = le32_to_cpu(di->i_clusters);
168 spin_unlock(&OCFS2_I(inode)->ip_lock); 203 spin_unlock(&oi->ip_lock);
169} 204}
170 205
171static int ocfs2_dinode_insert_check(struct inode *inode, 206static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
172 struct ocfs2_extent_tree *et, 207 struct ocfs2_extent_rec *rec)
208{
209 struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
210
211 ocfs2_extent_map_insert_rec(inode, rec);
212}
213
214static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
215 u32 clusters)
216{
217 struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
218
219 ocfs2_extent_map_trunc(inode, clusters);
220}
221
222static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
173 struct ocfs2_extent_rec *rec) 223 struct ocfs2_extent_rec *rec)
174{ 224{
175 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 225 struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
226 struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
176 227
177 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); 228 BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
178 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && 229 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
179 (OCFS2_I(inode)->ip_clusters != 230 (oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
180 le32_to_cpu(rec->e_cpos)),
181 "Device %s, asking for sparse allocation: inode %llu, " 231 "Device %s, asking for sparse allocation: inode %llu, "
182 "cpos %u, clusters %u\n", 232 "cpos %u, clusters %u\n",
183 osb->dev_str, 233 osb->dev_str,
184 (unsigned long long)OCFS2_I(inode)->ip_blkno, 234 (unsigned long long)oi->ip_blkno,
185 rec->e_cpos, 235 rec->e_cpos, oi->ip_clusters);
186 OCFS2_I(inode)->ip_clusters);
187 236
188 return 0; 237 return 0;
189} 238}
190 239
191static int ocfs2_dinode_sanity_check(struct inode *inode, 240static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
192 struct ocfs2_extent_tree *et)
193{ 241{
194 struct ocfs2_dinode *di = et->et_object; 242 struct ocfs2_dinode *di = et->et_object;
195 243
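
cache_info_to_inode() itself is not shown in this hunk. The pattern it relies on is that the caching info is embedded in the inode, so container_of() can walk back out; a sketch under that assumption (the member name ip_metadata_cache comes from the metadata-cache rework this series builds on):

	static inline struct ocfs2_inode_info *
	cache_info_to_inode(struct ocfs2_caching_info *ci)
	{
		/* ci lives inside ocfs2_inode_info, so recover the
		 * containing inode from the embedded member */
		return container_of(ci, struct ocfs2_inode_info,
				    ip_metadata_cache);
	}
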
@@ -229,8 +277,7 @@ static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
229 return le64_to_cpu(vb->vb_xv->xr_last_eb_blk); 277 return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
230} 278}
231 279
232static void ocfs2_xattr_value_update_clusters(struct inode *inode, 280static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
233 struct ocfs2_extent_tree *et,
234 u32 clusters) 281 u32 clusters)
235{ 282{
236 struct ocfs2_xattr_value_buf *vb = et->et_object; 283 struct ocfs2_xattr_value_buf *vb = et->et_object;
@@ -252,12 +299,11 @@ static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
252 et->et_root_el = &xb->xb_attrs.xb_root.xt_list; 299 et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
253} 300}
254 301
255static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode, 302static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
256 struct ocfs2_extent_tree *et)
257{ 303{
304 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
258 et->et_max_leaf_clusters = 305 et->et_max_leaf_clusters =
259 ocfs2_clusters_for_bytes(inode->i_sb, 306 ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
260 OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
261} 307}
262 308
263static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, 309static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
@@ -277,8 +323,7 @@ static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
277 return le64_to_cpu(xt->xt_last_eb_blk); 323 return le64_to_cpu(xt->xt_last_eb_blk);
278} 324}
279 325
280static void ocfs2_xattr_tree_update_clusters(struct inode *inode, 326static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
281 struct ocfs2_extent_tree *et,
282 u32 clusters) 327 u32 clusters)
283{ 328{
284 struct ocfs2_xattr_block *xb = et->et_object; 329 struct ocfs2_xattr_block *xb = et->et_object;
@@ -309,8 +354,7 @@ static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
309 return le64_to_cpu(dx_root->dr_last_eb_blk); 354 return le64_to_cpu(dx_root->dr_last_eb_blk);
310} 355}
311 356
312static void ocfs2_dx_root_update_clusters(struct inode *inode, 357static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
313 struct ocfs2_extent_tree *et,
314 u32 clusters) 358 u32 clusters)
315{ 359{
316 struct ocfs2_dx_root_block *dx_root = et->et_object; 360 struct ocfs2_dx_root_block *dx_root = et->et_object;
@@ -318,8 +362,7 @@ static void ocfs2_dx_root_update_clusters(struct inode *inode,
318 le32_add_cpu(&dx_root->dr_clusters, clusters); 362 le32_add_cpu(&dx_root->dr_clusters, clusters);
319} 363}
320 364
321static int ocfs2_dx_root_sanity_check(struct inode *inode, 365static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
322 struct ocfs2_extent_tree *et)
323{ 366{
324 struct ocfs2_dx_root_block *dx_root = et->et_object; 367 struct ocfs2_dx_root_block *dx_root = et->et_object;
325 368
@@ -343,8 +386,54 @@ static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
343 .eo_fill_root_el = ocfs2_dx_root_fill_root_el, 386 .eo_fill_root_el = ocfs2_dx_root_fill_root_el,
344}; 387};
345 388
389static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
390{
391 struct ocfs2_refcount_block *rb = et->et_object;
392
393 et->et_root_el = &rb->rf_list;
394}
395
396static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
397 u64 blkno)
398{
399 struct ocfs2_refcount_block *rb = et->et_object;
400
401 rb->rf_last_eb_blk = cpu_to_le64(blkno);
402}
403
404static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
405{
406 struct ocfs2_refcount_block *rb = et->et_object;
407
408 return le64_to_cpu(rb->rf_last_eb_blk);
409}
410
411static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
412 u32 clusters)
413{
414 struct ocfs2_refcount_block *rb = et->et_object;
415
416 le32_add_cpu(&rb->rf_clusters, clusters);
417}
418
419static enum ocfs2_contig_type
420ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
421 struct ocfs2_extent_rec *ext,
422 struct ocfs2_extent_rec *insert_rec)
423{
424 return CONTIG_NONE;
425}
426
427static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
428 .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk,
429 .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk,
430 .eo_update_clusters = ocfs2_refcount_tree_update_clusters,
431 .eo_fill_root_el = ocfs2_refcount_tree_fill_root_el,
432 .eo_extent_contig = ocfs2_refcount_tree_extent_contig,
433};
434
346static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, 435static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
347 struct inode *inode, 436 struct ocfs2_caching_info *ci,
348 struct buffer_head *bh, 437 struct buffer_head *bh,
349 ocfs2_journal_access_func access, 438 ocfs2_journal_access_func access,
350 void *obj, 439 void *obj,
@@ -352,6 +441,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
352{ 441{
353 et->et_ops = ops; 442 et->et_ops = ops;
354 et->et_root_bh = bh; 443 et->et_root_bh = bh;
444 et->et_ci = ci;
355 et->et_root_journal_access = access; 445 et->et_root_journal_access = access;
356 if (!obj) 446 if (!obj)
357 obj = (void *)bh->b_data; 447 obj = (void *)bh->b_data;
@@ -361,41 +451,49 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
361 if (!et->et_ops->eo_fill_max_leaf_clusters) 451 if (!et->et_ops->eo_fill_max_leaf_clusters)
362 et->et_max_leaf_clusters = 0; 452 et->et_max_leaf_clusters = 0;
363 else 453 else
364 et->et_ops->eo_fill_max_leaf_clusters(inode, et); 454 et->et_ops->eo_fill_max_leaf_clusters(et);
365} 455}
366 456
367void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, 457void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
368 struct inode *inode, 458 struct ocfs2_caching_info *ci,
369 struct buffer_head *bh) 459 struct buffer_head *bh)
370{ 460{
371 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di, 461 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
372 NULL, &ocfs2_dinode_et_ops); 462 NULL, &ocfs2_dinode_et_ops);
373} 463}
374 464
375void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 465void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
376 struct inode *inode, 466 struct ocfs2_caching_info *ci,
377 struct buffer_head *bh) 467 struct buffer_head *bh)
378{ 468{
379 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb, 469 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
380 NULL, &ocfs2_xattr_tree_et_ops); 470 NULL, &ocfs2_xattr_tree_et_ops);
381} 471}
382 472
383void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 473void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
384 struct inode *inode, 474 struct ocfs2_caching_info *ci,
385 struct ocfs2_xattr_value_buf *vb) 475 struct ocfs2_xattr_value_buf *vb)
386{ 476{
387 __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb, 477 __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
388 &ocfs2_xattr_value_et_ops); 478 &ocfs2_xattr_value_et_ops);
389} 479}
390 480
391void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, 481void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
392 struct inode *inode, 482 struct ocfs2_caching_info *ci,
393 struct buffer_head *bh) 483 struct buffer_head *bh)
394{ 484{
395 __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr, 485 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
396 NULL, &ocfs2_dx_root_et_ops); 486 NULL, &ocfs2_dx_root_et_ops);
397} 487}
398 488
489void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
490 struct ocfs2_caching_info *ci,
491 struct buffer_head *bh)
492{
493 __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
494 NULL, &ocfs2_refcount_tree_et_ops);
495}
496
399static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, 497static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
400 u64 new_last_eb_blk) 498 u64 new_last_eb_blk)
401{ 499{
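
Callers now hand each constructor the tree's caching info instead of a struct inode. A caller sketch, assuming the INODE_CACHE() accessor from the same series maps an inode to its embedded caching info:

	struct ocfs2_extent_tree et;

	/* di_bh is the buffer_head holding the inode's dinode block */
	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
	/* et now carries everything the tree code needs; the inode
	 * itself no longer travels through the extent-tree API */
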
@@ -407,78 +505,71 @@ static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
407 return et->et_ops->eo_get_last_eb_blk(et); 505 return et->et_ops->eo_get_last_eb_blk(et);
408} 506}
409 507
410static inline void ocfs2_et_update_clusters(struct inode *inode, 508static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
411 struct ocfs2_extent_tree *et,
412 u32 clusters) 509 u32 clusters)
413{ 510{
414 et->et_ops->eo_update_clusters(inode, et, clusters); 511 et->et_ops->eo_update_clusters(et, clusters);
512}
513
514static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
515 struct ocfs2_extent_rec *rec)
516{
517 if (et->et_ops->eo_extent_map_insert)
518 et->et_ops->eo_extent_map_insert(et, rec);
519}
520
521static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
522 u32 clusters)
523{
524 if (et->et_ops->eo_extent_map_truncate)
525 et->et_ops->eo_extent_map_truncate(et, clusters);
415} 526}
416 527
417static inline int ocfs2_et_root_journal_access(handle_t *handle, 528static inline int ocfs2_et_root_journal_access(handle_t *handle,
418 struct inode *inode,
419 struct ocfs2_extent_tree *et, 529 struct ocfs2_extent_tree *et,
420 int type) 530 int type)
421{ 531{
422 return et->et_root_journal_access(handle, inode, et->et_root_bh, 532 return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
423 type); 533 type);
424} 534}
425 535
426static inline int ocfs2_et_insert_check(struct inode *inode, 536static inline enum ocfs2_contig_type
427 struct ocfs2_extent_tree *et, 537 ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
538 struct ocfs2_extent_rec *rec,
539 struct ocfs2_extent_rec *insert_rec)
540{
541 if (et->et_ops->eo_extent_contig)
542 return et->et_ops->eo_extent_contig(et, rec, insert_rec);
543
544 return ocfs2_extent_rec_contig(
545 ocfs2_metadata_cache_get_super(et->et_ci),
546 rec, insert_rec);
547}
548
549static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
428 struct ocfs2_extent_rec *rec) 550 struct ocfs2_extent_rec *rec)
429{ 551{
430 int ret = 0; 552 int ret = 0;
431 553
432 if (et->et_ops->eo_insert_check) 554 if (et->et_ops->eo_insert_check)
433 ret = et->et_ops->eo_insert_check(inode, et, rec); 555 ret = et->et_ops->eo_insert_check(et, rec);
434 return ret; 556 return ret;
435} 557}
436 558
437static inline int ocfs2_et_sanity_check(struct inode *inode, 559static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
438 struct ocfs2_extent_tree *et)
439{ 560{
440 int ret = 0; 561 int ret = 0;
441 562
442 if (et->et_ops->eo_sanity_check) 563 if (et->et_ops->eo_sanity_check)
443 ret = et->et_ops->eo_sanity_check(inode, et); 564 ret = et->et_ops->eo_sanity_check(et);
444 return ret; 565 return ret;
445} 566}
446 567
447static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 568static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
448static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 569static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
449 struct ocfs2_extent_block *eb); 570 struct ocfs2_extent_block *eb);
450 571static void ocfs2_adjust_rightmost_records(handle_t *handle,
451/* 572 struct ocfs2_extent_tree *et,
452 * Structures which describe a path through a btree, and functions to
453 * manipulate them.
454 *
455 * The idea here is to be as generic as possible with the tree
456 * manipulation code.
457 */
458struct ocfs2_path_item {
459 struct buffer_head *bh;
460 struct ocfs2_extent_list *el;
461};
462
463#define OCFS2_MAX_PATH_DEPTH 5
464
465struct ocfs2_path {
466 int p_tree_depth;
467 ocfs2_journal_access_func p_root_access;
468 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
469};
470
471#define path_root_bh(_path) ((_path)->p_node[0].bh)
472#define path_root_el(_path) ((_path)->p_node[0].el)
473#define path_root_access(_path)((_path)->p_root_access)
474#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
475#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
476#define path_num_items(_path) ((_path)->p_tree_depth + 1)
477
478static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
479 u32 cpos);
480static void ocfs2_adjust_rightmost_records(struct inode *inode,
481 handle_t *handle,
482 struct ocfs2_path *path, 573 struct ocfs2_path *path,
483 struct ocfs2_extent_rec *insert_rec); 574 struct ocfs2_extent_rec *insert_rec);
484/* 575/*
@@ -486,7 +577,7 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode,
486 * to build another path. Generally, this involves freeing the buffer 577 * to build another path. Generally, this involves freeing the buffer
487 * heads. 578 * heads.
488 */ 579 */
489static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) 580void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
490{ 581{
491 int i, start = 0, depth = 0; 582 int i, start = 0, depth = 0;
492 struct ocfs2_path_item *node; 583 struct ocfs2_path_item *node;
@@ -515,7 +606,7 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
515 path->p_tree_depth = depth; 606 path->p_tree_depth = depth;
516} 607}
517 608
518static void ocfs2_free_path(struct ocfs2_path *path) 609void ocfs2_free_path(struct ocfs2_path *path)
519{ 610{
520 if (path) { 611 if (path) {
521 ocfs2_reinit_path(path, 0); 612 ocfs2_reinit_path(path, 0);
@@ -613,13 +704,13 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
613 return path; 704 return path;
614} 705}
615 706
616static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) 707struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
617{ 708{
618 return ocfs2_new_path(path_root_bh(path), path_root_el(path), 709 return ocfs2_new_path(path_root_bh(path), path_root_el(path),
619 path_root_access(path)); 710 path_root_access(path));
620} 711}
621 712
622static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) 713struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
623{ 714{
624 return ocfs2_new_path(et->et_root_bh, et->et_root_el, 715 return ocfs2_new_path(et->et_root_bh, et->et_root_el,
625 et->et_root_journal_access); 716 et->et_root_journal_access);
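
ocfs2_reinit_path(), ocfs2_free_path() and the two path constructors above drop their static so that code outside alloc.c (presumably the new refcounttree.c) can build and walk paths. A sketch of the matching header block, using only signatures visible in this patch:

	void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root);
	void ocfs2_free_path(struct ocfs2_path *path);
	struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path);
	struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et);
	int ocfs2_find_path(struct ocfs2_caching_info *ci,
			    struct ocfs2_path *path, u32 cpos);
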
@@ -632,10 +723,10 @@ static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
632 * I don't like the way this function's name looks next to 723 * I don't like the way this function's name looks next to
633 * ocfs2_journal_access_path(), but I don't have a better one. 724 * ocfs2_journal_access_path(), but I don't have a better one.
634 */ 725 */
635static int ocfs2_path_bh_journal_access(handle_t *handle, 726int ocfs2_path_bh_journal_access(handle_t *handle,
636 struct inode *inode, 727 struct ocfs2_caching_info *ci,
637 struct ocfs2_path *path, 728 struct ocfs2_path *path,
638 int idx) 729 int idx)
639{ 730{
640 ocfs2_journal_access_func access = path_root_access(path); 731 ocfs2_journal_access_func access = path_root_access(path);
641 732
@@ -645,15 +736,16 @@ static int ocfs2_path_bh_journal_access(handle_t *handle,
645 if (idx) 736 if (idx)
646 access = ocfs2_journal_access_eb; 737 access = ocfs2_journal_access_eb;
647 738
648 return access(handle, inode, path->p_node[idx].bh, 739 return access(handle, ci, path->p_node[idx].bh,
649 OCFS2_JOURNAL_ACCESS_WRITE); 740 OCFS2_JOURNAL_ACCESS_WRITE);
650} 741}
651 742
652/* 743/*
653 * Convenience function to journal all components in a path. 744 * Convenience function to journal all components in a path.
654 */ 745 */
655static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, 746int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
656 struct ocfs2_path *path) 747 handle_t *handle,
748 struct ocfs2_path *path)
657{ 749{
658 int i, ret = 0; 750 int i, ret = 0;
659 751
@@ -661,7 +753,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
661 goto out; 753 goto out;
662 754
663 for(i = 0; i < path_num_items(path); i++) { 755 for(i = 0; i < path_num_items(path); i++) {
664 ret = ocfs2_path_bh_journal_access(handle, inode, path, i); 756 ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
665 if (ret < 0) { 757 if (ret < 0) {
666 mlog_errno(ret); 758 mlog_errno(ret);
667 goto out; 759 goto out;
@@ -702,17 +794,9 @@ int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
702 return ret; 794 return ret;
703} 795}
704 796
705enum ocfs2_contig_type {
706 CONTIG_NONE = 0,
707 CONTIG_LEFT,
708 CONTIG_RIGHT,
709 CONTIG_LEFTRIGHT,
710};
711
712
713/* 797/*
714 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and 798 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
715 * ocfs2_extent_contig only work properly against leaf nodes! 799 * ocfs2_extent_rec_contig only work properly against leaf nodes!
716 */ 800 */
717static int ocfs2_block_extent_contig(struct super_block *sb, 801static int ocfs2_block_extent_contig(struct super_block *sb,
718 struct ocfs2_extent_rec *ext, 802 struct ocfs2_extent_rec *ext,
@@ -738,9 +822,9 @@ static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
738} 822}
739 823
740static enum ocfs2_contig_type 824static enum ocfs2_contig_type
741 ocfs2_extent_contig(struct inode *inode, 825 ocfs2_extent_rec_contig(struct super_block *sb,
742 struct ocfs2_extent_rec *ext, 826 struct ocfs2_extent_rec *ext,
743 struct ocfs2_extent_rec *insert_rec) 827 struct ocfs2_extent_rec *insert_rec)
744{ 828{
745 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 829 u64 blkno = le64_to_cpu(insert_rec->e_blkno);
746 830
@@ -753,12 +837,12 @@ static enum ocfs2_contig_type
753 return CONTIG_NONE; 837 return CONTIG_NONE;
754 838
755 if (ocfs2_extents_adjacent(ext, insert_rec) && 839 if (ocfs2_extents_adjacent(ext, insert_rec) &&
756 ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 840 ocfs2_block_extent_contig(sb, ext, blkno))
757 return CONTIG_RIGHT; 841 return CONTIG_RIGHT;
758 842
759 blkno = le64_to_cpu(ext->e_blkno); 843 blkno = le64_to_cpu(ext->e_blkno);
760 if (ocfs2_extents_adjacent(insert_rec, ext) && 844 if (ocfs2_extents_adjacent(insert_rec, ext) &&
761 ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) 845 ocfs2_block_extent_contig(sb, insert_rec, blkno))
762 return CONTIG_LEFT; 846 return CONTIG_LEFT;
763 847
764 return CONTIG_NONE; 848 return CONTIG_NONE;
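
A worked pass through ocfs2_extent_rec_contig(), assuming one block per cluster for simplicity:

	/* ext:        e_cpos = 100, e_leaf_clusters = 8, e_blkno = 5000
	 * insert_rec: e_cpos = 108,                      e_blkno = 5008
	 * ocfs2_extents_adjacent(ext, insert_rec): 100 + 8 == 108, so the
	 * records touch logically; ocfs2_block_extent_contig() then checks
	 * that block 5008 is where ext's blocks end.  Both hold, so the
	 * result is CONTIG_RIGHT: insert_rec merges after ext. */
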
@@ -853,13 +937,13 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
853 return 0; 937 return 0;
854} 938}
855 939
856int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, 940int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
857 struct buffer_head **bh) 941 struct buffer_head **bh)
858{ 942{
859 int rc; 943 int rc;
860 struct buffer_head *tmp = *bh; 944 struct buffer_head *tmp = *bh;
861 945
862 rc = ocfs2_read_block(inode, eb_blkno, &tmp, 946 rc = ocfs2_read_block(ci, eb_blkno, &tmp,
863 ocfs2_validate_extent_block); 947 ocfs2_validate_extent_block);
864 948
865 /* If ocfs2_read_block() got us a new bh, pass it up. */ 949 /* If ocfs2_read_block() got us a new bh, pass it up. */
@@ -874,7 +958,6 @@ int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
874 * How many free extents have we got before we need more meta data? 958 * How many free extents have we got before we need more meta data?
875 */ 959 */
876int ocfs2_num_free_extents(struct ocfs2_super *osb, 960int ocfs2_num_free_extents(struct ocfs2_super *osb,
877 struct inode *inode,
878 struct ocfs2_extent_tree *et) 961 struct ocfs2_extent_tree *et)
879{ 962{
880 int retval; 963 int retval;
@@ -889,7 +972,8 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
889 last_eb_blk = ocfs2_et_get_last_eb_blk(et); 972 last_eb_blk = ocfs2_et_get_last_eb_blk(et);
890 973
891 if (last_eb_blk) { 974 if (last_eb_blk) {
892 retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); 975 retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
976 &eb_bh);
893 if (retval < 0) { 977 if (retval < 0) {
894 mlog_errno(retval); 978 mlog_errno(retval);
895 goto bail; 979 goto bail;
@@ -913,9 +997,8 @@ bail:
913 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and 997 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
914 * l_count for you 998 * l_count for you
915 */ 999 */
916static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, 1000static int ocfs2_create_new_meta_bhs(handle_t *handle,
917 handle_t *handle, 1001 struct ocfs2_extent_tree *et,
918 struct inode *inode,
919 int wanted, 1002 int wanted,
920 struct ocfs2_alloc_context *meta_ac, 1003 struct ocfs2_alloc_context *meta_ac,
921 struct buffer_head *bhs[]) 1004 struct buffer_head *bhs[])
@@ -924,6 +1007,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
924 u16 suballoc_bit_start; 1007 u16 suballoc_bit_start;
925 u32 num_got; 1008 u32 num_got;
926 u64 first_blkno; 1009 u64 first_blkno;
1010 struct ocfs2_super *osb =
1011 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
927 struct ocfs2_extent_block *eb; 1012 struct ocfs2_extent_block *eb;
928 1013
929 mlog_entry_void(); 1014 mlog_entry_void();
@@ -949,9 +1034,10 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
949 mlog_errno(status); 1034 mlog_errno(status);
950 goto bail; 1035 goto bail;
951 } 1036 }
952 ocfs2_set_new_buffer_uptodate(inode, bhs[i]); 1037 ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
953 1038
954 status = ocfs2_journal_access_eb(handle, inode, bhs[i], 1039 status = ocfs2_journal_access_eb(handle, et->et_ci,
1040 bhs[i],
955 OCFS2_JOURNAL_ACCESS_CREATE); 1041 OCFS2_JOURNAL_ACCESS_CREATE);
956 if (status < 0) { 1042 if (status < 0) {
957 mlog_errno(status); 1043 mlog_errno(status);
@@ -1023,7 +1109,6 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
1023 * extent block's rightmost record. 1109 * extent block's rightmost record.
1024 */ 1110 */
1025static int ocfs2_adjust_rightmost_branch(handle_t *handle, 1111static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1026 struct inode *inode,
1027 struct ocfs2_extent_tree *et) 1112 struct ocfs2_extent_tree *et)
1028{ 1113{
1029 int status; 1114 int status;
@@ -1037,7 +1122,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1037 return status; 1122 return status;
1038 } 1123 }
1039 1124
1040 status = ocfs2_find_path(inode, path, UINT_MAX); 1125 status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
1041 if (status < 0) { 1126 if (status < 0) {
1042 mlog_errno(status); 1127 mlog_errno(status);
1043 goto out; 1128 goto out;
@@ -1050,7 +1135,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1050 goto out; 1135 goto out;
1051 } 1136 }
1052 1137
1053 status = ocfs2_journal_access_path(inode, handle, path); 1138 status = ocfs2_journal_access_path(et->et_ci, handle, path);
1054 if (status < 0) { 1139 if (status < 0) {
1055 mlog_errno(status); 1140 mlog_errno(status);
1056 goto out; 1141 goto out;
@@ -1059,7 +1144,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1059 el = path_leaf_el(path); 1144 el = path_leaf_el(path);
1060 rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1]; 1145 rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
1061 1146
1062 ocfs2_adjust_rightmost_records(inode, handle, path, rec); 1147 ocfs2_adjust_rightmost_records(handle, et, path, rec);
1063 1148
1064out: 1149out:
1065 ocfs2_free_path(path); 1150 ocfs2_free_path(path);
@@ -1068,7 +1153,7 @@ out:
1068 1153
1069/* 1154/*
1070 * Add an entire tree branch to our inode. eb_bh is the extent block 1155 * Add an entire tree branch to our inode. eb_bh is the extent block
1071 * to start at, if we don't want to start the branch at the dinode 1156 * to start at, if we don't want to start the branch at the root
1072 * structure. 1157 * structure.
1073 * 1158 *
1074 * last_eb_bh is required as we have to update its next_leaf pointer 1159 * last_eb_bh is required as we have to update its next_leaf pointer
@@ -1077,9 +1162,7 @@ out:
1077 * the new branch will be 'empty' in the sense that every block will 1162 * the new branch will be 'empty' in the sense that every block will
1078 * contain a single record with cluster count == 0. 1163 * contain a single record with cluster count == 0.
1079 */ 1164 */
1080static int ocfs2_add_branch(struct ocfs2_super *osb, 1165static int ocfs2_add_branch(handle_t *handle,
1081 handle_t *handle,
1082 struct inode *inode,
1083 struct ocfs2_extent_tree *et, 1166 struct ocfs2_extent_tree *et,
1084 struct buffer_head *eb_bh, 1167 struct buffer_head *eb_bh,
1085 struct buffer_head **last_eb_bh, 1168 struct buffer_head **last_eb_bh,
@@ -1123,7 +1206,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
1123 if (root_end > new_cpos) { 1206 if (root_end > new_cpos) {
1124 mlog(0, "adjust the cluster end from %u to %u\n", 1207 mlog(0, "adjust the cluster end from %u to %u\n",
1125 root_end, new_cpos); 1208 root_end, new_cpos);
1126 status = ocfs2_adjust_rightmost_branch(handle, inode, et); 1209 status = ocfs2_adjust_rightmost_branch(handle, et);
1127 if (status) { 1210 if (status) {
1128 mlog_errno(status); 1211 mlog_errno(status);
1129 goto bail; 1212 goto bail;
@@ -1139,7 +1222,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
1139 goto bail; 1222 goto bail;
1140 } 1223 }
1141 1224
1142 status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, 1225 status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
1143 meta_ac, new_eb_bhs); 1226 meta_ac, new_eb_bhs);
1144 if (status < 0) { 1227 if (status < 0) {
1145 mlog_errno(status); 1228 mlog_errno(status);
@@ -1161,7 +1244,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
1161 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); 1244 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1162 eb_el = &eb->h_list; 1245 eb_el = &eb->h_list;
1163 1246
1164 status = ocfs2_journal_access_eb(handle, inode, bh, 1247 status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
1165 OCFS2_JOURNAL_ACCESS_CREATE); 1248 OCFS2_JOURNAL_ACCESS_CREATE);
1166 if (status < 0) { 1249 if (status < 0) {
1167 mlog_errno(status); 1250 mlog_errno(status);
@@ -1201,20 +1284,20 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
1201 * journal_dirty erroring as it won't unless we've aborted the 1284 * journal_dirty erroring as it won't unless we've aborted the
1202 * handle (in which case we would never be here) so reserving 1285 * handle (in which case we would never be here) so reserving
1203 * the write with journal_access is all we need to do. */ 1286 * the write with journal_access is all we need to do. */
1204 status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh, 1287 status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
1205 OCFS2_JOURNAL_ACCESS_WRITE); 1288 OCFS2_JOURNAL_ACCESS_WRITE);
1206 if (status < 0) { 1289 if (status < 0) {
1207 mlog_errno(status); 1290 mlog_errno(status);
1208 goto bail; 1291 goto bail;
1209 } 1292 }
1210 status = ocfs2_et_root_journal_access(handle, inode, et, 1293 status = ocfs2_et_root_journal_access(handle, et,
1211 OCFS2_JOURNAL_ACCESS_WRITE); 1294 OCFS2_JOURNAL_ACCESS_WRITE);
1212 if (status < 0) { 1295 if (status < 0) {
1213 mlog_errno(status); 1296 mlog_errno(status);
1214 goto bail; 1297 goto bail;
1215 } 1298 }
1216 if (eb_bh) { 1299 if (eb_bh) {
1217 status = ocfs2_journal_access_eb(handle, inode, eb_bh, 1300 status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
1218 OCFS2_JOURNAL_ACCESS_WRITE); 1301 OCFS2_JOURNAL_ACCESS_WRITE);
1219 if (status < 0) { 1302 if (status < 0) {
1220 mlog_errno(status); 1303 mlog_errno(status);
@@ -1274,9 +1357,7 @@ bail:
1274 * returns back the new extent block so you can add a branch to it 1357 * returns back the new extent block so you can add a branch to it
1275 * after this call. 1358 * after this call.
1276 */ 1359 */
1277static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 1360static int ocfs2_shift_tree_depth(handle_t *handle,
1278 handle_t *handle,
1279 struct inode *inode,
1280 struct ocfs2_extent_tree *et, 1361 struct ocfs2_extent_tree *et,
1281 struct ocfs2_alloc_context *meta_ac, 1362 struct ocfs2_alloc_context *meta_ac,
1282 struct buffer_head **ret_new_eb_bh) 1363 struct buffer_head **ret_new_eb_bh)
@@ -1290,7 +1371,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1290 1371
1291 mlog_entry_void(); 1372 mlog_entry_void();
1292 1373
1293 status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, 1374 status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
1294 &new_eb_bh); 1375 &new_eb_bh);
1295 if (status < 0) { 1376 if (status < 0) {
1296 mlog_errno(status); 1377 mlog_errno(status);
@@ -1304,7 +1385,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1304 eb_el = &eb->h_list; 1385 eb_el = &eb->h_list;
1305 root_el = et->et_root_el; 1386 root_el = et->et_root_el;
1306 1387
1307 status = ocfs2_journal_access_eb(handle, inode, new_eb_bh, 1388 status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
1308 OCFS2_JOURNAL_ACCESS_CREATE); 1389 OCFS2_JOURNAL_ACCESS_CREATE);
1309 if (status < 0) { 1390 if (status < 0) {
1310 mlog_errno(status); 1391 mlog_errno(status);
@@ -1323,7 +1404,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1323 goto bail; 1404 goto bail;
1324 } 1405 }
1325 1406
1326 status = ocfs2_et_root_journal_access(handle, inode, et, 1407 status = ocfs2_et_root_journal_access(handle, et,
1327 OCFS2_JOURNAL_ACCESS_WRITE); 1408 OCFS2_JOURNAL_ACCESS_WRITE);
1328 if (status < 0) { 1409 if (status < 0) {
1329 mlog_errno(status); 1410 mlog_errno(status);
@@ -1379,9 +1460,7 @@ bail:
1379 * 1460 *
1380 * return status < 0 indicates an error. 1461 * return status < 0 indicates an error.
1381 */ 1462 */
1382static int ocfs2_find_branch_target(struct ocfs2_super *osb, 1463static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1383 struct inode *inode,
1384 struct ocfs2_extent_tree *et,
1385 struct buffer_head **target_bh) 1464 struct buffer_head **target_bh)
1386{ 1465{
1387 int status = 0, i; 1466 int status = 0, i;
@@ -1399,19 +1478,21 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1399 1478
1400 while(le16_to_cpu(el->l_tree_depth) > 1) { 1479 while(le16_to_cpu(el->l_tree_depth) > 1) {
1401 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1480 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1402 ocfs2_error(inode->i_sb, "Dinode %llu has empty " 1481 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1482 "Owner %llu has empty "
1403 "extent list (next_free_rec == 0)", 1483 "extent list (next_free_rec == 0)",
1404 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1484 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
1405 status = -EIO; 1485 status = -EIO;
1406 goto bail; 1486 goto bail;
1407 } 1487 }
1408 i = le16_to_cpu(el->l_next_free_rec) - 1; 1488 i = le16_to_cpu(el->l_next_free_rec) - 1;
1409 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1489 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1410 if (!blkno) { 1490 if (!blkno) {
1411 ocfs2_error(inode->i_sb, "Dinode %llu has extent " 1491 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1492 "Owner %llu has extent "
1412 "list where extent # %d has no physical " 1493 "list where extent # %d has no physical "
1413 "block start", 1494 "block start",
1414 (unsigned long long)OCFS2_I(inode)->ip_blkno, i); 1495 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
1415 status = -EIO; 1496 status = -EIO;
1416 goto bail; 1497 goto bail;
1417 } 1498 }
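
The error messages now report the generic cache owner rather than assuming an inode. A sketch of what the accessor presumably looks like (the ci_ops indirection is an assumption, not shown in this patch):

	/* For an inode-backed tree the owner is simply ip_blkno; other
	 * cache owners report their own root block number. */
	static inline u64 cache_owner_sketch(struct ocfs2_caching_info *ci)
	{
		return ci->ci_ops->co_owner(ci);
	}
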
@@ -1419,7 +1500,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1419 brelse(bh); 1500 brelse(bh);
1420 bh = NULL; 1501 bh = NULL;
1421 1502
1422 status = ocfs2_read_extent_block(inode, blkno, &bh); 1503 status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
1423 if (status < 0) { 1504 if (status < 0) {
1424 mlog_errno(status); 1505 mlog_errno(status);
1425 goto bail; 1506 goto bail;
@@ -1460,20 +1541,18 @@ bail:
1460 * 1541 *
1461 * *last_eb_bh will be updated by ocfs2_add_branch(). 1542 * *last_eb_bh will be updated by ocfs2_add_branch().
1462 */ 1543 */
1463static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, 1544static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
1464 struct ocfs2_extent_tree *et, int *final_depth, 1545 int *final_depth, struct buffer_head **last_eb_bh,
1465 struct buffer_head **last_eb_bh,
1466 struct ocfs2_alloc_context *meta_ac) 1546 struct ocfs2_alloc_context *meta_ac)
1467{ 1547{
1468 int ret, shift; 1548 int ret, shift;
1469 struct ocfs2_extent_list *el = et->et_root_el; 1549 struct ocfs2_extent_list *el = et->et_root_el;
1470 int depth = le16_to_cpu(el->l_tree_depth); 1550 int depth = le16_to_cpu(el->l_tree_depth);
1471 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1472 struct buffer_head *bh = NULL; 1551 struct buffer_head *bh = NULL;
1473 1552
1474 BUG_ON(meta_ac == NULL); 1553 BUG_ON(meta_ac == NULL);
1475 1554
1476 shift = ocfs2_find_branch_target(osb, inode, et, &bh); 1555 shift = ocfs2_find_branch_target(et, &bh);
1477 if (shift < 0) { 1556 if (shift < 0) {
1478 ret = shift; 1557 ret = shift;
1479 mlog_errno(ret); 1558 mlog_errno(ret);
@@ -1490,8 +1569,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1490 /* ocfs2_shift_tree_depth will return us a buffer with 1569 /* ocfs2_shift_tree_depth will return us a buffer with
1491 * the new extent block (so we can pass that to 1570 * the new extent block (so we can pass that to
1492 * ocfs2_add_branch). */ 1571 * ocfs2_add_branch). */
1493 ret = ocfs2_shift_tree_depth(osb, handle, inode, et, 1572 ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
1494 meta_ac, &bh);
1495 if (ret < 0) { 1573 if (ret < 0) {
1496 mlog_errno(ret); 1574 mlog_errno(ret);
1497 goto out; 1575 goto out;
@@ -1517,7 +1595,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1517 /* call ocfs2_add_branch to add the final part of the tree with 1595 /* call ocfs2_add_branch to add the final part of the tree with
1518 * the new data. */ 1596 * the new data. */
1519 mlog(0, "add branch. bh = %p\n", bh); 1597 mlog(0, "add branch. bh = %p\n", bh);
1520 ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh, 1598 ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
1521 meta_ac); 1599 meta_ac);
1522 if (ret < 0) { 1600 if (ret < 0) {
1523 mlog_errno(ret); 1601 mlog_errno(ret);
@@ -1687,7 +1765,7 @@ set_and_inc:
1687 * 1765 *
1688 * The array index of the subtree root is passed back. 1766 * The array index of the subtree root is passed back.
1689 */ 1767 */
1690static int ocfs2_find_subtree_root(struct inode *inode, 1768static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
1691 struct ocfs2_path *left, 1769 struct ocfs2_path *left,
1692 struct ocfs2_path *right) 1770 struct ocfs2_path *right)
1693{ 1771{
@@ -1705,10 +1783,10 @@ static int ocfs2_find_subtree_root(struct inode *inode,
1705 * The caller didn't pass two adjacent paths. 1783 * The caller didn't pass two adjacent paths.
1706 */ 1784 */
1707 mlog_bug_on_msg(i > left->p_tree_depth, 1785 mlog_bug_on_msg(i > left->p_tree_depth,
1708 "Inode %lu, left depth %u, right depth %u\n" 1786 "Owner %llu, left depth %u, right depth %u\n"
1709 "left leaf blk %llu, right leaf blk %llu\n", 1787 "left leaf blk %llu, right leaf blk %llu\n",
1710 inode->i_ino, left->p_tree_depth, 1788 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
1711 right->p_tree_depth, 1789 left->p_tree_depth, right->p_tree_depth,
1712 (unsigned long long)path_leaf_bh(left)->b_blocknr, 1790 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1713 (unsigned long long)path_leaf_bh(right)->b_blocknr); 1791 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1714 } while (left->p_node[i].bh->b_blocknr == 1792 } while (left->p_node[i].bh->b_blocknr ==
@@ -1725,7 +1803,7 @@ typedef void (path_insert_t)(void *, struct buffer_head *);
1725 * This code can be called with a cpos larger than the tree, in which 1803 * This code can be called with a cpos larger than the tree, in which
1726 * case it will return the rightmost path. 1804 * case it will return the rightmost path.
1727 */ 1805 */
1728static int __ocfs2_find_path(struct inode *inode, 1806static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1729 struct ocfs2_extent_list *root_el, u32 cpos, 1807 struct ocfs2_extent_list *root_el, u32 cpos,
1730 path_insert_t *func, void *data) 1808 path_insert_t *func, void *data)
1731{ 1809{
@@ -1736,15 +1814,14 @@ static int __ocfs2_find_path(struct inode *inode,
1736 struct ocfs2_extent_block *eb; 1814 struct ocfs2_extent_block *eb;
1737 struct ocfs2_extent_list *el; 1815 struct ocfs2_extent_list *el;
1738 struct ocfs2_extent_rec *rec; 1816 struct ocfs2_extent_rec *rec;
1739 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1740 1817
1741 el = root_el; 1818 el = root_el;
1742 while (el->l_tree_depth) { 1819 while (el->l_tree_depth) {
1743 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1820 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1744 ocfs2_error(inode->i_sb, 1821 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1745 "Inode %llu has empty extent list at " 1822 "Owner %llu has empty extent list at "
1746 "depth %u\n", 1823 "depth %u\n",
1747 (unsigned long long)oi->ip_blkno, 1824 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1748 le16_to_cpu(el->l_tree_depth)); 1825 le16_to_cpu(el->l_tree_depth));
1749 ret = -EROFS; 1826 ret = -EROFS;
1750 goto out; 1827 goto out;
@@ -1767,10 +1844,10 @@ static int __ocfs2_find_path(struct inode *inode,
1767 1844
1768 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1845 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1769 if (blkno == 0) { 1846 if (blkno == 0) {
1770 ocfs2_error(inode->i_sb, 1847 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1771 "Inode %llu has bad blkno in extent list " 1848 "Owner %llu has bad blkno in extent list "
1772 "at depth %u (index %d)\n", 1849 "at depth %u (index %d)\n",
1773 (unsigned long long)oi->ip_blkno, 1850 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1774 le16_to_cpu(el->l_tree_depth), i); 1851 le16_to_cpu(el->l_tree_depth), i);
1775 ret = -EROFS; 1852 ret = -EROFS;
1776 goto out; 1853 goto out;
@@ -1778,7 +1855,7 @@ static int __ocfs2_find_path(struct inode *inode,
1778 1855
1779 brelse(bh); 1856 brelse(bh);
1780 bh = NULL; 1857 bh = NULL;
1781 ret = ocfs2_read_extent_block(inode, blkno, &bh); 1858 ret = ocfs2_read_extent_block(ci, blkno, &bh);
1782 if (ret) { 1859 if (ret) {
1783 mlog_errno(ret); 1860 mlog_errno(ret);
1784 goto out; 1861 goto out;
@@ -1789,10 +1866,10 @@ static int __ocfs2_find_path(struct inode *inode,
1789 1866
1790 if (le16_to_cpu(el->l_next_free_rec) > 1867 if (le16_to_cpu(el->l_next_free_rec) >
1791 le16_to_cpu(el->l_count)) { 1868 le16_to_cpu(el->l_count)) {
1792 ocfs2_error(inode->i_sb, 1869 ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1793 "Inode %llu has bad count in extent list " 1870 "Owner %llu has bad count in extent list "
1794 "at block %llu (next free=%u, count=%u)\n", 1871 "at block %llu (next free=%u, count=%u)\n",
1795 (unsigned long long)oi->ip_blkno, 1872 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1796 (unsigned long long)bh->b_blocknr, 1873 (unsigned long long)bh->b_blocknr,
1797 le16_to_cpu(el->l_next_free_rec), 1874 le16_to_cpu(el->l_next_free_rec),
1798 le16_to_cpu(el->l_count)); 1875 le16_to_cpu(el->l_count));
@@ -1836,14 +1913,14 @@ static void find_path_ins(void *data, struct buffer_head *bh)
1836 ocfs2_path_insert_eb(fp->path, fp->index, bh); 1913 ocfs2_path_insert_eb(fp->path, fp->index, bh);
1837 fp->index++; 1914 fp->index++;
1838} 1915}
1839static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, 1916int ocfs2_find_path(struct ocfs2_caching_info *ci,
1840 u32 cpos) 1917 struct ocfs2_path *path, u32 cpos)
1841{ 1918{
1842 struct find_path_data data; 1919 struct find_path_data data;
1843 1920
1844 data.index = 1; 1921 data.index = 1;
1845 data.path = path; 1922 data.path = path;
1846 return __ocfs2_find_path(inode, path_root_el(path), cpos, 1923 return __ocfs2_find_path(ci, path_root_el(path), cpos,
1847 find_path_ins, &data); 1924 find_path_ins, &data);
1848} 1925}
1849 1926
@@ -1868,13 +1945,14 @@ static void find_leaf_ins(void *data, struct buffer_head *bh)
1868 * 1945 *
1869 * This function doesn't handle non btree extent lists. 1946 * This function doesn't handle non btree extent lists.
1870 */ 1947 */
1871int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 1948int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
1872 u32 cpos, struct buffer_head **leaf_bh) 1949 struct ocfs2_extent_list *root_el, u32 cpos,
1950 struct buffer_head **leaf_bh)
1873{ 1951{
1874 int ret; 1952 int ret;
1875 struct buffer_head *bh = NULL; 1953 struct buffer_head *bh = NULL;
1876 1954
1877 ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); 1955 ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
1878 if (ret) { 1956 if (ret) {
1879 mlog_errno(ret); 1957 mlog_errno(ret);
1880 goto out; 1958 goto out;
@@ -1980,7 +2058,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1980 * - When we've adjusted the last extent record in the left path leaf and the 2058 * - When we've adjusted the last extent record in the left path leaf and the
1981 * 1st extent record in the right path leaf during cross extent block merge. 2059 * 1st extent record in the right path leaf during cross extent block merge.
1982 */ 2060 */
1983static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, 2061static void ocfs2_complete_edge_insert(handle_t *handle,
1984 struct ocfs2_path *left_path, 2062 struct ocfs2_path *left_path,
1985 struct ocfs2_path *right_path, 2063 struct ocfs2_path *right_path,
1986 int subtree_index) 2064 int subtree_index)
@@ -2058,8 +2136,8 @@ static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
2058 mlog_errno(ret); 2136 mlog_errno(ret);
2059} 2137}
2060 2138
2061static int ocfs2_rotate_subtree_right(struct inode *inode, 2139static int ocfs2_rotate_subtree_right(handle_t *handle,
2062 handle_t *handle, 2140 struct ocfs2_extent_tree *et,
2063 struct ocfs2_path *left_path, 2141 struct ocfs2_path *left_path,
2064 struct ocfs2_path *right_path, 2142 struct ocfs2_path *right_path,
2065 int subtree_index) 2143 int subtree_index)
@@ -2075,10 +2153,10 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2075 left_el = path_leaf_el(left_path); 2153 left_el = path_leaf_el(left_path);
2076 2154
2077 if (left_el->l_next_free_rec != left_el->l_count) { 2155 if (left_el->l_next_free_rec != left_el->l_count) {
2078 ocfs2_error(inode->i_sb, 2156 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2079 "Inode %llu has non-full interior leaf node %llu" 2157 "Inode %llu has non-full interior leaf node %llu"
2080 "(next free = %u)", 2158 "(next free = %u)",
2081 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2159 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2082 (unsigned long long)left_leaf_bh->b_blocknr, 2160 (unsigned long long)left_leaf_bh->b_blocknr,
2083 le16_to_cpu(left_el->l_next_free_rec)); 2161 le16_to_cpu(left_el->l_next_free_rec));
2084 return -EROFS; 2162 return -EROFS;
@@ -2094,7 +2172,7 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2094 root_bh = left_path->p_node[subtree_index].bh; 2172 root_bh = left_path->p_node[subtree_index].bh;
2095 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 2173 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2096 2174
2097 ret = ocfs2_path_bh_journal_access(handle, inode, right_path, 2175 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2098 subtree_index); 2176 subtree_index);
2099 if (ret) { 2177 if (ret) {
2100 mlog_errno(ret); 2178 mlog_errno(ret);
@@ -2102,14 +2180,14 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2102 } 2180 }
2103 2181
2104 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 2182 for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2105 ret = ocfs2_path_bh_journal_access(handle, inode, 2183 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2106 right_path, i); 2184 right_path, i);
2107 if (ret) { 2185 if (ret) {
2108 mlog_errno(ret); 2186 mlog_errno(ret);
2109 goto out; 2187 goto out;
2110 } 2188 }
2111 2189
2112 ret = ocfs2_path_bh_journal_access(handle, inode, 2190 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2113 left_path, i); 2191 left_path, i);
2114 if (ret) { 2192 if (ret) {
2115 mlog_errno(ret); 2193 mlog_errno(ret);
@@ -2123,7 +2201,7 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2123 /* This is a code error, not a disk corruption. */ 2201 /* This is a code error, not a disk corruption. */
2124 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " 2202 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
2125 "because rightmost leaf block %llu is empty\n", 2203 "because rightmost leaf block %llu is empty\n",
2126 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2204 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2127 (unsigned long long)right_leaf_bh->b_blocknr); 2205 (unsigned long long)right_leaf_bh->b_blocknr);
2128 2206
2129 ocfs2_create_empty_extent(right_el); 2207 ocfs2_create_empty_extent(right_el);
@@ -2157,8 +2235,8 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
2157 goto out; 2235 goto out;
2158 } 2236 }
2159 2237
2160 ocfs2_complete_edge_insert(inode, handle, left_path, right_path, 2238 ocfs2_complete_edge_insert(handle, left_path, right_path,
2161 subtree_index); 2239 subtree_index);
2162 2240
2163out: 2241out:
2164 return ret; 2242 return ret;
@@ -2248,10 +2326,18 @@ static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2248 int op_credits, 2326 int op_credits,
2249 struct ocfs2_path *path) 2327 struct ocfs2_path *path)
2250{ 2328{
2329 int ret;
2251 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 2330 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2252 2331
2253 if (handle->h_buffer_credits < credits) 2332 if (handle->h_buffer_credits < credits) {
2254 return ocfs2_extend_trans(handle, credits); 2333 ret = ocfs2_extend_trans(handle,
2334 credits - handle->h_buffer_credits);
2335 if (ret)
2336 return ret;
2337
2338 if (unlikely(handle->h_buffer_credits < credits))
2339 return ocfs2_extend_trans(handle, credits);
2340 }
2255 2341
2256 return 0; 2342 return 0;
2257} 2343}
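
The rework matters because ocfs2_extend_trans() may restart the handle instead of extending it, so the handle can come back with fewer credits than the target; hence the second, unlikely() re-check. A worked example with hypothetical numbers:

	/* path depth 4, subtree depth 1, op_credits 3:
	 *   credits = (4 - 1) * 2 + 1 + 3 = 10
	 * A handle holding 6 credits asks for 10 - 6 = 4 more.  If the
	 * journal cannot extend and the transaction is restarted instead,
	 * the handle may still hold fewer than 10 credits, so the re-check
	 * extends again, this time by the full amount. */
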
@@ -2321,8 +2407,8 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2321 * *ret_left_path will contain a valid path which can be passed to 2407 * *ret_left_path will contain a valid path which can be passed to
2322 * ocfs2_insert_path(). 2408 * ocfs2_insert_path().
2323 */ 2409 */
2324static int ocfs2_rotate_tree_right(struct inode *inode, 2410static int ocfs2_rotate_tree_right(handle_t *handle,
2325 handle_t *handle, 2411 struct ocfs2_extent_tree *et,
2326 enum ocfs2_split_type split, 2412 enum ocfs2_split_type split,
2327 u32 insert_cpos, 2413 u32 insert_cpos,
2328 struct ocfs2_path *right_path, 2414 struct ocfs2_path *right_path,
@@ -2331,6 +2417,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2331 int ret, start, orig_credits = handle->h_buffer_credits; 2417 int ret, start, orig_credits = handle->h_buffer_credits;
2332 u32 cpos; 2418 u32 cpos;
2333 struct ocfs2_path *left_path = NULL; 2419 struct ocfs2_path *left_path = NULL;
2420 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2334 2421
2335 *ret_left_path = NULL; 2422 *ret_left_path = NULL;
2336 2423
@@ -2341,7 +2428,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
2341 goto out; 2428 goto out;
2342 } 2429 }
2343 2430
2344 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); 2431 ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2345 if (ret) { 2432 if (ret) {
2346 mlog_errno(ret); 2433 mlog_errno(ret);
2347 goto out; 2434 goto out;
@@ -2379,7 +2466,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
 	mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
 	     insert_cpos, cpos);
 
-	ret = ocfs2_find_path(inode, left_path, cpos);
+	ret = ocfs2_find_path(et->et_ci, left_path, cpos);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2387,10 +2474,11 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
 
 	mlog_bug_on_msg(path_leaf_bh(left_path) ==
 			path_leaf_bh(right_path),
-			"Inode %lu: error during insert of %u "
+			"Owner %llu: error during insert of %u "
 			"(left path cpos %u) results in two identical "
 			"paths ending at %llu\n",
-			inode->i_ino, insert_cpos, cpos,
+			(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+			insert_cpos, cpos,
 			(unsigned long long)
 			path_leaf_bh(left_path)->b_blocknr);
 
@@ -2416,7 +2504,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
 		goto out_ret_path;
 	}
 
-	start = ocfs2_find_subtree_root(inode, left_path, right_path);
+	start = ocfs2_find_subtree_root(et, left_path, right_path);
 
 	mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
 	     start,
@@ -2430,7 +2518,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
+	ret = ocfs2_rotate_subtree_right(handle, et, left_path,
 					 right_path, start);
 	if (ret) {
 		mlog_errno(ret);
@@ -2462,8 +2550,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
 	 */
 	ocfs2_mv_path(right_path, left_path);
 
-	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
-					    &cpos);
+	ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2477,7 +2564,8 @@ out_ret_path:
 	return ret;
 }
 
-static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
+static int ocfs2_update_edge_lengths(handle_t *handle,
+				     struct ocfs2_extent_tree *et,
 				     int subtree_index, struct ocfs2_path *path)
 {
 	int i, idx, ret;
@@ -2502,7 +2590,7 @@ static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access_path(inode, handle, path);
+	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2532,7 +2620,8 @@ out:
 	return ret;
 }
 
-static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
+static void ocfs2_unlink_path(handle_t *handle,
+			      struct ocfs2_extent_tree *et,
 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
 			      struct ocfs2_path *path, int unlink_start)
 {
@@ -2554,12 +2643,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
 			mlog(ML_ERROR,
 			     "Inode %llu, attempted to remove extent block "
 			     "%llu with %u records\n",
-			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
 			     (unsigned long long)le64_to_cpu(eb->h_blkno),
 			     le16_to_cpu(el->l_next_free_rec));
 
 			ocfs2_journal_dirty(handle, bh);
-			ocfs2_remove_from_cache(inode, bh);
+			ocfs2_remove_from_cache(et->et_ci, bh);
 			continue;
 		}
 
@@ -2572,11 +2661,12 @@ static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
 		if (ret)
 			mlog_errno(ret);
 
-		ocfs2_remove_from_cache(inode, bh);
+		ocfs2_remove_from_cache(et->et_ci, bh);
 	}
 }
 
-static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
+static void ocfs2_unlink_subtree(handle_t *handle,
+				 struct ocfs2_extent_tree *et,
 				 struct ocfs2_path *left_path,
 				 struct ocfs2_path *right_path,
 				 int subtree_index,
@@ -2607,17 +2697,17 @@ static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
 	ocfs2_journal_dirty(handle, root_bh);
 	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
 
-	ocfs2_unlink_path(inode, handle, dealloc, right_path,
+	ocfs2_unlink_path(handle, et, dealloc, right_path,
 			  subtree_index + 1);
 }
 
-static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
+static int ocfs2_rotate_subtree_left(handle_t *handle,
+				     struct ocfs2_extent_tree *et,
 				     struct ocfs2_path *left_path,
 				     struct ocfs2_path *right_path,
 				     int subtree_index,
 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
-				     int *deleted,
-				     struct ocfs2_extent_tree *et)
+				     int *deleted)
 {
 	int ret, i, del_right_subtree = 0, right_has_empty = 0;
 	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
@@ -2653,7 +2743,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		return -EAGAIN;
 
 	if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
-		ret = ocfs2_journal_access_eb(handle, inode,
+		ret = ocfs2_journal_access_eb(handle, et->et_ci,
 					      path_leaf_bh(right_path),
 					      OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
@@ -2672,7 +2762,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		 * We have to update i_last_eb_blk during the meta
 		 * data delete.
 		 */
-		ret = ocfs2_et_root_journal_access(handle, inode, et,
+		ret = ocfs2_et_root_journal_access(handle, et,
 						   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
@@ -2688,7 +2778,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 	 */
 	BUG_ON(right_has_empty && !del_right_subtree);
 
-	ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
 					   subtree_index);
 	if (ret) {
 		mlog_errno(ret);
@@ -2696,14 +2786,14 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 	}
 
 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
-		ret = ocfs2_path_bh_journal_access(handle, inode,
+		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
 						   right_path, i);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		ret = ocfs2_path_bh_journal_access(handle, inode,
+		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
 						   left_path, i);
 		if (ret) {
 			mlog_errno(ret);
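Both paths are declared to the journal from just below the shared subtree root before any record moves, so a crash mid-rotation replays to a consistent tree. A minimal sketch of that declare-then-mutate discipline follows, with toy types standing in for buffer heads and the journal (none of this is the jbd/ocfs2 interface):

    #include <stdio.h>

    #define PATH_ITEMS 4

    struct buf { int blkno; int journaled; };

    /* Stand-in for ocfs2_path_bh_journal_access(): mark one path item. */
    static int journal_access(struct buf *path, int i)
    {
        path[i].journaled = 1;  /* the real call can fail and must be checked */
        return 0;
    }

    int main(void)
    {
        struct buf left[PATH_ITEMS]  = { {10}, {11}, {12}, {13} };
        struct buf right[PATH_ITEMS] = { {10}, {21}, {22}, {23} };
        int subtree_index = 0, i, ret;

        /* Declare write intent on every item below the shared root... */
        for (i = subtree_index + 1; i < PATH_ITEMS; i++) {
            ret = journal_access(right, i);
            if (ret)
                return ret;
            ret = journal_access(left, i);
            if (ret)
                return ret;
        }
        /* ...and only then start shuffling records between the leaves. */
        for (i = 0; i < PATH_ITEMS; i++)
            printf("blk %d/%d journaled %d/%d\n", left[i].blkno,
                   right[i].blkno, left[i].journaled, right[i].journaled);
        return 0;
    }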
@@ -2740,9 +2830,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		mlog_errno(ret);
 
 	if (del_right_subtree) {
-		ocfs2_unlink_subtree(inode, handle, left_path, right_path,
+		ocfs2_unlink_subtree(handle, et, left_path, right_path,
 				     subtree_index, dealloc);
-		ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+		ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
 						left_path);
 		if (ret) {
 			mlog_errno(ret);
@@ -2766,7 +2856,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 
 		*deleted = 1;
 	} else
-		ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
+		ocfs2_complete_edge_insert(handle, left_path, right_path,
 					   subtree_index);
 
 out:
@@ -2852,8 +2942,8 @@ out:
 	return ret;
 }
 
-static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
-					    handle_t *handle,
+static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
+					    struct ocfs2_extent_tree *et,
 					    struct ocfs2_path *path)
 {
 	int ret;
@@ -2863,7 +2953,7 @@ static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
 		return 0;
 
-	ret = ocfs2_path_bh_journal_access(handle, inode, path,
+	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
 					   path_num_items(path) - 1);
 	if (ret) {
 		mlog_errno(ret);
@@ -2880,24 +2970,24 @@ out:
 	return ret;
 }
 
-static int __ocfs2_rotate_tree_left(struct inode *inode,
-				    handle_t *handle, int orig_credits,
+static int __ocfs2_rotate_tree_left(handle_t *handle,
+				    struct ocfs2_extent_tree *et,
+				    int orig_credits,
 				    struct ocfs2_path *path,
 				    struct ocfs2_cached_dealloc_ctxt *dealloc,
-				    struct ocfs2_path **empty_extent_path,
-				    struct ocfs2_extent_tree *et)
+				    struct ocfs2_path **empty_extent_path)
 {
 	int ret, subtree_root, deleted;
 	u32 right_cpos;
 	struct ocfs2_path *left_path = NULL;
 	struct ocfs2_path *right_path = NULL;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
 
 	BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
 
 	*empty_extent_path = NULL;
 
-	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
-					     &right_cpos);
+	ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2920,13 +3010,13 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 	}
 
 	while (right_cpos) {
-		ret = ocfs2_find_path(inode, right_path, right_cpos);
+		ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		subtree_root = ocfs2_find_subtree_root(inode, left_path,
+		subtree_root = ocfs2_find_subtree_root(et, left_path,
 						       right_path);
 
 		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
@@ -2946,16 +3036,16 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 		 * Caller might still want to make changes to the
 		 * tree root, so re-add it to the journal here.
 		 */
-		ret = ocfs2_path_bh_journal_access(handle, inode,
+		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
 						   left_path, 0);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
+		ret = ocfs2_rotate_subtree_left(handle, et, left_path,
 						right_path, subtree_root,
-						dealloc, &deleted, et);
+						dealloc, &deleted);
 		if (ret == -EAGAIN) {
 			/*
 			 * The rotation has to temporarily stop due to
@@ -2982,7 +3072,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 
 		ocfs2_mv_path(left_path, right_path);
 
-		ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
+		ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
 						     &right_cpos);
 		if (ret) {
 			mlog_errno(ret);
@@ -2997,10 +3087,10 @@ out:
 	return ret;
 }
 
-static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
+static int ocfs2_remove_rightmost_path(handle_t *handle,
+				       struct ocfs2_extent_tree *et,
 				       struct ocfs2_path *path,
-				       struct ocfs2_cached_dealloc_ctxt *dealloc,
-				       struct ocfs2_extent_tree *et)
+				       struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	int ret, subtree_index;
 	u32 cpos;
@@ -3009,7 +3099,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 	struct ocfs2_extent_list *el;
 
 
-	ret = ocfs2_et_sanity_check(inode, et);
+	ret = ocfs2_et_sanity_check(et);
 	if (ret)
 		goto out;
 	/*
@@ -3024,13 +3114,14 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access_path(inode, handle, path);
+	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
+	ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+					    path, &cpos);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -3048,23 +3139,23 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 		goto out;
 	}
 
-	ret = ocfs2_find_path(inode, left_path, cpos);
+	ret = ocfs2_find_path(et->et_ci, left_path, cpos);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_journal_access_path(inode, handle, left_path);
+	ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
+	subtree_index = ocfs2_find_subtree_root(et, left_path, path);
 
-	ocfs2_unlink_subtree(inode, handle, left_path, path,
+	ocfs2_unlink_subtree(handle, et, left_path, path,
 			     subtree_index, dealloc);
-	ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+	ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
 					left_path);
 	if (ret) {
 		mlog_errno(ret);
@@ -3078,10 +3169,10 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 		 * 'path' is also the leftmost path which
 		 * means it must be the only one. This gets
 		 * handled differently because we want to
-		 * revert the inode back to having extents
+		 * revert the root back to having extents
 		 * in-line.
 		 */
-		ocfs2_unlink_path(inode, handle, dealloc, path, 1);
+		ocfs2_unlink_path(handle, et, dealloc, path, 1);
 
 		el = et->et_root_el;
 		el->l_tree_depth = 0;
@@ -3114,10 +3205,10 @@ out:
  * the rightmost tree leaf record is removed so the caller is
  * responsible for detecting and correcting that.
  */
-static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
+static int ocfs2_rotate_tree_left(handle_t *handle,
+				  struct ocfs2_extent_tree *et,
 				  struct ocfs2_path *path,
-				  struct ocfs2_cached_dealloc_ctxt *dealloc,
-				  struct ocfs2_extent_tree *et)
+				  struct ocfs2_cached_dealloc_ctxt *dealloc)
 {
 	int ret, orig_credits = handle->h_buffer_credits;
 	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
@@ -3134,8 +3225,7 @@ rightmost_no_delete:
 	 * Inline extents. This is trivially handled, so do
 	 * it up front.
 	 */
-	ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
-					       path);
+	ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
 	if (ret)
 		mlog_errno(ret);
 	goto out;
@@ -3151,7 +3241,7 @@ rightmost_no_delete:
 	 *
 	 * 1) is handled via ocfs2_rotate_rightmost_leaf_left()
 	 * 2a) we need the left branch so that we can update it with the unlink
-	 * 2b) we need to bring the inode back to inline extents.
+	 * 2b) we need to bring the root back to inline extents.
 	 */
 
 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
@@ -3167,9 +3257,9 @@ rightmost_no_delete:
 
 	if (le16_to_cpu(el->l_next_free_rec) == 0) {
 		ret = -EIO;
-		ocfs2_error(inode->i_sb,
-			    "Inode %llu has empty extent block at %llu",
-			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+			    "Owner %llu has empty extent block at %llu",
+			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
 			    (unsigned long long)le64_to_cpu(eb->h_blkno));
 		goto out;
 	}
@@ -3183,8 +3273,8 @@ rightmost_no_delete:
 	 * nonempty list.
 	 */
 
-	ret = ocfs2_remove_rightmost_path(inode, handle, path,
-					  dealloc, et);
+	ret = ocfs2_remove_rightmost_path(handle, et, path,
+					  dealloc);
 	if (ret)
 		mlog_errno(ret);
 	goto out;
@@ -3195,8 +3285,8 @@
 	 * and restarting from there.
 	 */
 try_rotate:
-	ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
-				       dealloc, &restart_path, et);
+	ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
				       dealloc, &restart_path);
 	if (ret && ret != -EAGAIN) {
 		mlog_errno(ret);
 		goto out;
@@ -3206,9 +3296,9 @@ try_rotate:
 		tmp_path = restart_path;
 		restart_path = NULL;
 
-		ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
+		ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
 					       tmp_path, dealloc,
-					       &restart_path, et);
+					       &restart_path);
 		if (ret && ret != -EAGAIN) {
 			mlog_errno(ret);
 			goto out;
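The try_rotate loop above keeps calling __ocfs2_rotate_tree_left() for as long as it reports -EAGAIN and hands back a restart path. A compact sketch of that resume-from-where-you-stopped pattern (rotate_step() and its cursor are stand-ins, not the real routine or its path argument):

    #include <stdio.h>
    #include <errno.h>

    /* Toy rotation step: pretends to run out of room a few times. */
    static int rotate_step(int *cursor)
    {
        if (*cursor < 3) {
            (*cursor)++;    /* hand back a restart point */
            return -EAGAIN;
        }
        return 0;
    }

    int main(void)
    {
        int cursor = 0, ret;

        /* Mirrors the loop's shape: keep going while the callee says
         * "stopped early, resume from the point I gave you". */
        ret = rotate_step(&cursor);
        while (ret == -EAGAIN) {
            printf("restarting rotation at step %d\n", cursor);
            ret = rotate_step(&cursor);
        }
        return ret;
    }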
@@ -3259,7 +3349,7 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
 	}
 }
 
-static int ocfs2_get_right_path(struct inode *inode,
+static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
 				struct ocfs2_path *left_path,
 				struct ocfs2_path **ret_right_path)
 {
@@ -3276,8 +3366,8 @@ static int ocfs2_get_right_path(struct inode *inode,
 	left_el = path_leaf_el(left_path);
 	BUG_ON(left_el->l_next_free_rec != left_el->l_count);
 
-	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
-					     &right_cpos);
+	ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+					     left_path, &right_cpos);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -3293,7 +3383,7 @@ static int ocfs2_get_right_path(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_find_path(inode, right_path, right_cpos);
+	ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -3313,9 +3403,9 @@ out:
  * For index == l_count - 1, the "next" means the 1st extent rec of the
  * next extent block.
 */
-static int ocfs2_merge_rec_right(struct inode *inode,
-				 struct ocfs2_path *left_path,
+static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
 				 handle_t *handle,
+				 struct ocfs2_extent_tree *et,
 				 struct ocfs2_extent_rec *split_rec,
 				 int index)
 {
@@ -3336,7 +3426,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
 	if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
 	    le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
 		/* we meet with a cross extent block merge. */
-		ret = ocfs2_get_right_path(inode, left_path, &right_path);
+		ret = ocfs2_get_right_path(et, left_path, &right_path);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3355,8 +3445,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
 		       le16_to_cpu(left_rec->e_leaf_clusters) !=
 		       le32_to_cpu(right_rec->e_cpos));
 
-		subtree_index = ocfs2_find_subtree_root(inode,
-							left_path, right_path);
+		subtree_index = ocfs2_find_subtree_root(et, left_path,
+							right_path);
 
 		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
 						      handle->h_buffer_credits,
@@ -3369,7 +3459,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
 		root_bh = left_path->p_node[subtree_index].bh;
 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
 
-		ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+		ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
 						   subtree_index);
 		if (ret) {
 			mlog_errno(ret);
@@ -3378,14 +3468,14 @@ static int ocfs2_merge_rec_right(struct inode *inode,
 
 		for (i = subtree_index + 1;
 		     i < path_num_items(right_path); i++) {
-			ret = ocfs2_path_bh_journal_access(handle, inode,
+			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
 							   right_path, i);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
 
-			ret = ocfs2_path_bh_journal_access(handle, inode,
+			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
 							   left_path, i);
 			if (ret) {
 				mlog_errno(ret);
@@ -3398,7 +3488,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
 		right_rec = &el->l_recs[index + 1];
 	}
 
-	ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
+	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
 					   path_num_items(left_path) - 1);
 	if (ret) {
 		mlog_errno(ret);
@@ -3409,7 +3499,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
 
 	le32_add_cpu(&right_rec->e_cpos, -split_clusters);
 	le64_add_cpu(&right_rec->e_blkno,
-		     -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
+		     -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
+					       split_clusters));
 	le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
 
 	ocfs2_cleanup_merge(el, index);
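The merge above moves split_clusters from one record to its neighbour, so e_cpos shifts by that many clusters while e_blkno must shift by the same distance expressed in blocks; since a cluster is a power-of-two multiple of the block size, the conversion is a shift. A self-contained sketch of the arithmetic (the 4K-block/32K-cluster geometry is an example, not an on-disk default):

    #include <stdio.h>
    #include <stdint.h>

    /* Example geometry: 4K blocks, 32K clusters -> 8 blocks per cluster. */
    #define BLOCK_BITS   12
    #define CLUSTER_BITS 15

    static uint64_t clusters_to_blocks(uint32_t clusters)
    {
        return (uint64_t)clusters << (CLUSTER_BITS - BLOCK_BITS);
    }

    int main(void)
    {
        uint32_t e_cpos = 100, split_clusters = 6;
        uint64_t e_blkno = 8000;

        /* The right record absorbs split_clusters from its left
         * neighbour: its logical and physical starts both move back
         * by the donated amount, keeping the mapping consistent. */
        e_cpos  -= split_clusters;
        e_blkno -= clusters_to_blocks(split_clusters);
        printf("record now starts at cpos %u, blkno %llu\n",
               e_cpos, (unsigned long long)e_blkno);
        return 0;
    }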
@@ -3423,8 +3514,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
 		if (ret)
 			mlog_errno(ret);
 
-		ocfs2_complete_edge_insert(inode, handle, left_path,
-					   right_path, subtree_index);
+		ocfs2_complete_edge_insert(handle, left_path, right_path,
+					   subtree_index);
 	}
 out:
 	if (right_path)
@@ -3432,7 +3523,7 @@ out:
 	return ret;
 }
 
-static int ocfs2_get_left_path(struct inode *inode,
+static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
 			       struct ocfs2_path *right_path,
 			       struct ocfs2_path **ret_left_path)
 {
@@ -3445,7 +3536,7 @@ static int ocfs2_get_left_path(struct inode *inode,
 	/* This function shouldn't be called for non-trees. */
 	BUG_ON(right_path->p_tree_depth == 0);
 
-	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
+	ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
 					    right_path, &left_cpos);
 	if (ret) {
 		mlog_errno(ret);
@@ -3462,7 +3553,7 @@ static int ocfs2_get_left_path(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_find_path(inode, left_path, left_cpos);
+	ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -3485,12 +3576,11 @@ out:
  * remove the rightmost leaf extent block in the right_path and change
 * the right path to indicate the new rightmost path.
 */
-static int ocfs2_merge_rec_left(struct inode *inode,
-				struct ocfs2_path *right_path,
+static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
 				handle_t *handle,
+				struct ocfs2_extent_tree *et,
 				struct ocfs2_extent_rec *split_rec,
 				struct ocfs2_cached_dealloc_ctxt *dealloc,
-				struct ocfs2_extent_tree *et,
 				int index)
 {
 	int ret, i, subtree_index = 0, has_empty_extent = 0;
@@ -3508,7 +3598,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 	right_rec = &el->l_recs[index];
 	if (index == 0) {
 		/* we meet with a cross extent block merge. */
-		ret = ocfs2_get_left_path(inode, right_path, &left_path);
+		ret = ocfs2_get_left_path(et, right_path, &left_path);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3524,8 +3614,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 		       le16_to_cpu(left_rec->e_leaf_clusters) !=
 		       le32_to_cpu(split_rec->e_cpos));
 
-		subtree_index = ocfs2_find_subtree_root(inode,
-							left_path, right_path);
+		subtree_index = ocfs2_find_subtree_root(et, left_path,
+							right_path);
 
 		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
 						      handle->h_buffer_credits,
@@ -3538,7 +3628,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 		root_bh = left_path->p_node[subtree_index].bh;
 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
 
-		ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+		ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
 						   subtree_index);
 		if (ret) {
 			mlog_errno(ret);
@@ -3547,14 +3637,14 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 
 		for (i = subtree_index + 1;
 		     i < path_num_items(right_path); i++) {
-			ret = ocfs2_path_bh_journal_access(handle, inode,
+			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
 							   right_path, i);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
 
-			ret = ocfs2_path_bh_journal_access(handle, inode,
+			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
 							   left_path, i);
 			if (ret) {
 				mlog_errno(ret);
@@ -3567,7 +3657,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 			has_empty_extent = 1;
 	}
 
-	ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
 					   path_num_items(right_path) - 1);
 	if (ret) {
 		mlog_errno(ret);
@@ -3586,7 +3676,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 
 	le32_add_cpu(&right_rec->e_cpos, split_clusters);
 	le64_add_cpu(&right_rec->e_blkno,
-		     ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
+		     ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
+					      split_clusters));
 	le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
 
 	ocfs2_cleanup_merge(el, index);
@@ -3608,9 +3699,9 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 	if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
 	    le16_to_cpu(el->l_next_free_rec) == 1) {
 
-		ret = ocfs2_remove_rightmost_path(inode, handle,
+		ret = ocfs2_remove_rightmost_path(handle, et,
 						  right_path,
-						  dealloc, et);
+						  dealloc);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3622,7 +3713,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 		ocfs2_mv_path(right_path, left_path);
 		left_path = NULL;
 	} else
-		ocfs2_complete_edge_insert(inode, handle, left_path,
+		ocfs2_complete_edge_insert(handle, left_path,
 					   right_path, subtree_index);
 	}
 out:
@@ -3631,15 +3722,13 @@ out:
 	return ret;
 }
 
-static int ocfs2_try_to_merge_extent(struct inode *inode,
-				     handle_t *handle,
+static int ocfs2_try_to_merge_extent(handle_t *handle,
+				     struct ocfs2_extent_tree *et,
 				     struct ocfs2_path *path,
 				     int split_index,
 				     struct ocfs2_extent_rec *split_rec,
 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
-				     struct ocfs2_merge_ctxt *ctxt,
-				     struct ocfs2_extent_tree *et)
-
+				     struct ocfs2_merge_ctxt *ctxt)
 {
 	int ret = 0;
 	struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -3655,8 +3744,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 * extents - having more than one in a leaf is
 		 * illegal.
 		 */
-		ret = ocfs2_rotate_tree_left(inode, handle, path,
-					     dealloc, et);
+		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3685,8 +3773,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 * previous extent block. It is more efficient and easier
 		 * if we do merge_right first and merge_left later.
 		 */
-		ret = ocfs2_merge_rec_right(inode, path,
-					    handle, split_rec,
+		ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
 					    split_index);
 		if (ret) {
 			mlog_errno(ret);
@@ -3699,8 +3786,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
 
 		/* The merge left us with an empty extent, remove it. */
-		ret = ocfs2_rotate_tree_left(inode, handle, path,
-					     dealloc, et);
+		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3712,18 +3798,15 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 * Note that we don't pass split_rec here on purpose -
 		 * we've merged it into the rec already.
 		 */
-		ret = ocfs2_merge_rec_left(inode, path,
-					   handle, rec,
-					   dealloc, et,
-					   split_index);
+		ret = ocfs2_merge_rec_left(path, handle, et, rec,
+					   dealloc, split_index);
 
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		ret = ocfs2_rotate_tree_left(inode, handle, path,
-					     dealloc, et);
+		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
 		/*
 		 * Error from this last rotate is not critical, so
 		 * print but don't bubble it up.
@@ -3740,19 +3823,16 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 * the record on the left (hence the left merge).
 		 */
 		if (ctxt->c_contig_type == CONTIG_RIGHT) {
-			ret = ocfs2_merge_rec_left(inode,
-						   path,
-						   handle, split_rec,
-						   dealloc, et,
+			ret = ocfs2_merge_rec_left(path, handle, et,
+						   split_rec, dealloc,
 						   split_index);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
 		} else {
-			ret = ocfs2_merge_rec_right(inode,
-						    path,
-						    handle, split_rec,
+			ret = ocfs2_merge_rec_right(path, handle,
						    et, split_rec,
 						    split_index);
 			if (ret) {
 				mlog_errno(ret);
@@ -3765,8 +3845,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
 		 * The merge may have left an empty extent in
 		 * our leaf. Try to rotate it away.
 		 */
-		ret = ocfs2_rotate_tree_left(inode, handle, path,
-					     dealloc, et);
+		ret = ocfs2_rotate_tree_left(handle, et, path,
					     dealloc);
 		if (ret)
 			mlog_errno(ret);
 		ret = 0;
@@ -3812,10 +3892,10 @@ static void ocfs2_subtract_from_rec(struct super_block *sb,
 * list. If this leaf is part of an allocation tree, it is assumed
 * that the tree above has been prepared.
 */
-static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
+static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
+				 struct ocfs2_extent_rec *insert_rec,
 				 struct ocfs2_extent_list *el,
-				 struct ocfs2_insert_type *insert,
-				 struct inode *inode)
+				 struct ocfs2_insert_type *insert)
 {
 	int i = insert->ins_contig_index;
 	unsigned int range;
@@ -3827,7 +3907,8 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
 		i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
 		BUG_ON(i == -1);
 		rec = &el->l_recs[i];
-		ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
+		ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+					insert->ins_split, rec,
 					insert_rec);
 		goto rotate;
 	}
@@ -3869,10 +3950,10 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
 
 	mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
 			le16_to_cpu(el->l_count),
-			"inode %lu, depth %u, count %u, next free %u, "
+			"owner %llu, depth %u, count %u, next free %u, "
 			"rec.cpos %u, rec.clusters %u, "
 			"insert.cpos %u, insert.clusters %u\n",
-			inode->i_ino,
+			ocfs2_metadata_cache_owner(et->et_ci),
 			le16_to_cpu(el->l_tree_depth),
 			le16_to_cpu(el->l_count),
 			le16_to_cpu(el->l_next_free_rec),
@@ -3900,8 +3981,8 @@ rotate:
 	ocfs2_rotate_leaf(el, insert_rec);
 }
 
-static void ocfs2_adjust_rightmost_records(struct inode *inode,
-					   handle_t *handle,
+static void ocfs2_adjust_rightmost_records(handle_t *handle,
+					   struct ocfs2_extent_tree *et,
 					   struct ocfs2_path *path,
 					   struct ocfs2_extent_rec *insert_rec)
 {
@@ -3919,9 +4000,9 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode,
 
 		next_free = le16_to_cpu(el->l_next_free_rec);
 		if (next_free == 0) {
-			ocfs2_error(inode->i_sb,
-				    "Dinode %llu has a bad extent list",
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
+			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
+				    "Owner %llu has a bad extent list",
+				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
 			ret = -EIO;
 			return;
 		}
@@ -3941,7 +4022,8 @@ static void ocfs2_adjust_rightmost_records(struct inode *inode,
 	}
 }
 
-static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
+static int ocfs2_append_rec_to_path(handle_t *handle,
+				    struct ocfs2_extent_tree *et,
 				    struct ocfs2_extent_rec *insert_rec,
 				    struct ocfs2_path *right_path,
 				    struct ocfs2_path **ret_left_path)
@@ -3969,8 +4051,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
 	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
 		u32 left_cpos;
 
-		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
-						    &left_cpos);
+		ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
+						    right_path, &left_cpos);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3992,7 +4074,8 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
 				goto out;
 			}
 
-			ret = ocfs2_find_path(inode, left_path, left_cpos);
+			ret = ocfs2_find_path(et->et_ci, left_path,
+					      left_cpos);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -4005,13 +4088,13 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
 		}
 	}
 
-	ret = ocfs2_journal_access_path(inode, handle, right_path);
+	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
+	ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
 
 	*ret_left_path = left_path;
 	ret = 0;
@@ -4022,7 +4105,7 @@ out:
 	return ret;
 }
 
-static void ocfs2_split_record(struct inode *inode,
+static void ocfs2_split_record(struct ocfs2_extent_tree *et,
 			       struct ocfs2_path *left_path,
 			       struct ocfs2_path *right_path,
 			       struct ocfs2_extent_rec *split_rec,
@@ -4095,7 +4178,8 @@ static void ocfs2_split_record(struct inode *inode,
 	}
 
 	rec = &el->l_recs[index];
-	ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
+	ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
+				split, rec, split_rec);
 	ocfs2_rotate_leaf(insert_el, split_rec);
 }
 
@@ -4107,8 +4191,8 @@ static void ocfs2_split_record(struct inode *inode,
 * in. left_path should only be passed in if we need to update that
 * portion of the tree after an edge insert.
 */
-static int ocfs2_insert_path(struct inode *inode,
-			     handle_t *handle,
+static int ocfs2_insert_path(handle_t *handle,
+			     struct ocfs2_extent_tree *et,
 			     struct ocfs2_path *left_path,
 			     struct ocfs2_path *right_path,
 			     struct ocfs2_extent_rec *insert_rec,
@@ -4134,7 +4218,7 @@ static int ocfs2_insert_path(struct inode *inode,
 			goto out;
 		}
 
-		ret = ocfs2_journal_access_path(inode, handle, left_path);
+		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -4145,7 +4229,7 @@ static int ocfs2_insert_path(struct inode *inode,
 	 * Pass both paths to the journal. The majority of inserts
 	 * will be touching all components anyway.
 	 */
-	ret = ocfs2_journal_access_path(inode, handle, right_path);
+	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -4157,7 +4241,7 @@ static int ocfs2_insert_path(struct inode *inode,
 	 * of splits, but it's easier to just let one separate
 	 * function sort it all out.
 	 */
-	ocfs2_split_record(inode, left_path, right_path,
+	ocfs2_split_record(et, left_path, right_path,
 			   insert_rec, insert->ins_split);
 
 	/*
@@ -4171,8 +4255,8 @@ static int ocfs2_insert_path(struct inode *inode,
 		if (ret)
 			mlog_errno(ret);
 	} else
-		ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
-				     insert, inode);
+		ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
+				     insert);
 
 	ret = ocfs2_journal_dirty(handle, leaf_bh);
 	if (ret)
@@ -4185,10 +4269,10 @@ static int ocfs2_insert_path(struct inode *inode,
 	 *
 	 * XXX: Should we extend the transaction here?
 	 */
-	subtree_index = ocfs2_find_subtree_root(inode, left_path,
+	subtree_index = ocfs2_find_subtree_root(et, left_path,
 						right_path);
-	ocfs2_complete_edge_insert(inode, handle, left_path,
-				   right_path, subtree_index);
+	ocfs2_complete_edge_insert(handle, left_path, right_path,
+				   subtree_index);
 	}
 
 	ret = 0;
@@ -4196,8 +4280,7 @@ out:
 	return ret;
 }
 
-static int ocfs2_do_insert_extent(struct inode *inode,
-				  handle_t *handle,
+static int ocfs2_do_insert_extent(handle_t *handle,
 				  struct ocfs2_extent_tree *et,
 				  struct ocfs2_extent_rec *insert_rec,
 				  struct ocfs2_insert_type *type)
@@ -4210,7 +4293,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 
 	el = et->et_root_el;
 
-	ret = ocfs2_et_root_journal_access(handle, inode, et,
+	ret = ocfs2_et_root_journal_access(handle, et,
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
@@ -4218,7 +4301,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 	}
 
 	if (le16_to_cpu(el->l_tree_depth) == 0) {
-		ocfs2_insert_at_leaf(insert_rec, el, type, inode);
+		ocfs2_insert_at_leaf(et, insert_rec, el, type);
 		goto out_update_clusters;
 	}
 
@@ -4241,7 +4324,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		cpos = UINT_MAX;
 	}
 
-	ret = ocfs2_find_path(inode, right_path, cpos);
+	ret = ocfs2_find_path(et->et_ci, right_path, cpos);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4260,7 +4343,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 	 * can wind up skipping both of these two special cases...
 	 */
 	if (rotate) {
-		ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
+		ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
 					      le32_to_cpu(insert_rec->e_cpos),
 					      right_path, &left_path);
 		if (ret) {
@@ -4272,7 +4355,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		 * ocfs2_rotate_tree_right() might have extended the
 		 * transaction without re-journaling our tree root.
 		 */
-		ret = ocfs2_et_root_journal_access(handle, inode, et,
+		ret = ocfs2_et_root_journal_access(handle, et,
 						   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
@@ -4280,7 +4363,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		}
 	} else if (type->ins_appending == APPEND_TAIL
 		   && type->ins_contig != CONTIG_LEFT) {
-		ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
+		ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
 					       right_path, &left_path);
 		if (ret) {
 			mlog_errno(ret);
@@ -4288,7 +4371,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		}
 	}
 
-	ret = ocfs2_insert_path(inode, handle, left_path, right_path,
+	ret = ocfs2_insert_path(handle, et, left_path, right_path,
 				insert_rec, type);
 	if (ret) {
 		mlog_errno(ret);
@@ -4297,7 +4380,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 
 out_update_clusters:
 	if (type->ins_split == SPLIT_NONE)
-		ocfs2_et_update_clusters(inode, et,
+		ocfs2_et_update_clusters(et,
 					 le16_to_cpu(insert_rec->e_leaf_clusters));
 
 	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
@@ -4312,7 +4395,8 @@ out:
 }
 
 static enum ocfs2_contig_type
-ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
+ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
+			       struct ocfs2_path *path,
 			       struct ocfs2_extent_list *el, int index,
 			       struct ocfs2_extent_rec *split_rec)
 {
@@ -4324,12 +4408,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
 	struct ocfs2_path *left_path = NULL, *right_path = NULL;
 	struct buffer_head *bh;
 	struct ocfs2_extent_block *eb;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
 
 	if (index > 0) {
 		rec = &el->l_recs[index - 1];
 	} else if (path->p_tree_depth > 0) {
-		status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
-						       path, &left_cpos);
+		status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
 		if (status)
 			goto out;
 
@@ -4338,7 +4422,8 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
 		if (!left_path)
 			goto out;
 
-		status = ocfs2_find_path(inode, left_path, left_cpos);
+		status = ocfs2_find_path(et->et_ci, left_path,
+					 left_cpos);
 		if (status)
 			goto out;
 
@@ -4348,7 +4433,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
 		    le16_to_cpu(new_el->l_count)) {
 			bh = path_leaf_bh(left_path);
 			eb = (struct ocfs2_extent_block *)bh->b_data;
-			ocfs2_error(inode->i_sb,
+			ocfs2_error(sb,
 				    "Extent block #%llu has an "
 				    "invalid l_next_free_rec of "
 				    "%d. It should have "
@@ -4373,7 +4458,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
 		if (split_rec->e_cpos == el->l_recs[index].e_cpos)
 			ret = CONTIG_RIGHT;
 	} else {
-		ret = ocfs2_extent_contig(inode, rec, split_rec);
+		ret = ocfs2_et_extent_contig(et, rec, split_rec);
 		}
 	}
 
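ocfs2_et_extent_contig() is deciding whether two records abut both logically (cluster offsets) and physically (disk blocks); only then may they merge. A toy version of the predicate is below (simplified: the real check also compares extent flags, and the tree's operations can override it, e.g. for unwritten extents):

    #include <stdio.h>
    #include <stdint.h>

    #define BLOCKS_PER_CLUSTER 8    /* example geometry */

    struct extent_rec {
        uint32_t e_cpos;        /* logical start, in clusters */
        uint64_t e_blkno;       /* physical start, in blocks */
        uint16_t e_clusters;    /* length, in clusters */
    };

    /* Left is contiguous with right if both ends line up exactly. */
    static int extents_contig(const struct extent_rec *l,
                              const struct extent_rec *r)
    {
        return l->e_cpos + l->e_clusters == r->e_cpos &&
               l->e_blkno + (uint64_t)l->e_clusters * BLOCKS_PER_CLUSTER ==
               r->e_blkno;
    }

    int main(void)
    {
        struct extent_rec a = { 0, 800, 4 };
        struct extent_rec b = { 4, 832, 2 };    /* 800 + 4*8 == 832 */

        printf("contig: %d\n", extents_contig(&a, &b));   /* prints 1 */
        return 0;
    }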
@@ -4382,8 +4467,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4382 rec = &el->l_recs[index + 1]; 4467 rec = &el->l_recs[index + 1];
4383 else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) && 4468 else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4384 path->p_tree_depth > 0) { 4469 path->p_tree_depth > 0) {
4385 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb, 4470 status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
4386 path, &right_cpos);
4387 if (status) 4471 if (status)
4388 goto out; 4472 goto out;
4389 4473
@@ -4394,7 +4478,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4394 if (!right_path) 4478 if (!right_path)
4395 goto out; 4479 goto out;
4396 4480
4397 status = ocfs2_find_path(inode, right_path, right_cpos); 4481 status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4398 if (status) 4482 if (status)
4399 goto out; 4483 goto out;
4400 4484
@@ -4404,7 +4488,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4404 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { 4488 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4405 bh = path_leaf_bh(right_path); 4489 bh = path_leaf_bh(right_path);
4406 eb = (struct ocfs2_extent_block *)bh->b_data; 4490 eb = (struct ocfs2_extent_block *)bh->b_data;
4407 ocfs2_error(inode->i_sb, 4491 ocfs2_error(sb,
4408 "Extent block #%llu has an " 4492 "Extent block #%llu has an "
4409 "invalid l_next_free_rec of %d", 4493 "invalid l_next_free_rec of %d",
4410 (unsigned long long)le64_to_cpu(eb->h_blkno), 4494 (unsigned long long)le64_to_cpu(eb->h_blkno),
@@ -4419,7 +4503,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4419 if (rec) { 4503 if (rec) {
4420 enum ocfs2_contig_type contig_type; 4504 enum ocfs2_contig_type contig_type;
4421 4505
4422 contig_type = ocfs2_extent_contig(inode, rec, split_rec); 4506 contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
4423 4507
4424 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) 4508 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4425 ret = CONTIG_LEFTRIGHT; 4509 ret = CONTIG_LEFTRIGHT;
@@ -4436,11 +4520,10 @@ out:
4436 return ret; 4520 return ret;
4437} 4521}
4438 4522
4439static void ocfs2_figure_contig_type(struct inode *inode, 4523static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
4440 struct ocfs2_insert_type *insert, 4524 struct ocfs2_insert_type *insert,
4441 struct ocfs2_extent_list *el, 4525 struct ocfs2_extent_list *el,
4442 struct ocfs2_extent_rec *insert_rec, 4526 struct ocfs2_extent_rec *insert_rec)
4443 struct ocfs2_extent_tree *et)
4444{ 4527{
4445 int i; 4528 int i;
4446 enum ocfs2_contig_type contig_type = CONTIG_NONE; 4529 enum ocfs2_contig_type contig_type = CONTIG_NONE;
@@ -4448,8 +4531,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
4448 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 4531 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4449 4532
4450 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 4533 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4451 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], 4534 contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
4452 insert_rec); 4535 insert_rec);
4453 if (contig_type != CONTIG_NONE) { 4536 if (contig_type != CONTIG_NONE) {
4454 insert->ins_contig_index = i; 4537 insert->ins_contig_index = i;
4455 break; 4538 break;
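
The contiguity test now runs through the extent tree's operations vector, so a tree type can supply its own rule. As a rough sketch of the underlying adjacency check (hedged: the in-tree helper also compares record flags and honors et_max_leaf_clusters), an insert record can be glued onto the right end of an existing record when both the logical and the physical ranges abut:

	static int sketch_is_contig_right(struct super_block *sb,
					  struct ocfs2_extent_rec *ext,
					  struct ocfs2_extent_rec *insert_rec)
	{
		/* Logical end of the existing record, in clusters. */
		u32 cpos_end = le32_to_cpu(ext->e_cpos) +
			       le16_to_cpu(ext->e_leaf_clusters);
		/* Physical end, converted from clusters to blocks. */
		u64 blk_end = le64_to_cpu(ext->e_blkno) +
			      ocfs2_clusters_to_blocks(sb,
					le16_to_cpu(ext->e_leaf_clusters));

		return le32_to_cpu(insert_rec->e_cpos) == cpos_end &&
		       le64_to_cpu(insert_rec->e_blkno) == blk_end;
	}
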
@@ -4530,8 +4613,7 @@ set_tail_append:
4530 * All of the information is stored on the ocfs2_insert_type 4613 * All of the information is stored on the ocfs2_insert_type
4531 * structure. 4614 * structure.
4532 */ 4615 */
4533static int ocfs2_figure_insert_type(struct inode *inode, 4616static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
4534 struct ocfs2_extent_tree *et,
4535 struct buffer_head **last_eb_bh, 4617 struct buffer_head **last_eb_bh,
4536 struct ocfs2_extent_rec *insert_rec, 4618 struct ocfs2_extent_rec *insert_rec,
4537 int *free_records, 4619 int *free_records,
@@ -4555,7 +4637,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4555 * ocfs2_figure_insert_type() and ocfs2_add_branch() 4637 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4556 * may want it later. 4638 * may want it later.
4557 */ 4639 */
4558 ret = ocfs2_read_extent_block(inode, 4640 ret = ocfs2_read_extent_block(et->et_ci,
4559 ocfs2_et_get_last_eb_blk(et), 4641 ocfs2_et_get_last_eb_blk(et),
4560 &bh); 4642 &bh);
4561 if (ret) { 4643 if (ret) {
@@ -4578,7 +4660,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4578 le16_to_cpu(el->l_next_free_rec); 4660 le16_to_cpu(el->l_next_free_rec);
4579 4661
4580 if (!insert->ins_tree_depth) { 4662 if (!insert->ins_tree_depth) {
4581 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); 4663 ocfs2_figure_contig_type(et, insert, el, insert_rec);
4582 ocfs2_figure_appending_type(insert, el, insert_rec); 4664 ocfs2_figure_appending_type(insert, el, insert_rec);
4583 return 0; 4665 return 0;
4584 } 4666 }
@@ -4596,7 +4678,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4596 * us the rightmost tree path. This is accounted for below in 4678 * us the rightmost tree path. This is accounted for below in
4597 * the appending code. 4679 * the appending code.
4598 */ 4680 */
4599 ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); 4681 ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
4600 if (ret) { 4682 if (ret) {
4601 mlog_errno(ret); 4683 mlog_errno(ret);
4602 goto out; 4684 goto out;
@@ -4612,7 +4694,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
4612 * into two types of appends: simple record append, or a 4694 * into two types of appends: simple record append, or a
4613 * rotate inside the tail leaf. 4695 * rotate inside the tail leaf.
4614 */ 4696 */
4615 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); 4697 ocfs2_figure_contig_type(et, insert, el, insert_rec);
4616 4698
4617 /* 4699 /*
4618 * The insert code isn't quite ready to deal with all cases of 4700 * The insert code isn't quite ready to deal with all cases of
@@ -4657,13 +4739,11 @@ out:
4657} 4739}
4658 4740
4659/* 4741/*
4660 * Insert an extent into an inode btree. 4742 * Insert an extent into a btree.
4661 * 4743 *
4662 * The caller needs to update fe->i_clusters 4744 * The caller needs to update the owning btree's cluster count.
4663 */ 4745 */
4664int ocfs2_insert_extent(struct ocfs2_super *osb, 4746int ocfs2_insert_extent(handle_t *handle,
4665 handle_t *handle,
4666 struct inode *inode,
4667 struct ocfs2_extent_tree *et, 4747 struct ocfs2_extent_tree *et,
4668 u32 cpos, 4748 u32 cpos,
4669 u64 start_blk, 4749 u64 start_blk,
@@ -4677,21 +4757,22 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4677 struct ocfs2_insert_type insert = {0, }; 4757 struct ocfs2_insert_type insert = {0, };
4678 struct ocfs2_extent_rec rec; 4758 struct ocfs2_extent_rec rec;
4679 4759
4680 mlog(0, "add %u clusters at position %u to inode %llu\n", 4760 mlog(0, "add %u clusters at position %u to owner %llu\n",
4681 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4761 new_clusters, cpos,
4762 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4682 4763
4683 memset(&rec, 0, sizeof(rec)); 4764 memset(&rec, 0, sizeof(rec));
4684 rec.e_cpos = cpu_to_le32(cpos); 4765 rec.e_cpos = cpu_to_le32(cpos);
4685 rec.e_blkno = cpu_to_le64(start_blk); 4766 rec.e_blkno = cpu_to_le64(start_blk);
4686 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 4767 rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4687 rec.e_flags = flags; 4768 rec.e_flags = flags;
4688 status = ocfs2_et_insert_check(inode, et, &rec); 4769 status = ocfs2_et_insert_check(et, &rec);
4689 if (status) { 4770 if (status) {
4690 mlog_errno(status); 4771 mlog_errno(status);
4691 goto bail; 4772 goto bail;
4692 } 4773 }
4693 4774
4694 status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec, 4775 status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
4695 &free_records, &insert); 4776 &free_records, &insert);
4696 if (status < 0) { 4777 if (status < 0) {
4697 mlog_errno(status); 4778 mlog_errno(status);
@@ -4705,7 +4786,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4705 free_records, insert.ins_tree_depth); 4786 free_records, insert.ins_tree_depth);
4706 4787
4707 if (insert.ins_contig == CONTIG_NONE && free_records == 0) { 4788 if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4708 status = ocfs2_grow_tree(inode, handle, et, 4789 status = ocfs2_grow_tree(handle, et,
4709 &insert.ins_tree_depth, &last_eb_bh, 4790 &insert.ins_tree_depth, &last_eb_bh,
4710 meta_ac); 4791 meta_ac);
4711 if (status) { 4792 if (status) {
@@ -4715,11 +4796,11 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
4715 } 4796 }
4716 4797
4717 /* Finally, we can add clusters. This might rotate the tree for us. */ 4798 /* Finally, we can add clusters. This might rotate the tree for us. */
4718 status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert); 4799 status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
4719 if (status < 0) 4800 if (status < 0)
4720 mlog_errno(status); 4801 mlog_errno(status);
4721 else if (et->et_ops == &ocfs2_dinode_et_ops) 4802 else
4722 ocfs2_extent_map_insert_rec(inode, &rec); 4803 ocfs2_et_extent_map_insert(et, &rec);
4723 4804
4724bail: 4805bail:
4725 brelse(last_eb_bh); 4806 brelse(last_eb_bh);
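
With this signature, a caller supplies only the journal handle and an initialized extent tree; the owning object (dinode, xattr value, directory index, and so on) is reached through et->et_ci. A minimal caller sketch for the dinode case, using names from this patch set:

	struct ocfs2_extent_tree et;

	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
	status = ocfs2_insert_extent(handle, &et, cpos, start_blk,
				     new_clusters, flags, meta_ac);
	if (status)
		mlog_errno(status);
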
@@ -4735,13 +4816,11 @@ bail:
4735 * it is not limited to the file storage. Any extent tree can use this 4816 * it is not limited to the file storage. Any extent tree can use this
4736 * function if it implements the proper ocfs2_extent_tree. 4817 * function if it implements the proper ocfs2_extent_tree.
4737 */ 4818 */
4738int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, 4819int ocfs2_add_clusters_in_btree(handle_t *handle,
4739 struct inode *inode, 4820 struct ocfs2_extent_tree *et,
4740 u32 *logical_offset, 4821 u32 *logical_offset,
4741 u32 clusters_to_add, 4822 u32 clusters_to_add,
4742 int mark_unwritten, 4823 int mark_unwritten,
4743 struct ocfs2_extent_tree *et,
4744 handle_t *handle,
4745 struct ocfs2_alloc_context *data_ac, 4824 struct ocfs2_alloc_context *data_ac,
4746 struct ocfs2_alloc_context *meta_ac, 4825 struct ocfs2_alloc_context *meta_ac,
4747 enum ocfs2_alloc_restarted *reason_ret) 4826 enum ocfs2_alloc_restarted *reason_ret)
@@ -4752,13 +4831,15 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4752 u32 bit_off, num_bits; 4831 u32 bit_off, num_bits;
4753 u64 block; 4832 u64 block;
4754 u8 flags = 0; 4833 u8 flags = 0;
4834 struct ocfs2_super *osb =
4835 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
4755 4836
4756 BUG_ON(!clusters_to_add); 4837 BUG_ON(!clusters_to_add);
4757 4838
4758 if (mark_unwritten) 4839 if (mark_unwritten)
4759 flags = OCFS2_EXT_UNWRITTEN; 4840 flags = OCFS2_EXT_UNWRITTEN;
4760 4841
4761 free_extents = ocfs2_num_free_extents(osb, inode, et); 4842 free_extents = ocfs2_num_free_extents(osb, et);
4762 if (free_extents < 0) { 4843 if (free_extents < 0) {
4763 status = free_extents; 4844 status = free_extents;
4764 mlog_errno(status); 4845 mlog_errno(status);
@@ -4795,7 +4876,7 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4795 BUG_ON(num_bits > clusters_to_add); 4876 BUG_ON(num_bits > clusters_to_add);
4796 4877
4797 /* reserve our write early -- insert_extent may update the tree root */ 4878 /* reserve our write early -- insert_extent may update the tree root */
4798 status = ocfs2_et_root_journal_access(handle, inode, et, 4879 status = ocfs2_et_root_journal_access(handle, et,
4799 OCFS2_JOURNAL_ACCESS_WRITE); 4880 OCFS2_JOURNAL_ACCESS_WRITE);
4800 if (status < 0) { 4881 if (status < 0) {
4801 mlog_errno(status); 4882 mlog_errno(status);
@@ -4803,10 +4884,10 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4803 } 4884 }
4804 4885
4805 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 4886 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4806 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 4887 mlog(0, "Allocating %u clusters at block %u for owner %llu\n",
4807 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4888 num_bits, bit_off,
4808 status = ocfs2_insert_extent(osb, handle, inode, et, 4889 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4809 *logical_offset, block, 4890 status = ocfs2_insert_extent(handle, et, *logical_offset, block,
4810 num_bits, flags, meta_ac); 4891 num_bits, flags, meta_ac);
4811 if (status < 0) { 4892 if (status < 0) {
4812 mlog_errno(status); 4893 mlog_errno(status);
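
Helpers that used to take osb or inode now derive both from the tree's cache handle, as the two added lines above do. The pattern, sketched with the accessors this series introduces:

	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
	struct ocfs2_super *osb = OCFS2_SB(sb);
	/* Owner block number, used for log and error messages. */
	u64 owner = ocfs2_metadata_cache_owner(et->et_ci);
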
@@ -4856,10 +4937,9 @@ static void ocfs2_make_right_split_rec(struct super_block *sb,
4856 split_rec->e_flags = rec->e_flags; 4937 split_rec->e_flags = rec->e_flags;
4857} 4938}
4858 4939
4859static int ocfs2_split_and_insert(struct inode *inode, 4940static int ocfs2_split_and_insert(handle_t *handle,
4860 handle_t *handle,
4861 struct ocfs2_path *path,
4862 struct ocfs2_extent_tree *et, 4941 struct ocfs2_extent_tree *et,
4942 struct ocfs2_path *path,
4863 struct buffer_head **last_eb_bh, 4943 struct buffer_head **last_eb_bh,
4864 int split_index, 4944 int split_index,
4865 struct ocfs2_extent_rec *orig_split_rec, 4945 struct ocfs2_extent_rec *orig_split_rec,
@@ -4892,7 +4972,7 @@ leftright:
4892 4972
4893 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4973 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4894 le16_to_cpu(rightmost_el->l_count)) { 4974 le16_to_cpu(rightmost_el->l_count)) {
4895 ret = ocfs2_grow_tree(inode, handle, et, 4975 ret = ocfs2_grow_tree(handle, et,
4896 &depth, last_eb_bh, meta_ac); 4976 &depth, last_eb_bh, meta_ac);
4897 if (ret) { 4977 if (ret) {
4898 mlog_errno(ret); 4978 mlog_errno(ret);
@@ -4921,8 +5001,8 @@ leftright:
4921 */ 5001 */
4922 insert.ins_split = SPLIT_RIGHT; 5002 insert.ins_split = SPLIT_RIGHT;
4923 5003
4924 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range, 5004 ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4925 &rec); 5005 &tmprec, insert_range, &rec);
4926 5006
4927 split_rec = tmprec; 5007 split_rec = tmprec;
4928 5008
@@ -4930,7 +5010,7 @@ leftright:
4930 do_leftright = 1; 5010 do_leftright = 1;
4931 } 5011 }
4932 5012
4933 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); 5013 ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
4934 if (ret) { 5014 if (ret) {
4935 mlog_errno(ret); 5015 mlog_errno(ret);
4936 goto out; 5016 goto out;
@@ -4946,7 +5026,7 @@ leftright:
4946 ocfs2_reinit_path(path, 1); 5026 ocfs2_reinit_path(path, 1);
4947 5027
4948 cpos = le32_to_cpu(split_rec.e_cpos); 5028 cpos = le32_to_cpu(split_rec.e_cpos);
4949 ret = ocfs2_find_path(inode, path, cpos); 5029 ret = ocfs2_find_path(et->et_ci, path, cpos);
4950 if (ret) { 5030 if (ret) {
4951 mlog_errno(ret); 5031 mlog_errno(ret);
4952 goto out; 5032 goto out;
@@ -4961,8 +5041,8 @@ out:
4961 return ret; 5041 return ret;
4962} 5042}
4963 5043
4964static int ocfs2_replace_extent_rec(struct inode *inode, 5044static int ocfs2_replace_extent_rec(handle_t *handle,
4965 handle_t *handle, 5045 struct ocfs2_extent_tree *et,
4966 struct ocfs2_path *path, 5046 struct ocfs2_path *path,
4967 struct ocfs2_extent_list *el, 5047 struct ocfs2_extent_list *el,
4968 int split_index, 5048 int split_index,
@@ -4970,7 +5050,7 @@ static int ocfs2_replace_extent_rec(struct inode *inode,
4970{ 5050{
4971 int ret; 5051 int ret;
4972 5052
4973 ret = ocfs2_path_bh_journal_access(handle, inode, path, 5053 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
4974 path_num_items(path) - 1); 5054 path_num_items(path) - 1);
4975 if (ret) { 5055 if (ret) {
4976 mlog_errno(ret); 5056 mlog_errno(ret);
@@ -4985,9 +5065,8 @@ out:
4985} 5065}
4986 5066
4987/* 5067/*
4988 * Mark part or all of the extent record at split_index in the leaf 5068 * Split part or all of the extent record at split_index in the leaf
4989 * pointed to by path as written. This removes the unwritten 5069 * pointed to by path. Merge with the contiguous extent record if needed.
4990 * extent flag.
4991 * 5070 *
4992 * Care is taken to handle contiguousness so as to not grow the tree. 5071 * Care is taken to handle contiguousness so as to not grow the tree.
4993 * 5072 *
@@ -5004,14 +5083,13 @@ out:
5004 * have been brought into cache (and pinned via the journal), so the 5083 * have been brought into cache (and pinned via the journal), so the
5005 * extra overhead is not expressed in terms of disk reads. 5084 * extra overhead is not expressed in terms of disk reads.
5006 */ 5085 */
5007static int __ocfs2_mark_extent_written(struct inode *inode, 5086int ocfs2_split_extent(handle_t *handle,
5008 struct ocfs2_extent_tree *et, 5087 struct ocfs2_extent_tree *et,
5009 handle_t *handle, 5088 struct ocfs2_path *path,
5010 struct ocfs2_path *path, 5089 int split_index,
5011 int split_index, 5090 struct ocfs2_extent_rec *split_rec,
5012 struct ocfs2_extent_rec *split_rec, 5091 struct ocfs2_alloc_context *meta_ac,
5013 struct ocfs2_alloc_context *meta_ac, 5092 struct ocfs2_cached_dealloc_ctxt *dealloc)
5014 struct ocfs2_cached_dealloc_ctxt *dealloc)
5015{ 5093{
5016 int ret = 0; 5094 int ret = 0;
5017 struct ocfs2_extent_list *el = path_leaf_el(path); 5095 struct ocfs2_extent_list *el = path_leaf_el(path);
@@ -5020,12 +5098,6 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
5020 struct ocfs2_merge_ctxt ctxt; 5098 struct ocfs2_merge_ctxt ctxt;
5021 struct ocfs2_extent_list *rightmost_el; 5099 struct ocfs2_extent_list *rightmost_el;
5022 5100
5023 if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
5024 ret = -EIO;
5025 mlog_errno(ret);
5026 goto out;
5027 }
5028
5029 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || 5101 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
5030 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < 5102 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
5031 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { 5103 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
@@ -5034,19 +5106,19 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
5034 goto out; 5106 goto out;
5035 } 5107 }
5036 5108
5037 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el, 5109 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
5038 split_index, 5110 split_index,
5039 split_rec); 5111 split_rec);
5040 5112
5041 /* 5113 /*
5042 * The core merge / split code wants to know how much room is 5114 * The core merge / split code wants to know how much room is
5043 * left in this inodes allocation tree, so we pass the 5115 * left in this allocation tree, so we pass the
5044 * rightmost extent list. 5116 * rightmost extent list.
5045 */ 5117 */
5046 if (path->p_tree_depth) { 5118 if (path->p_tree_depth) {
5047 struct ocfs2_extent_block *eb; 5119 struct ocfs2_extent_block *eb;
5048 5120
5049 ret = ocfs2_read_extent_block(inode, 5121 ret = ocfs2_read_extent_block(et->et_ci,
5050 ocfs2_et_get_last_eb_blk(et), 5122 ocfs2_et_get_last_eb_blk(et),
5051 &last_eb_bh); 5123 &last_eb_bh);
5052 if (ret) { 5124 if (ret) {
@@ -5073,19 +5145,18 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
5073 5145
5074 if (ctxt.c_contig_type == CONTIG_NONE) { 5146 if (ctxt.c_contig_type == CONTIG_NONE) {
5075 if (ctxt.c_split_covers_rec) 5147 if (ctxt.c_split_covers_rec)
5076 ret = ocfs2_replace_extent_rec(inode, handle, 5148 ret = ocfs2_replace_extent_rec(handle, et, path, el,
5077 path, el,
5078 split_index, split_rec); 5149 split_index, split_rec);
5079 else 5150 else
5080 ret = ocfs2_split_and_insert(inode, handle, path, et, 5151 ret = ocfs2_split_and_insert(handle, et, path,
5081 &last_eb_bh, split_index, 5152 &last_eb_bh, split_index,
5082 split_rec, meta_ac); 5153 split_rec, meta_ac);
5083 if (ret) 5154 if (ret)
5084 mlog_errno(ret); 5155 mlog_errno(ret);
5085 } else { 5156 } else {
5086 ret = ocfs2_try_to_merge_extent(inode, handle, path, 5157 ret = ocfs2_try_to_merge_extent(handle, et, path,
5087 split_index, split_rec, 5158 split_index, split_rec,
5088 dealloc, &ctxt, et); 5159 dealloc, &ctxt);
5089 if (ret) 5160 if (ret)
5090 mlog_errno(ret); 5161 mlog_errno(ret);
5091 } 5162 }
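
The dispatch above is the whole split path: an exact cover with no contiguous neighbor is rewritten in place, a partial cover is carved out by split-and-insert (which may grow the tree and so needs meta_ac), and anything contiguous is merged instead. A hedged sketch of what a caller assembles before entering here, mirroring ocfs2_change_extent_flag() further down (cpos, len, start_blkno and rec_flags are illustrative):

	struct ocfs2_extent_rec split_rec;

	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
	split_rec.e_cpos = cpu_to_le32(cpos);
	split_rec.e_leaf_clusters = cpu_to_le16(len);
	split_rec.e_blkno = cpu_to_le64(start_blkno);
	split_rec.e_flags = rec_flags;

	ret = ocfs2_split_extent(handle, et, path, split_index,
				 &split_rec, meta_ac, dealloc);
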
@@ -5096,46 +5167,31 @@ out:
5096} 5167}
5097 5168
5098/* 5169/*
5099 * Mark the already-existing extent at cpos as written for len clusters. 5170 * Change the flags of the already-existing extent at cpos for len clusters.
5171 *
5172 * new_flags: the flags we want to set.
5173 * clear_flags: the flags we want to clear.
5174 * phys: the new physical offset we want this new extent to start from.
5100 * 5175 *
5101 * If the existing extent is larger than the request, initiate a 5176 * If the existing extent is larger than the request, initiate a
5102 * split. An attempt will be made at merging with adjacent extents. 5177 * split. An attempt will be made at merging with adjacent extents.
5103 * 5178 *
5104 * The caller is responsible for passing down meta_ac if we'll need it. 5179 * The caller is responsible for passing down meta_ac if we'll need it.
5105 */ 5180 */
5106int ocfs2_mark_extent_written(struct inode *inode, 5181int ocfs2_change_extent_flag(handle_t *handle,
5107 struct ocfs2_extent_tree *et, 5182 struct ocfs2_extent_tree *et,
5108 handle_t *handle, u32 cpos, u32 len, u32 phys, 5183 u32 cpos, u32 len, u32 phys,
5109 struct ocfs2_alloc_context *meta_ac, 5184 struct ocfs2_alloc_context *meta_ac,
5110 struct ocfs2_cached_dealloc_ctxt *dealloc) 5185 struct ocfs2_cached_dealloc_ctxt *dealloc,
5186 int new_flags, int clear_flags)
5111{ 5187{
5112 int ret, index; 5188 int ret, index;
5113 u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys); 5189 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5190 u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
5114 struct ocfs2_extent_rec split_rec; 5191 struct ocfs2_extent_rec split_rec;
5115 struct ocfs2_path *left_path = NULL; 5192 struct ocfs2_path *left_path = NULL;
5116 struct ocfs2_extent_list *el; 5193 struct ocfs2_extent_list *el;
5117 5194 struct ocfs2_extent_rec *rec;
5118 mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
5119 inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
5120
5121 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5122 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
5123 "that are being written to, but the feature bit "
5124 "is not set in the super block.",
5125 (unsigned long long)OCFS2_I(inode)->ip_blkno);
5126 ret = -EROFS;
5127 goto out;
5128 }
5129
5130 /*
5131 * XXX: This should be fixed up so that we just re-insert the
5132 * next extent records.
5133 *
5134 * XXX: This is a hack on the extent tree, maybe it should be
5135 * an op?
5136 */
5137 if (et->et_ops == &ocfs2_dinode_et_ops)
5138 ocfs2_extent_map_trunc(inode, 0);
5139 5195
5140 left_path = ocfs2_new_path_from_et(et); 5196 left_path = ocfs2_new_path_from_et(et);
5141 if (!left_path) { 5197 if (!left_path) {
@@ -5144,7 +5200,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
5144 goto out; 5200 goto out;
5145 } 5201 }
5146 5202
5147 ret = ocfs2_find_path(inode, left_path, cpos); 5203 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
5148 if (ret) { 5204 if (ret) {
5149 mlog_errno(ret); 5205 mlog_errno(ret);
5150 goto out; 5206 goto out;
@@ -5153,34 +5209,102 @@ int ocfs2_mark_extent_written(struct inode *inode,
5153 5209
5154 index = ocfs2_search_extent_list(el, cpos); 5210 index = ocfs2_search_extent_list(el, cpos);
5155 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5211 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5156 ocfs2_error(inode->i_sb, 5212 ocfs2_error(sb,
5157 "Inode %llu has an extent at cpos %u which can no " 5213 "Owner %llu has an extent at cpos %u which can no "
5158 "longer be found.\n", 5214 "longer be found.\n",
5159 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); 5215 (unsigned long long)
5216 ocfs2_metadata_cache_owner(et->et_ci), cpos);
5160 ret = -EROFS; 5217 ret = -EROFS;
5161 goto out; 5218 goto out;
5162 } 5219 }
5163 5220
5221 ret = -EIO;
5222 rec = &el->l_recs[index];
5223 if (new_flags && (rec->e_flags & new_flags)) {
5224 mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
5225 "extent that already had them",
5226 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5227 new_flags);
5228 goto out;
5229 }
5230
5231 if (clear_flags && !(rec->e_flags & clear_flags)) {
5232 mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
5233 "extent that didn't have them",
5234 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5235 clear_flags);
5236 goto out;
5237 }
5238
5164 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); 5239 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
5165 split_rec.e_cpos = cpu_to_le32(cpos); 5240 split_rec.e_cpos = cpu_to_le32(cpos);
5166 split_rec.e_leaf_clusters = cpu_to_le16(len); 5241 split_rec.e_leaf_clusters = cpu_to_le16(len);
5167 split_rec.e_blkno = cpu_to_le64(start_blkno); 5242 split_rec.e_blkno = cpu_to_le64(start_blkno);
5168 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; 5243 split_rec.e_flags = rec->e_flags;
5169 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; 5244 if (new_flags)
5170 5245 split_rec.e_flags |= new_flags;
5171 ret = __ocfs2_mark_extent_written(inode, et, handle, left_path, 5246 if (clear_flags)
5172 index, &split_rec, meta_ac, 5247 split_rec.e_flags &= ~clear_flags;
5173 dealloc); 5248
5249 ret = ocfs2_split_extent(handle, et, left_path,
5250 index, &split_rec, meta_ac,
5251 dealloc);
5174 if (ret) 5252 if (ret)
5175 mlog_errno(ret); 5253 mlog_errno(ret);
5176 5254
5177out: 5255out:
5178 ocfs2_free_path(left_path); 5256 ocfs2_free_path(left_path);
5179 return ret; 5257 return ret;
5258
5180} 5259}
5181 5260
5182static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, 5261/*
5183 handle_t *handle, struct ocfs2_path *path, 5262 * Mark the already-existing extent at cpos as written for len clusters.
5263 * This removes the unwritten extent flag.
5264 *
5265 * If the existing extent is larger than the request, initiate a
5266 * split. An attempt will be made at merging with adjacent extents.
5267 *
5268 * The caller is responsible for passing down meta_ac if we'll need it.
5269 */
5270int ocfs2_mark_extent_written(struct inode *inode,
5271 struct ocfs2_extent_tree *et,
5272 handle_t *handle, u32 cpos, u32 len, u32 phys,
5273 struct ocfs2_alloc_context *meta_ac,
5274 struct ocfs2_cached_dealloc_ctxt *dealloc)
5275{
5276 int ret;
5277
5278 mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n",
5279 inode->i_ino, cpos, len, phys);
5280
5281 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5282 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
5283 "that are being written to, but the feature bit "
5284 "is not set in the super block.",
5285 (unsigned long long)OCFS2_I(inode)->ip_blkno);
5286 ret = -EROFS;
5287 goto out;
5288 }
5289
5290 /*
5291 * XXX: This should be fixed up so that we just re-insert the
5292 * next extent records.
5293 */
5294 ocfs2_et_extent_map_truncate(et, 0);
5295
5296 ret = ocfs2_change_extent_flag(handle, et, cpos,
5297 len, phys, meta_ac, dealloc,
5298 0, OCFS2_EXT_UNWRITTEN);
5299 if (ret)
5300 mlog_errno(ret);
5301
5302out:
5303 return ret;
5304}
5305
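
ocfs2_mark_extent_written() is now just one user of the generic flag-change helper: it clears OCFS2_EXT_UNWRITTEN and sets nothing. Other transitions take the same shape; for instance, a caller marking a range refcounted (hedged: call site sketched here, flag from ocfs2_fs.h) would be:

	ret = ocfs2_change_extent_flag(handle, et, cpos, len, phys,
				       meta_ac, dealloc,
				       OCFS2_EXT_REFCOUNTED, 0);
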
5306static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5307 struct ocfs2_path *path,
5184 int index, u32 new_range, 5308 int index, u32 new_range,
5185 struct ocfs2_alloc_context *meta_ac) 5309 struct ocfs2_alloc_context *meta_ac)
5186{ 5310{
@@ -5197,11 +5321,12 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
5197 */ 5321 */
5198 el = path_leaf_el(path); 5322 el = path_leaf_el(path);
5199 rec = &el->l_recs[index]; 5323 rec = &el->l_recs[index];
5200 ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec); 5324 ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
5325 &split_rec, new_range, rec);
5201 5326
5202 depth = path->p_tree_depth; 5327 depth = path->p_tree_depth;
5203 if (depth > 0) { 5328 if (depth > 0) {
5204 ret = ocfs2_read_extent_block(inode, 5329 ret = ocfs2_read_extent_block(et->et_ci,
5205 ocfs2_et_get_last_eb_blk(et), 5330 ocfs2_et_get_last_eb_blk(et),
5206 &last_eb_bh); 5331 &last_eb_bh);
5207 if (ret < 0) { 5332 if (ret < 0) {
@@ -5224,7 +5349,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
5224 5349
5225 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 5350 if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5226 le16_to_cpu(rightmost_el->l_count)) { 5351 le16_to_cpu(rightmost_el->l_count)) {
5227 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh, 5352 ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
5228 meta_ac); 5353 meta_ac);
5229 if (ret) { 5354 if (ret) {
5230 mlog_errno(ret); 5355 mlog_errno(ret);
@@ -5238,7 +5363,7 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
5238 insert.ins_split = SPLIT_RIGHT; 5363 insert.ins_split = SPLIT_RIGHT;
5239 insert.ins_tree_depth = depth; 5364 insert.ins_tree_depth = depth;
5240 5365
5241 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); 5366 ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
5242 if (ret) 5367 if (ret)
5243 mlog_errno(ret); 5368 mlog_errno(ret);
5244 5369
@@ -5247,23 +5372,23 @@ out:
5247 return ret; 5372 return ret;
5248} 5373}
5249 5374
5250static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, 5375static int ocfs2_truncate_rec(handle_t *handle,
5376 struct ocfs2_extent_tree *et,
5251 struct ocfs2_path *path, int index, 5377 struct ocfs2_path *path, int index,
5252 struct ocfs2_cached_dealloc_ctxt *dealloc, 5378 struct ocfs2_cached_dealloc_ctxt *dealloc,
5253 u32 cpos, u32 len, 5379 u32 cpos, u32 len)
5254 struct ocfs2_extent_tree *et)
5255{ 5380{
5256 int ret; 5381 int ret;
5257 u32 left_cpos, rec_range, trunc_range; 5382 u32 left_cpos, rec_range, trunc_range;
5258 int wants_rotate = 0, is_rightmost_tree_rec = 0; 5383 int wants_rotate = 0, is_rightmost_tree_rec = 0;
5259 struct super_block *sb = inode->i_sb; 5384 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5260 struct ocfs2_path *left_path = NULL; 5385 struct ocfs2_path *left_path = NULL;
5261 struct ocfs2_extent_list *el = path_leaf_el(path); 5386 struct ocfs2_extent_list *el = path_leaf_el(path);
5262 struct ocfs2_extent_rec *rec; 5387 struct ocfs2_extent_rec *rec;
5263 struct ocfs2_extent_block *eb; 5388 struct ocfs2_extent_block *eb;
5264 5389
5265 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 5390 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5266 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); 5391 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5267 if (ret) { 5392 if (ret) {
5268 mlog_errno(ret); 5393 mlog_errno(ret);
5269 goto out; 5394 goto out;
@@ -5295,14 +5420,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5295 * by this leaf and the one to its left. 5420 * by this leaf and the one to its left.
5296 * 5421 *
5297 * There are two cases we can skip: 5422 * There are two cases we can skip:
5298 * 1) Path is the leftmost one in our inode tree. 5423 * 1) Path is the leftmost one in our btree.
5299 * 2) The leaf is rightmost and will be empty after 5424 * 2) The leaf is rightmost and will be empty after
5300 * we remove the extent record - the rotate code 5425 * we remove the extent record - the rotate code
5301 * knows how to update the newly formed edge. 5426 * knows how to update the newly formed edge.
5302 */ 5427 */
5303 5428
5304 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, 5429 ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
5305 &left_cpos);
5306 if (ret) { 5430 if (ret) {
5307 mlog_errno(ret); 5431 mlog_errno(ret);
5308 goto out; 5432 goto out;
@@ -5316,7 +5440,8 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5316 goto out; 5440 goto out;
5317 } 5441 }
5318 5442
5319 ret = ocfs2_find_path(inode, left_path, left_cpos); 5443 ret = ocfs2_find_path(et->et_ci, left_path,
5444 left_cpos);
5320 if (ret) { 5445 if (ret) {
5321 mlog_errno(ret); 5446 mlog_errno(ret);
5322 goto out; 5447 goto out;
@@ -5332,13 +5457,13 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5332 goto out; 5457 goto out;
5333 } 5458 }
5334 5459
5335 ret = ocfs2_journal_access_path(inode, handle, path); 5460 ret = ocfs2_journal_access_path(et->et_ci, handle, path);
5336 if (ret) { 5461 if (ret) {
5337 mlog_errno(ret); 5462 mlog_errno(ret);
5338 goto out; 5463 goto out;
5339 } 5464 }
5340 5465
5341 ret = ocfs2_journal_access_path(inode, handle, left_path); 5466 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
5342 if (ret) { 5467 if (ret) {
5343 mlog_errno(ret); 5468 mlog_errno(ret);
5344 goto out; 5469 goto out;
@@ -5361,7 +5486,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5361 * be deleted by the rotate code. 5486 * be deleted by the rotate code.
5362 */ 5487 */
5363 rec = &el->l_recs[next_free - 1]; 5488 rec = &el->l_recs[next_free - 1];
5364 ocfs2_adjust_rightmost_records(inode, handle, path, 5489 ocfs2_adjust_rightmost_records(handle, et, path,
5365 rec); 5490 rec);
5366 } 5491 }
5367 } else if (le32_to_cpu(rec->e_cpos) == cpos) { 5492 } else if (le32_to_cpu(rec->e_cpos) == cpos) {
@@ -5373,11 +5498,12 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5373 /* Remove rightmost portion of the record */ 5498 /* Remove rightmost portion of the record */
5374 le16_add_cpu(&rec->e_leaf_clusters, -len); 5499 le16_add_cpu(&rec->e_leaf_clusters, -len);
5375 if (is_rightmost_tree_rec) 5500 if (is_rightmost_tree_rec)
5376 ocfs2_adjust_rightmost_records(inode, handle, path, rec); 5501 ocfs2_adjust_rightmost_records(handle, et, path, rec);
5377 } else { 5502 } else {
5378 /* Caller should have trapped this. */ 5503 /* Caller should have trapped this. */
5379 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) " 5504 mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
5380 "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, 5505 "(%u, %u)\n",
5506 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5381 le32_to_cpu(rec->e_cpos), 5507 le32_to_cpu(rec->e_cpos),
5382 le16_to_cpu(rec->e_leaf_clusters), cpos, len); 5508 le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5383 BUG(); 5509 BUG();
@@ -5386,14 +5512,14 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5386 if (left_path) { 5512 if (left_path) {
5387 int subtree_index; 5513 int subtree_index;
5388 5514
5389 subtree_index = ocfs2_find_subtree_root(inode, left_path, path); 5515 subtree_index = ocfs2_find_subtree_root(et, left_path, path);
5390 ocfs2_complete_edge_insert(inode, handle, left_path, path, 5516 ocfs2_complete_edge_insert(handle, left_path, path,
5391 subtree_index); 5517 subtree_index);
5392 } 5518 }
5393 5519
5394 ocfs2_journal_dirty(handle, path_leaf_bh(path)); 5520 ocfs2_journal_dirty(handle, path_leaf_bh(path));
5395 5521
5396 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); 5522 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5397 if (ret) { 5523 if (ret) {
5398 mlog_errno(ret); 5524 mlog_errno(ret);
5399 goto out; 5525 goto out;
@@ -5404,9 +5530,9 @@ out:
5404 return ret; 5530 return ret;
5405} 5531}
5406 5532
5407int ocfs2_remove_extent(struct inode *inode, 5533int ocfs2_remove_extent(handle_t *handle,
5408 struct ocfs2_extent_tree *et, 5534 struct ocfs2_extent_tree *et,
5409 u32 cpos, u32 len, handle_t *handle, 5535 u32 cpos, u32 len,
5410 struct ocfs2_alloc_context *meta_ac, 5536 struct ocfs2_alloc_context *meta_ac,
5411 struct ocfs2_cached_dealloc_ctxt *dealloc) 5537 struct ocfs2_cached_dealloc_ctxt *dealloc)
5412{ 5538{
@@ -5416,7 +5542,11 @@ int ocfs2_remove_extent(struct inode *inode,
5416 struct ocfs2_extent_list *el; 5542 struct ocfs2_extent_list *el;
5417 struct ocfs2_path *path = NULL; 5543 struct ocfs2_path *path = NULL;
5418 5544
5419 ocfs2_extent_map_trunc(inode, 0); 5545 /*
5546 * XXX: Why are we truncating to 0 instead of wherever this
5547 * affects us?
5548 */
5549 ocfs2_et_extent_map_truncate(et, 0);
5420 5550
5421 path = ocfs2_new_path_from_et(et); 5551 path = ocfs2_new_path_from_et(et);
5422 if (!path) { 5552 if (!path) {
@@ -5425,7 +5555,7 @@ int ocfs2_remove_extent(struct inode *inode,
5425 goto out; 5555 goto out;
5426 } 5556 }
5427 5557
5428 ret = ocfs2_find_path(inode, path, cpos); 5558 ret = ocfs2_find_path(et->et_ci, path, cpos);
5429 if (ret) { 5559 if (ret) {
5430 mlog_errno(ret); 5560 mlog_errno(ret);
5431 goto out; 5561 goto out;
@@ -5434,10 +5564,11 @@ int ocfs2_remove_extent(struct inode *inode,
5434 el = path_leaf_el(path); 5564 el = path_leaf_el(path);
5435 index = ocfs2_search_extent_list(el, cpos); 5565 index = ocfs2_search_extent_list(el, cpos);
5436 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5566 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5437 ocfs2_error(inode->i_sb, 5567 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5438 "Inode %llu has an extent at cpos %u which can no " 5568 "Owner %llu has an extent at cpos %u which can no "
5439 "longer be found.\n", 5569 "longer be found.\n",
5440 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); 5570 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5571 cpos);
5441 ret = -EROFS; 5572 ret = -EROFS;
5442 goto out; 5573 goto out;
5443 } 5574 }
@@ -5464,20 +5595,21 @@ int ocfs2_remove_extent(struct inode *inode,
5464 5595
5465 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); 5596 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5466 5597
5467 mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d " 5598 mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d "
5468 "(cpos %u, len %u)\n", 5599 "(cpos %u, len %u)\n",
5469 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index, 5600 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5601 cpos, len, index,
5470 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); 5602 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5471 5603
5472 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { 5604 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5473 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5605 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5474 cpos, len, et); 5606 cpos, len);
5475 if (ret) { 5607 if (ret) {
5476 mlog_errno(ret); 5608 mlog_errno(ret);
5477 goto out; 5609 goto out;
5478 } 5610 }
5479 } else { 5611 } else {
5480 ret = ocfs2_split_tree(inode, et, handle, path, index, 5612 ret = ocfs2_split_tree(handle, et, path, index,
5481 trunc_range, meta_ac); 5613 trunc_range, meta_ac);
5482 if (ret) { 5614 if (ret) {
5483 mlog_errno(ret); 5615 mlog_errno(ret);
@@ -5490,7 +5622,7 @@ int ocfs2_remove_extent(struct inode *inode,
5490 */ 5622 */
5491 ocfs2_reinit_path(path, 1); 5623 ocfs2_reinit_path(path, 1);
5492 5624
5493 ret = ocfs2_find_path(inode, path, cpos); 5625 ret = ocfs2_find_path(et->et_ci, path, cpos);
5494 if (ret) { 5626 if (ret) {
5495 mlog_errno(ret); 5627 mlog_errno(ret);
5496 goto out; 5628 goto out;
@@ -5499,9 +5631,9 @@ int ocfs2_remove_extent(struct inode *inode,
5499 el = path_leaf_el(path); 5631 el = path_leaf_el(path);
5500 index = ocfs2_search_extent_list(el, cpos); 5632 index = ocfs2_search_extent_list(el, cpos);
5501 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5633 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5502 ocfs2_error(inode->i_sb, 5634 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5503 "Inode %llu: split at cpos %u lost record.", 5635 "Owner %llu: split at cpos %u lost record.",
5504 (unsigned long long)OCFS2_I(inode)->ip_blkno, 5636 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5505 cpos); 5637 cpos);
5506 ret = -EROFS; 5638 ret = -EROFS;
5507 goto out; 5639 goto out;
@@ -5515,18 +5647,18 @@ int ocfs2_remove_extent(struct inode *inode,
5515 rec_range = le32_to_cpu(rec->e_cpos) + 5647 rec_range = le32_to_cpu(rec->e_cpos) +
5516 ocfs2_rec_clusters(el, rec); 5648 ocfs2_rec_clusters(el, rec);
5517 if (rec_range != trunc_range) { 5649 if (rec_range != trunc_range) {
5518 ocfs2_error(inode->i_sb, 5650 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5519 "Inode %llu: error after split at cpos %u" 5651 "Owner %llu: error after split at cpos %u"
5520 "trunc len %u, existing record is (%u,%u)", 5652 "trunc len %u, existing record is (%u,%u)",
5521 (unsigned long long)OCFS2_I(inode)->ip_blkno, 5653 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5522 cpos, len, le32_to_cpu(rec->e_cpos), 5654 cpos, len, le32_to_cpu(rec->e_cpos),
5523 ocfs2_rec_clusters(el, rec)); 5655 ocfs2_rec_clusters(el, rec));
5524 ret = -EROFS; 5656 ret = -EROFS;
5525 goto out; 5657 goto out;
5526 } 5658 }
5527 5659
5528 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5660 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5529 cpos, len, et); 5661 cpos, len);
5530 if (ret) { 5662 if (ret) {
5531 mlog_errno(ret); 5663 mlog_errno(ret);
5532 goto out; 5664 goto out;
@@ -5573,7 +5705,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
5573 goto out; 5705 goto out;
5574 } 5706 }
5575 5707
5576 ret = ocfs2_et_root_journal_access(handle, inode, et, 5708 ret = ocfs2_et_root_journal_access(handle, et,
5577 OCFS2_JOURNAL_ACCESS_WRITE); 5709 OCFS2_JOURNAL_ACCESS_WRITE);
5578 if (ret) { 5710 if (ret) {
5579 mlog_errno(ret); 5711 mlog_errno(ret);
@@ -5583,14 +5715,13 @@ int ocfs2_remove_btree_range(struct inode *inode,
5583 vfs_dq_free_space_nodirty(inode, 5715 vfs_dq_free_space_nodirty(inode,
5584 ocfs2_clusters_to_bytes(inode->i_sb, len)); 5716 ocfs2_clusters_to_bytes(inode->i_sb, len));
5585 5717
5586 ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac, 5718 ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
5587 dealloc);
5588 if (ret) { 5719 if (ret) {
5589 mlog_errno(ret); 5720 mlog_errno(ret);
5590 goto out_commit; 5721 goto out_commit;
5591 } 5722 }
5592 5723
5593 ocfs2_et_update_clusters(inode, et, -len); 5724 ocfs2_et_update_clusters(et, -len);
5594 5725
5595 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 5726 ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5596 if (ret) { 5727 if (ret) {
@@ -5690,7 +5821,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5690 goto bail; 5821 goto bail;
5691 } 5822 }
5692 5823
5693 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, 5824 status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5694 OCFS2_JOURNAL_ACCESS_WRITE); 5825 OCFS2_JOURNAL_ACCESS_WRITE);
5695 if (status < 0) { 5826 if (status < 0) {
5696 mlog_errno(status); 5827 mlog_errno(status);
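
The truncate-log updates show the conversion used throughout the journaling layer: the journal_access helpers are keyed by a struct ocfs2_caching_info rather than an inode, with INODE_CACHE() bridging the two for real inodes. Sketch (assuming INODE_CACHE() yields the inode's caching info, as elsewhere in this series):

	/* Before: keyed by the inode. */
	status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);

	/* After: keyed by the metadata cache that owns the buffer. */
	status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode),
					 tl_bh, OCFS2_JOURNAL_ACCESS_WRITE);
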
@@ -5752,7 +5883,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5752 while (i >= 0) { 5883 while (i >= 0) {
5753 /* Caller has given us at least enough credits to 5884 /* Caller has given us at least enough credits to
5754 * update the truncate log dinode */ 5885 * update the truncate log dinode */
5755 status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, 5886 status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5756 OCFS2_JOURNAL_ACCESS_WRITE); 5887 OCFS2_JOURNAL_ACCESS_WRITE);
5757 if (status < 0) { 5888 if (status < 0) {
5758 mlog_errno(status); 5889 mlog_errno(status);
@@ -6010,7 +6141,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
6010 tl->tl_used = 0; 6141 tl->tl_used = 0;
6011 6142
6012 ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check); 6143 ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
6013 status = ocfs2_write_block(osb, tl_bh, tl_inode); 6144 status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
6014 if (status < 0) { 6145 if (status < 0) {
6015 mlog_errno(status); 6146 mlog_errno(status);
6016 goto bail; 6147 goto bail;
@@ -6400,9 +6531,9 @@ ocfs2_find_per_slot_free_list(int type,
6400 return fl; 6531 return fl;
6401} 6532}
6402 6533
6403static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 6534int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6404 int type, int slot, u64 blkno, 6535 int type, int slot, u64 blkno,
6405 unsigned int bit) 6536 unsigned int bit)
6406{ 6537{
6407 int ret; 6538 int ret;
6408 struct ocfs2_per_slot_free_list *fl; 6539 struct ocfs2_per_slot_free_list *fl;
@@ -6518,7 +6649,7 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6518 goto out; 6649 goto out;
6519 } 6650 }
6520 6651
6521 ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); 6652 ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
6522 if (ret) { 6653 if (ret) {
6523 mlog_errno(ret); 6654 mlog_errno(ret);
6524 goto out; 6655 goto out;
@@ -6551,7 +6682,7 @@ out:
6551 */ 6682 */
6552static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, 6683static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6553 handle_t *handle, struct ocfs2_truncate_context *tc, 6684 handle_t *handle, struct ocfs2_truncate_context *tc,
6554 u32 clusters_to_del, u64 *delete_start) 6685 u32 clusters_to_del, u64 *delete_start, u8 *flags)
6555{ 6686{
6556 int ret, i, index = path->p_tree_depth; 6687 int ret, i, index = path->p_tree_depth;
6557 u32 new_edge = 0; 6688 u32 new_edge = 0;
@@ -6561,6 +6692,7 @@ static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6561 struct ocfs2_extent_rec *rec; 6692 struct ocfs2_extent_rec *rec;
6562 6693
6563 *delete_start = 0; 6694 *delete_start = 0;
6695 *flags = 0;
6564 6696
6565 while (index >= 0) { 6697 while (index >= 0) {
6566 bh = path->p_node[index].bh; 6698 bh = path->p_node[index].bh;
@@ -6648,6 +6780,7 @@ find_tail_record:
6648 *delete_start = le64_to_cpu(rec->e_blkno) 6780 *delete_start = le64_to_cpu(rec->e_blkno)
6649 + ocfs2_clusters_to_blocks(inode->i_sb, 6781 + ocfs2_clusters_to_blocks(inode->i_sb,
6650 le16_to_cpu(rec->e_leaf_clusters)); 6782 le16_to_cpu(rec->e_leaf_clusters));
6783 *flags = rec->e_flags;
6651 6784
6652 /* 6785 /*
6653 * If it's now empty, remove this record. 6786 * If it's now empty, remove this record.
@@ -6719,7 +6852,7 @@ delete:
6719 6852
6720 mlog(0, "deleting this extent block.\n"); 6853 mlog(0, "deleting this extent block.\n");
6721 6854
6722 ocfs2_remove_from_cache(inode, bh); 6855 ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
6723 6856
6724 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); 6857 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6725 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); 6858 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
@@ -6747,7 +6880,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6747 struct buffer_head *fe_bh, 6880 struct buffer_head *fe_bh,
6748 handle_t *handle, 6881 handle_t *handle,
6749 struct ocfs2_truncate_context *tc, 6882 struct ocfs2_truncate_context *tc,
6750 struct ocfs2_path *path) 6883 struct ocfs2_path *path,
6884 struct ocfs2_alloc_context *meta_ac)
6751{ 6885{
6752 int status; 6886 int status;
6753 struct ocfs2_dinode *fe; 6887 struct ocfs2_dinode *fe;
@@ -6755,6 +6889,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6755 struct ocfs2_extent_list *el; 6889 struct ocfs2_extent_list *el;
6756 struct buffer_head *last_eb_bh = NULL; 6890 struct buffer_head *last_eb_bh = NULL;
6757 u64 delete_blk = 0; 6891 u64 delete_blk = 0;
6892 u8 rec_flags;
6758 6893
6759 fe = (struct ocfs2_dinode *) fe_bh->b_data; 6894 fe = (struct ocfs2_dinode *) fe_bh->b_data;
6760 6895
@@ -6769,14 +6904,14 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6769 * Each component will be touched, so we might as well journal 6904 * Each component will be touched, so we might as well journal
6770 * here to avoid having to handle errors later. 6905 * here to avoid having to handle errors later.
6771 */ 6906 */
6772 status = ocfs2_journal_access_path(inode, handle, path); 6907 status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
6773 if (status < 0) { 6908 if (status < 0) {
6774 mlog_errno(status); 6909 mlog_errno(status);
6775 goto bail; 6910 goto bail;
6776 } 6911 }
6777 6912
6778 if (last_eb_bh) { 6913 if (last_eb_bh) {
6779 status = ocfs2_journal_access_eb(handle, inode, last_eb_bh, 6914 status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
6780 OCFS2_JOURNAL_ACCESS_WRITE); 6915 OCFS2_JOURNAL_ACCESS_WRITE);
6781 if (status < 0) { 6916 if (status < 0) {
6782 mlog_errno(status); 6917 mlog_errno(status);
@@ -6810,7 +6945,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6810 inode->i_blocks = ocfs2_inode_sector_count(inode); 6945 inode->i_blocks = ocfs2_inode_sector_count(inode);
6811 6946
6812 status = ocfs2_trim_tree(inode, path, handle, tc, 6947 status = ocfs2_trim_tree(inode, path, handle, tc,
6813 clusters_to_del, &delete_blk); 6948 clusters_to_del, &delete_blk, &rec_flags);
6814 if (status) { 6949 if (status) {
6815 mlog_errno(status); 6950 mlog_errno(status);
6816 goto bail; 6951 goto bail;
@@ -6842,8 +6977,16 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
6842 } 6977 }
6843 6978
6844 if (delete_blk) { 6979 if (delete_blk) {
6845 status = ocfs2_truncate_log_append(osb, handle, delete_blk, 6980 if (rec_flags & OCFS2_EXT_REFCOUNTED)
6846 clusters_to_del); 6981 status = ocfs2_decrease_refcount(inode, handle,
6982 ocfs2_blocks_to_clusters(osb->sb,
6983 delete_blk),
6984 clusters_to_del, meta_ac,
6985 &tc->tc_dealloc, 1);
6986 else
6987 status = ocfs2_truncate_log_append(osb, handle,
6988 delete_blk,
6989 clusters_to_del);
6847 if (status < 0) { 6990 if (status < 0) {
6848 mlog_errno(status); 6991 mlog_errno(status);
6849 goto bail; 6992 goto bail;
@@ -6863,9 +7006,9 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6863 return 0; 7006 return 0;
6864} 7007}
6865 7008
6866static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, 7009void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6867 unsigned int from, unsigned int to, 7010 unsigned int from, unsigned int to,
6868 struct page *page, int zero, u64 *phys) 7011 struct page *page, int zero, u64 *phys)
6869{ 7012{
6870 int ret, partial = 0; 7013 int ret, partial = 0;
6871 7014
@@ -6933,20 +7076,16 @@ out:
6933 ocfs2_unlock_and_free_pages(pages, numpages); 7076 ocfs2_unlock_and_free_pages(pages, numpages);
6934} 7077}
6935 7078
6936static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, 7079int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
6937 struct page **pages, int *num) 7080 struct page **pages, int *num)
6938{ 7081{
6939 int numpages, ret = 0; 7082 int numpages, ret = 0;
6940 struct super_block *sb = inode->i_sb;
6941 struct address_space *mapping = inode->i_mapping; 7083 struct address_space *mapping = inode->i_mapping;
6942 unsigned long index; 7084 unsigned long index;
6943 loff_t last_page_bytes; 7085 loff_t last_page_bytes;
6944 7086
6945 BUG_ON(start > end); 7087 BUG_ON(start > end);
6946 7088
6947 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6948 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6949
6950 numpages = 0; 7089 numpages = 0;
6951 last_page_bytes = PAGE_ALIGN(end); 7090 last_page_bytes = PAGE_ALIGN(end);
6952 index = start >> PAGE_CACHE_SHIFT; 7091 index = start >> PAGE_CACHE_SHIFT;
@@ -6974,6 +7113,17 @@ out:
6974 return ret; 7113 return ret;
6975} 7114}
6976 7115
7116static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
7117 struct page **pages, int *num)
7118{
7119 struct super_block *sb = inode->i_sb;
7120
7121 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
7122 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
7123
7124 return ocfs2_grab_pages(inode, start, end, pages, num);
7125}
7126
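
Splitting the helper this way exports the general page-grabbing loop while only the EOF variant keeps the single-cluster assertion. A caller paging in a multi-cluster range (hypothetical call site; such users arrive with the refcount/CoW work) would use the export directly:

	ret = ocfs2_grab_pages(inode, range_start, range_end,
			       pages, &numpages);
	if (ret)
		mlog_errno(ret);
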
6977/* 7127/*
6978 * Zero the area past i_size but still within an allocated 7128 * Zero the area past i_size but still within an allocated
6979 * cluster. This avoids exposing nonzero data on subsequent file 7129 * cluster. This avoids exposing nonzero data on subsequent file
@@ -7138,7 +7288,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7138 goto out_unlock; 7288 goto out_unlock;
7139 } 7289 }
7140 7290
7141 ret = ocfs2_journal_access_di(handle, inode, di_bh, 7291 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7142 OCFS2_JOURNAL_ACCESS_WRITE); 7292 OCFS2_JOURNAL_ACCESS_WRITE);
7143 if (ret) { 7293 if (ret) {
7144 mlog_errno(ret); 7294 mlog_errno(ret);
@@ -7218,9 +7368,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7218 * this proves to be false, we could always re-build 7368 * this proves to be false, we could always re-build
7219 * the in-inode data from our pages. 7369 * the in-inode data from our pages.
7220 */ 7370 */
7221 ocfs2_init_dinode_extent_tree(&et, inode, di_bh); 7371 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7222 ret = ocfs2_insert_extent(osb, handle, inode, &et, 7372 ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
7223 0, block, 1, 0, NULL);
7224 if (ret) { 7373 if (ret) {
7225 mlog_errno(ret); 7374 mlog_errno(ret);
7226 goto out_commit; 7375 goto out_commit;
@@ -7262,11 +7411,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
7262{ 7411{
7263 int status, i, credits, tl_sem = 0; 7412 int status, i, credits, tl_sem = 0;
7264 u32 clusters_to_del, new_highest_cpos, range; 7413 u32 clusters_to_del, new_highest_cpos, range;
7414 u64 blkno = 0;
7265 struct ocfs2_extent_list *el; 7415 struct ocfs2_extent_list *el;
7266 handle_t *handle = NULL; 7416 handle_t *handle = NULL;
7267 struct inode *tl_inode = osb->osb_tl_inode; 7417 struct inode *tl_inode = osb->osb_tl_inode;
7268 struct ocfs2_path *path = NULL; 7418 struct ocfs2_path *path = NULL;
7269 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; 7419 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
7420 struct ocfs2_alloc_context *meta_ac = NULL;
7421 struct ocfs2_refcount_tree *ref_tree = NULL;
7270 7422
7271 mlog_entry_void(); 7423 mlog_entry_void();
7272 7424
@@ -7292,10 +7444,12 @@ start:
7292 goto bail; 7444 goto bail;
7293 } 7445 }
7294 7446
7447 credits = 0;
7448
7295 /* 7449 /*
7296 * Truncate always works against the rightmost tree branch. 7450 * Truncate always works against the rightmost tree branch.
7297 */ 7451 */
7298 status = ocfs2_find_path(inode, path, UINT_MAX); 7452 status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
7299 if (status) { 7453 if (status) {
7300 mlog_errno(status); 7454 mlog_errno(status);
7301 goto bail; 7455 goto bail;
@@ -7332,10 +7486,15 @@ start:
7332 clusters_to_del = 0; 7486 clusters_to_del = 0;
7333 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 7487 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
7334 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 7488 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
7489 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
7335 } else if (range > new_highest_cpos) { 7490 } else if (range > new_highest_cpos) {
7336 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 7491 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
7337 le32_to_cpu(el->l_recs[i].e_cpos)) - 7492 le32_to_cpu(el->l_recs[i].e_cpos)) -
7338 new_highest_cpos; 7493 new_highest_cpos;
7494 blkno = le64_to_cpu(el->l_recs[i].e_blkno) +
7495 ocfs2_clusters_to_blocks(inode->i_sb,
7496 ocfs2_rec_clusters(el, &el->l_recs[i]) -
7497 clusters_to_del);
7339 } else { 7498 } else {
7340 status = 0; 7499 status = 0;
7341 goto bail; 7500 goto bail;
@@ -7344,6 +7503,29 @@ start:
7344 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 7503 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
7345 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); 7504 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7346 7505
7506 if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
7507 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
7508 OCFS2_HAS_REFCOUNT_FL));
7509
7510 status = ocfs2_lock_refcount_tree(osb,
7511 le64_to_cpu(di->i_refcount_loc),
7512 1, &ref_tree, NULL);
7513 if (status) {
7514 mlog_errno(status);
7515 goto bail;
7516 }
7517
7518 status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
7519 blkno,
7520 clusters_to_del,
7521 &credits,
7522 &meta_ac);
7523 if (status < 0) {
7524 mlog_errno(status);
7525 goto bail;
7526 }
7527 }
7528
7347 mutex_lock(&tl_inode->i_mutex); 7529 mutex_lock(&tl_inode->i_mutex);
7348 tl_sem = 1; 7530 tl_sem = 1;
7349 /* ocfs2_truncate_log_needs_flush guarantees us at least one 7531 /* ocfs2_truncate_log_needs_flush guarantees us at least one
@@ -7357,7 +7539,7 @@ start:
7357 } 7539 }
7358 } 7540 }
7359 7541
7360 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 7542 credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
7361 (struct ocfs2_dinode *)fe_bh->b_data, 7543 (struct ocfs2_dinode *)fe_bh->b_data,
7362 el); 7544 el);
7363 handle = ocfs2_start_trans(osb, credits); 7545 handle = ocfs2_start_trans(osb, credits);
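
Credit accounting is now additive: credits starts at zero, the refcount preparation above contributes whatever the refcount-tree update needs, and the tree-truncate estimate is stacked on top, so a single transaction covers both. In outline (a sketch of the flow added here, with error handling and the refcount-tree locking elided):

	credits = 0;
	if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del)
		/* May also allocate meta_ac for the refcount tree. */
		status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
							blkno,
							clusters_to_del,
							&credits, &meta_ac);
	credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
					(struct ocfs2_dinode *)fe_bh->b_data,
					el);
	handle = ocfs2_start_trans(osb, credits);
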
@@ -7369,7 +7551,7 @@ start:
7369 } 7551 }
7370 7552
7371 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, 7553 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7372 tc, path); 7554 tc, path, meta_ac);
7373 if (status < 0) { 7555 if (status < 0) {
7374 mlog_errno(status); 7556 mlog_errno(status);
7375 goto bail; 7557 goto bail;
@@ -7383,6 +7565,16 @@ start:
7383 7565
7384 ocfs2_reinit_path(path, 1); 7566 ocfs2_reinit_path(path, 1);
7385 7567
7568 if (meta_ac) {
7569 ocfs2_free_alloc_context(meta_ac);
7570 meta_ac = NULL;
7571 }
7572
7573 if (ref_tree) {
7574 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7575 ref_tree = NULL;
7576 }
7577
7386 /* 7578 /*
7387 * The check above will catch the case where we've truncated 7579 * The check above will catch the case where we've truncated
7388 * away all allocation. 7580 * away all allocation.
@@ -7399,6 +7591,12 @@ bail:
7399 if (handle) 7591 if (handle)
7400 ocfs2_commit_trans(osb, handle); 7592 ocfs2_commit_trans(osb, handle);
7401 7593
7594 if (meta_ac)
7595 ocfs2_free_alloc_context(meta_ac);
7596
7597 if (ref_tree)
7598 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7599
7402 ocfs2_run_deallocs(osb, &tc->tc_dealloc); 7600 ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7403 7601
7404 ocfs2_free_path(path); 7602 ocfs2_free_path(path);
@@ -7445,7 +7643,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7445 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 7643 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7446 7644
7447 if (fe->id2.i_list.l_tree_depth) { 7645 if (fe->id2.i_list.l_tree_depth) {
7448 status = ocfs2_read_extent_block(inode, 7646 status = ocfs2_read_extent_block(INODE_CACHE(inode),
7449 le64_to_cpu(fe->i_last_eb_blk), 7647 le64_to_cpu(fe->i_last_eb_blk),
7450 &last_eb_bh); 7648 &last_eb_bh);
7451 if (status < 0) { 7649 if (status < 0) {
@@ -7507,7 +7705,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7507 goto out; 7705 goto out;
7508 } 7706 }
7509 7707
7510 ret = ocfs2_journal_access_di(handle, inode, di_bh, 7708 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7511 OCFS2_JOURNAL_ACCESS_WRITE); 7709 OCFS2_JOURNAL_ACCESS_WRITE);
7512 if (ret) { 7710 if (ret) {
7513 mlog_errno(ret); 7711 mlog_errno(ret);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 353254ba29e1..9c122d574464 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,8 @@
45 * 45 *
46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a 46 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree 47 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
48 * functions. With metadata ecc, we now call different journal_access 48 * functions. It needs the ocfs2_caching_info structure associated with
49 * I/O on the tree. With metadata ecc, we now call different journal_access
49 * functions for each type of metadata, so it must have the 50 * functions for each type of metadata, so it must have the
50 * root_journal_access function. 51 * root_journal_access function.
51 * ocfs2_extent_tree_operations abstract the normal operations we do for 52 * ocfs2_extent_tree_operations abstract the normal operations we do for
@@ -56,6 +57,7 @@ struct ocfs2_extent_tree {
56 struct ocfs2_extent_tree_operations *et_ops; 57 struct ocfs2_extent_tree_operations *et_ops;
57 struct buffer_head *et_root_bh; 58 struct buffer_head *et_root_bh;
58 struct ocfs2_extent_list *et_root_el; 59 struct ocfs2_extent_list *et_root_el;
60 struct ocfs2_caching_info *et_ci;
59 ocfs2_journal_access_func et_root_journal_access; 61 ocfs2_journal_access_func et_root_journal_access;
60 void *et_object; 62 void *et_object;
61 unsigned int et_max_leaf_clusters; 63 unsigned int et_max_leaf_clusters;
@@ -66,31 +68,32 @@ struct ocfs2_extent_tree {
66 * specified object buffer. 68 * specified object buffer.
67 */ 69 */
68void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, 70void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
69 struct inode *inode, 71 struct ocfs2_caching_info *ci,
70 struct buffer_head *bh); 72 struct buffer_head *bh);
71void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 73void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
72 struct inode *inode, 74 struct ocfs2_caching_info *ci,
73 struct buffer_head *bh); 75 struct buffer_head *bh);
74struct ocfs2_xattr_value_buf; 76struct ocfs2_xattr_value_buf;
75void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 77void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
76 struct inode *inode, 78 struct ocfs2_caching_info *ci,
77 struct ocfs2_xattr_value_buf *vb); 79 struct ocfs2_xattr_value_buf *vb);
78void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et, 80void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
79 struct inode *inode, 81 struct ocfs2_caching_info *ci,
80 struct buffer_head *bh); 82 struct buffer_head *bh);
83void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
84 struct ocfs2_caching_info *ci,
85 struct buffer_head *bh);
81 86
82/* 87/*
83 * Read an extent block into *bh. If *bh is NULL, a bh will be 88 * Read an extent block into *bh. If *bh is NULL, a bh will be
84 * allocated. This is a cached read. The extent block will be validated 89 * allocated. This is a cached read. The extent block will be validated
85 * with ocfs2_validate_extent_block(). 90 * with ocfs2_validate_extent_block().
86 */ 91 */
87int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, 92int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
88 struct buffer_head **bh); 93 struct buffer_head **bh);
89 94
90struct ocfs2_alloc_context; 95struct ocfs2_alloc_context;
91int ocfs2_insert_extent(struct ocfs2_super *osb, 96int ocfs2_insert_extent(handle_t *handle,
92 handle_t *handle,
93 struct inode *inode,
94 struct ocfs2_extent_tree *et, 97 struct ocfs2_extent_tree *et,
95 u32 cpos, 98 u32 cpos,
96 u64 start_blk, 99 u64 start_blk,
@@ -103,25 +106,36 @@ enum ocfs2_alloc_restarted {
103 RESTART_TRANS, 106 RESTART_TRANS,
104 RESTART_META 107 RESTART_META
105}; 108};
106int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, 109int ocfs2_add_clusters_in_btree(handle_t *handle,
107 struct inode *inode, 110 struct ocfs2_extent_tree *et,
108 u32 *logical_offset, 111 u32 *logical_offset,
109 u32 clusters_to_add, 112 u32 clusters_to_add,
110 int mark_unwritten, 113 int mark_unwritten,
111 struct ocfs2_extent_tree *et,
112 handle_t *handle,
113 struct ocfs2_alloc_context *data_ac, 114 struct ocfs2_alloc_context *data_ac,
114 struct ocfs2_alloc_context *meta_ac, 115 struct ocfs2_alloc_context *meta_ac,
115 enum ocfs2_alloc_restarted *reason_ret); 116 enum ocfs2_alloc_restarted *reason_ret);
116struct ocfs2_cached_dealloc_ctxt; 117struct ocfs2_cached_dealloc_ctxt;
118struct ocfs2_path;
119int ocfs2_split_extent(handle_t *handle,
120 struct ocfs2_extent_tree *et,
121 struct ocfs2_path *path,
122 int split_index,
123 struct ocfs2_extent_rec *split_rec,
124 struct ocfs2_alloc_context *meta_ac,
125 struct ocfs2_cached_dealloc_ctxt *dealloc);
117int ocfs2_mark_extent_written(struct inode *inode, 126int ocfs2_mark_extent_written(struct inode *inode,
118 struct ocfs2_extent_tree *et, 127 struct ocfs2_extent_tree *et,
119 handle_t *handle, u32 cpos, u32 len, u32 phys, 128 handle_t *handle, u32 cpos, u32 len, u32 phys,
120 struct ocfs2_alloc_context *meta_ac, 129 struct ocfs2_alloc_context *meta_ac,
121 struct ocfs2_cached_dealloc_ctxt *dealloc); 130 struct ocfs2_cached_dealloc_ctxt *dealloc);
122int ocfs2_remove_extent(struct inode *inode, 131int ocfs2_change_extent_flag(handle_t *handle,
123 struct ocfs2_extent_tree *et, 132 struct ocfs2_extent_tree *et,
124 u32 cpos, u32 len, handle_t *handle, 133 u32 cpos, u32 len, u32 phys,
134 struct ocfs2_alloc_context *meta_ac,
135 struct ocfs2_cached_dealloc_ctxt *dealloc,
136 int new_flags, int clear_flags);
137int ocfs2_remove_extent(handle_t *handle, struct ocfs2_extent_tree *et,
138 u32 cpos, u32 len,
125 struct ocfs2_alloc_context *meta_ac, 139 struct ocfs2_alloc_context *meta_ac,
126 struct ocfs2_cached_dealloc_ctxt *dealloc); 140 struct ocfs2_cached_dealloc_ctxt *dealloc);
127int ocfs2_remove_btree_range(struct inode *inode, 141int ocfs2_remove_btree_range(struct inode *inode,
@@ -130,7 +144,6 @@ int ocfs2_remove_btree_range(struct inode *inode,
130 struct ocfs2_cached_dealloc_ctxt *dealloc); 144 struct ocfs2_cached_dealloc_ctxt *dealloc);
131 145
132int ocfs2_num_free_extents(struct ocfs2_super *osb, 146int ocfs2_num_free_extents(struct ocfs2_super *osb,
133 struct inode *inode,
134 struct ocfs2_extent_tree *et); 147 struct ocfs2_extent_tree *et);
135 148
136/* 149/*
@@ -195,6 +208,9 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
195} 208}
196int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 209int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
197 u64 blkno, unsigned int bit); 210 u64 blkno, unsigned int bit);
211int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
212 int type, int slot, u64 blkno,
213 unsigned int bit);
198static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) 214static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
199{ 215{
200 return c->c_global_allocator != NULL; 216 return c->c_global_allocator != NULL;
@@ -222,8 +238,9 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
222int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 238int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
223 unsigned int start, unsigned int end, int trunc); 239 unsigned int start, unsigned int end, int trunc);
224 240
225int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 241int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
226 u32 cpos, struct buffer_head **leaf_bh); 242 struct ocfs2_extent_list *root_el, u32 cpos,
243 struct buffer_head **leaf_bh);
227int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster); 244int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
228 245
229/* 246/*
@@ -254,4 +271,50 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
254 return !rec->e_leaf_clusters; 271 return !rec->e_leaf_clusters;
255} 272}
256 273
274int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
275 struct page **pages, int *num);
276void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
277 unsigned int from, unsigned int to,
278 struct page *page, int zero, u64 *phys);
279/*
280 * Structures which describe a path through a btree, and functions to
281 * manipulate them.
282 *
283 * The idea here is to be as generic as possible with the tree
284 * manipulation code.
285 */
286struct ocfs2_path_item {
287 struct buffer_head *bh;
288 struct ocfs2_extent_list *el;
289};
290
291#define OCFS2_MAX_PATH_DEPTH 5
292
293struct ocfs2_path {
294 int p_tree_depth;
295 ocfs2_journal_access_func p_root_access;
296 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
297};
298
299#define path_root_bh(_path) ((_path)->p_node[0].bh)
300#define path_root_el(_path) ((_path)->p_node[0].el)
301#define path_root_access(_path) ((_path)->p_root_access)
302#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
303#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
304#define path_num_items(_path) ((_path)->p_tree_depth + 1)
305
306void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root);
307void ocfs2_free_path(struct ocfs2_path *path);
308int ocfs2_find_path(struct ocfs2_caching_info *ci,
309 struct ocfs2_path *path,
310 u32 cpos);
311struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path);
312struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et);
313int ocfs2_path_bh_journal_access(handle_t *handle,
314 struct ocfs2_caching_info *ci,
315 struct ocfs2_path *path,
316 int idx);
317int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
318 handle_t *handle,
319 struct ocfs2_path *path);
257#endif /* OCFS2_ALLOC_H */ 320#endif /* OCFS2_ALLOC_H */
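The alloc.h hunk above exports the b-tree path structures (ocfs2_path, ocfs2_path_item) and their accessor macros, which were previously private to alloc.c, so that other files such as refcounttree.c can walk extent trees. A standalone user-space sketch of how those macros resolve, with simplified types (illustrative only, not the kernel code):

#include <stdio.h>

#define OCFS2_MAX_PATH_DEPTH 5

struct path_item {
	void *bh;	/* stand-in for struct buffer_head * */
	void *el;	/* stand-in for struct ocfs2_extent_list * */
};

struct path {
	int p_tree_depth;
	struct path_item p_node[OCFS2_MAX_PATH_DEPTH];
};

#define path_root_el(_p)   ((_p)->p_node[0].el)
#define path_leaf_el(_p)   ((_p)->p_node[(_p)->p_tree_depth].el)
#define path_num_items(_p) ((_p)->p_tree_depth + 1)

int main(void)
{
	int root_list, leaf_list;
	struct path p = { .p_tree_depth = 2 };

	p.p_node[0].el = &root_list;              /* index 0 is the root */
	p.p_node[p.p_tree_depth].el = &leaf_list; /* deepest node is the leaf */

	printf("items=%d root=%p leaf=%p\n", path_num_items(&p),
	       path_root_el(&p), path_leaf_el(&p));
	return 0;
}

A path of depth N carries N + 1 items, with the root at index 0 and the leaf at index p_tree_depth, which is exactly what path_num_items() reports.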
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 747f15eefd82..deb2b132ae5e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -44,6 +44,7 @@
44#include "suballoc.h" 44#include "suballoc.h"
45#include "super.h" 45#include "super.h"
46#include "symlink.h" 46#include "symlink.h"
47#include "refcounttree.h"
47 48
48#include "buffer_head_io.h" 49#include "buffer_head_io.h"
49 50
@@ -126,8 +127,8 @@ bail:
126 return err; 127 return err;
127} 128}
128 129
129static int ocfs2_get_block(struct inode *inode, sector_t iblock, 130int ocfs2_get_block(struct inode *inode, sector_t iblock,
130 struct buffer_head *bh_result, int create) 131 struct buffer_head *bh_result, int create)
131{ 132{
132 int err = 0; 133 int err = 0;
133 unsigned int ext_flags; 134 unsigned int ext_flags;
@@ -590,6 +591,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
590 goto bail; 591 goto bail;
591 } 592 }
592 593
594 /* We should already CoW the refcounted extent. */
595 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
593 /* 596 /*
594 * get_more_blocks() expects us to describe a hole by clearing 597 * get_more_blocks() expects us to describe a hole by clearing
595 * the mapped bit on bh_result(). 598 * the mapped bit on bh_result().
@@ -687,6 +690,10 @@ static ssize_t ocfs2_direct_IO(int rw,
687 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 690 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
688 return 0; 691 return 0;
689 692
693 /* Fall back to buffered I/O if we are appending. */
694 if (i_size_read(inode) <= offset)
695 return 0;
696
690 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 697 ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
691 inode->i_sb->s_bdev, iov, offset, 698 inode->i_sb->s_bdev, iov, offset,
692 nr_segs, 699 nr_segs,
@@ -1259,7 +1266,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
1259 goto out; 1266 goto out;
1260 } 1267 }
1261 } else if (unwritten) { 1268 } else if (unwritten) {
1262 ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); 1269 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1270 wc->w_di_bh);
1263 ret = ocfs2_mark_extent_written(inode, &et, 1271 ret = ocfs2_mark_extent_written(inode, &et,
1264 wc->w_handle, cpos, 1, phys, 1272 wc->w_handle, cpos, 1, phys,
1265 meta_ac, &wc->w_dealloc); 1273 meta_ac, &wc->w_dealloc);
@@ -1448,6 +1456,9 @@ static int ocfs2_populate_write_desc(struct inode *inode,
1448 goto out; 1456 goto out;
1449 } 1457 }
1450 1458
1459 /* We should already CoW the refcounted extent. */
1460 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
1461
1451 /* 1462 /*
1452 * Assume worst case - that we're writing in 1463 * Assume worst case - that we're writing in
1453 * the middle of the extent. 1464 * the middle of the extent.
@@ -1528,7 +1539,7 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
1528 goto out; 1539 goto out;
1529 } 1540 }
1530 1541
1531 ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, 1542 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
1532 OCFS2_JOURNAL_ACCESS_WRITE); 1543 OCFS2_JOURNAL_ACCESS_WRITE);
1533 if (ret) { 1544 if (ret) {
1534 ocfs2_commit_trans(osb, handle); 1545 ocfs2_commit_trans(osb, handle);
@@ -1699,6 +1710,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1699 goto out; 1710 goto out;
1700 } 1711 }
1701 1712
1713 ret = ocfs2_check_range_for_refcount(inode, pos, len);
1714 if (ret < 0) {
1715 mlog_errno(ret);
1716 goto out;
1717 } else if (ret == 1) {
1718 ret = ocfs2_refcount_cow(inode, di_bh,
1719 wc->w_cpos, wc->w_clen, UINT_MAX);
1720 if (ret) {
1721 mlog_errno(ret);
1722 goto out;
1723 }
1724 }
1725
1702 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc, 1726 ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
1703 &extents_to_split); 1727 &extents_to_split);
1704 if (ret) { 1728 if (ret) {
@@ -1726,7 +1750,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1726 (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), 1750 (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
1727 clusters_to_alloc, extents_to_split); 1751 clusters_to_alloc, extents_to_split);
1728 1752
1729 ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); 1753 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1754 wc->w_di_bh);
1730 ret = ocfs2_lock_allocators(inode, &et, 1755 ret = ocfs2_lock_allocators(inode, &et,
1731 clusters_to_alloc, extents_to_split, 1756 clusters_to_alloc, extents_to_split,
1732 &data_ac, &meta_ac); 1757 &data_ac, &meta_ac);
@@ -1773,7 +1798,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1773 * We don't want this to fail in ocfs2_write_end(), so do it 1798 * We don't want this to fail in ocfs2_write_end(), so do it
1774 * here. 1799 * here.
1775 */ 1800 */
1776 ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, 1801 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
1777 OCFS2_JOURNAL_ACCESS_WRITE); 1802 OCFS2_JOURNAL_ACCESS_WRITE);
1778 if (ret) { 1803 if (ret) {
1779 mlog_errno(ret); 1804 mlog_errno(ret);
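The write_begin change above inserts a copy-on-write pass ahead of the normal allocation logic: if any cluster in the write range is refcounted (shared), ocfs2_refcount_cow() breaks the sharing before pages are prepared. A minimal user-space sketch of that ordering, with hypothetical helpers standing in for ocfs2_check_range_for_refcount() and ocfs2_refcount_cow():

#include <stdio.h>

/* Returns 1 if any cluster in [pos, pos+len) is shared, 0 if not. */
static int range_is_shared(long pos, long len)
{
	return pos < 100 && len > 0;	/* toy predicate */
}

/* Break the sharing by copying the clusters; 0 on success. */
static int cow_range(long pos, long len)
{
	printf("CoW %ld bytes at %ld\n", len, pos);
	return 0;
}

static int write_begin(long pos, long len)
{
	int ret = range_is_shared(pos, len);

	if (ret < 0)		/* error path mirrors the kernel flow */
		return ret;
	if (ret == 1 && (ret = cow_range(pos, len)) != 0)
		return ret;

	/* ...normal allocation and page preparation follows... */
	return 0;
}

int main(void)
{
	return write_begin(50, 4096);
}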
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 503e49232e11..c48e93ffc513 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -57,6 +57,8 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
57 struct buffer_head *di_bh); 57 struct buffer_head *di_bh);
58int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size); 58int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
59 59
60int ocfs2_get_block(struct inode *inode, sector_t iblock,
61 struct buffer_head *bh_result, int create);
60/* all ocfs2_dio_end_io()'s fault */ 62/* all ocfs2_dio_end_io()'s fault */
61#define ocfs2_iocb_is_rw_locked(iocb) \ 63#define ocfs2_iocb_is_rw_locked(iocb) \
62 test_bit(0, (unsigned long *)&iocb->private) 64 test_bit(0, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 15c8e6deee2e..d43d34a1dd31 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -52,12 +52,12 @@ enum ocfs2_state_bits {
52BUFFER_FNS(NeedsValidate, needs_validate); 52BUFFER_FNS(NeedsValidate, needs_validate);
53 53
54int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, 54int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
55 struct inode *inode) 55 struct ocfs2_caching_info *ci)
56{ 56{
57 int ret = 0; 57 int ret = 0;
58 58
59 mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n", 59 mlog_entry("(bh->b_blocknr = %llu, ci=%p)\n",
60 (unsigned long long)bh->b_blocknr, inode); 60 (unsigned long long)bh->b_blocknr, ci);
61 61
62 BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); 62 BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
63 BUG_ON(buffer_jbd(bh)); 63 BUG_ON(buffer_jbd(bh));
@@ -70,7 +70,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
70 goto out; 70 goto out;
71 } 71 }
72 72
73 mutex_lock(&OCFS2_I(inode)->ip_io_mutex); 73 ocfs2_metadata_cache_io_lock(ci);
74 74
75 lock_buffer(bh); 75 lock_buffer(bh);
76 set_buffer_uptodate(bh); 76 set_buffer_uptodate(bh);
@@ -85,7 +85,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
85 wait_on_buffer(bh); 85 wait_on_buffer(bh);
86 86
87 if (buffer_uptodate(bh)) { 87 if (buffer_uptodate(bh)) {
88 ocfs2_set_buffer_uptodate(inode, bh); 88 ocfs2_set_buffer_uptodate(ci, bh);
89 } else { 89 } else {
90 /* We don't need to remove the clustered uptodate 90 /* We don't need to remove the clustered uptodate
91 * information for this bh as it's not marked locally 91 * information for this bh as it's not marked locally
@@ -94,7 +94,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
94 put_bh(bh); 94 put_bh(bh);
95 } 95 }
96 96
97 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 97 ocfs2_metadata_cache_io_unlock(ci);
98out: 98out:
99 mlog_exit(ret); 99 mlog_exit(ret);
100 return ret; 100 return ret;
@@ -177,7 +177,7 @@ bail:
177 return status; 177 return status;
178} 178}
179 179
180int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, 180int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
181 struct buffer_head *bhs[], int flags, 181 struct buffer_head *bhs[], int flags,
182 int (*validate)(struct super_block *sb, 182 int (*validate)(struct super_block *sb,
183 struct buffer_head *bh)) 183 struct buffer_head *bh))
@@ -185,11 +185,12 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
185 int status = 0; 185 int status = 0;
186 int i, ignore_cache = 0; 186 int i, ignore_cache = 0;
187 struct buffer_head *bh; 187 struct buffer_head *bh;
188 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
188 189
189 mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n", 190 mlog_entry("(ci=%p, block=(%llu), nr=(%d), flags=%d)\n",
190 inode, (unsigned long long)block, nr, flags); 191 ci, (unsigned long long)block, nr, flags);
191 192
192 BUG_ON(!inode); 193 BUG_ON(!ci);
193 BUG_ON((flags & OCFS2_BH_READAHEAD) && 194 BUG_ON((flags & OCFS2_BH_READAHEAD) &&
194 (flags & OCFS2_BH_IGNORE_CACHE)); 195 (flags & OCFS2_BH_IGNORE_CACHE));
195 196
@@ -212,12 +213,12 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
212 goto bail; 213 goto bail;
213 } 214 }
214 215
215 mutex_lock(&OCFS2_I(inode)->ip_io_mutex); 216 ocfs2_metadata_cache_io_lock(ci);
216 for (i = 0 ; i < nr ; i++) { 217 for (i = 0 ; i < nr ; i++) {
217 if (bhs[i] == NULL) { 218 if (bhs[i] == NULL) {
218 bhs[i] = sb_getblk(inode->i_sb, block++); 219 bhs[i] = sb_getblk(sb, block++);
219 if (bhs[i] == NULL) { 220 if (bhs[i] == NULL) {
220 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 221 ocfs2_metadata_cache_io_unlock(ci);
221 status = -EIO; 222 status = -EIO;
222 mlog_errno(status); 223 mlog_errno(status);
223 goto bail; 224 goto bail;
@@ -250,11 +251,11 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
250 * before our is-it-in-flight check. 251 * before our is-it-in-flight check.
251 */ 252 */
252 253
253 if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) { 254 if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
254 mlog(ML_UPTODATE, 255 mlog(ML_UPTODATE,
255 "bh (%llu), inode %llu not uptodate\n", 256 "bh (%llu), owner %llu not uptodate\n",
256 (unsigned long long)bh->b_blocknr, 257 (unsigned long long)bh->b_blocknr,
257 (unsigned long long)OCFS2_I(inode)->ip_blkno); 258 (unsigned long long)ocfs2_metadata_cache_owner(ci));
258 /* We're using ignore_cache here to say 259 /* We're using ignore_cache here to say
259 * "go to disk" */ 260 * "go to disk" */
260 ignore_cache = 1; 261 ignore_cache = 1;
@@ -283,7 +284,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
283 * previously submitted request then we are 284 * previously submitted request then we are
284 * done here. */ 285 * done here. */
285 if ((flags & OCFS2_BH_READAHEAD) 286 if ((flags & OCFS2_BH_READAHEAD)
286 && ocfs2_buffer_read_ahead(inode, bh)) 287 && ocfs2_buffer_read_ahead(ci, bh))
287 continue; 288 continue;
288 289
289 lock_buffer(bh); 290 lock_buffer(bh);
@@ -305,7 +306,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
305 * buffer lock. */ 306 * buffer lock. */
306 if (!(flags & OCFS2_BH_IGNORE_CACHE) 307 if (!(flags & OCFS2_BH_IGNORE_CACHE)
307 && !(flags & OCFS2_BH_READAHEAD) 308 && !(flags & OCFS2_BH_READAHEAD)
308 && ocfs2_buffer_uptodate(inode, bh)) { 309 && ocfs2_buffer_uptodate(ci, bh)) {
309 unlock_buffer(bh); 310 unlock_buffer(bh);
310 continue; 311 continue;
311 } 312 }
@@ -327,7 +328,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
327 328
328 if (!(flags & OCFS2_BH_READAHEAD)) { 329 if (!(flags & OCFS2_BH_READAHEAD)) {
329 /* We know this can't have changed as we hold the 330 /* We know this can't have changed as we hold the
330 * inode sem. Avoid doing any work on the bh if the 331 * owner sem. Avoid doing any work on the bh if the
331 * journal has it. */ 332 * journal has it. */
332 if (!buffer_jbd(bh)) 333 if (!buffer_jbd(bh))
333 wait_on_buffer(bh); 334 wait_on_buffer(bh);
@@ -351,7 +352,7 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
351 * that better not have changed */ 352 * that better not have changed */
352 BUG_ON(buffer_jbd(bh)); 353 BUG_ON(buffer_jbd(bh));
353 clear_buffer_needs_validate(bh); 354 clear_buffer_needs_validate(bh);
354 status = validate(inode->i_sb, bh); 355 status = validate(sb, bh);
355 if (status) { 356 if (status) {
356 put_bh(bh); 357 put_bh(bh);
357 bhs[i] = NULL; 358 bhs[i] = NULL;
@@ -363,9 +364,9 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
363 /* Always set the buffer in the cache, even if it was 364 /* Always set the buffer in the cache, even if it was
364 * a forced read, or read-ahead which hasn't yet 365 * a forced read, or read-ahead which hasn't yet
365 * completed. */ 366 * completed. */
366 ocfs2_set_buffer_uptodate(inode, bh); 367 ocfs2_set_buffer_uptodate(ci, bh);
367 } 368 }
368 mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); 369 ocfs2_metadata_cache_io_unlock(ci);
369 370
370 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 371 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
371 (unsigned long long)block, nr, 372 (unsigned long long)block, nr,
@@ -399,7 +400,7 @@ static void ocfs2_check_super_or_backup(struct super_block *sb,
399 400
400/* 401/*
401 * Write super block and backups doesn't need to collaborate with journal, 402 * Write super block and backups doesn't need to collaborate with journal,
402 * so we don't need to lock ip_io_mutex and inode doesn't need to be passed 403 * so we don't need to lock ip_io_mutex and ci doesn't need to be passed
403 * into this function. 404 * into this function.
404 */ 405 */
405int ocfs2_write_super_or_backup(struct ocfs2_super *osb, 406int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
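The buffer_head_io.c hunks replace the per-inode ip_io_mutex with a lock owned by the generic ocfs2_caching_info, so any metadata owner, whether an inode or a refcount tree, serializes its block I/O the same way. A user-space model of that pattern, assuming simplified types and a pthread mutex in place of the kernel lock (not the kernel API):

#include <pthread.h>
#include <stdio.h>

struct caching_info {
	pthread_mutex_t io_mutex;	/* serializes I/O on this owner */
	int block_uptodate;		/* toy stand-in for the uptodate cache */
};

static void cache_io_lock(struct caching_info *ci)
{
	pthread_mutex_lock(&ci->io_mutex);
}

static void cache_io_unlock(struct caching_info *ci)
{
	pthread_mutex_unlock(&ci->io_mutex);
}

static int read_block(struct caching_info *ci, long blkno)
{
	cache_io_lock(ci);
	if (!ci->block_uptodate) {
		printf("reading block %ld from disk\n", blkno);
		ci->block_uptodate = 1;	/* mark cached after the I/O */
	}
	cache_io_unlock(ci);
	return 0;
}

int main(void)
{
	struct caching_info ci = { PTHREAD_MUTEX_INITIALIZER, 0 };

	read_block(&ci, 42);	/* goes to "disk" */
	read_block(&ci, 42);	/* served from the cache */
	return 0;
}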
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index c75d682dadd8..b97bcc6dde7c 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -33,7 +33,7 @@ void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
33 33
34int ocfs2_write_block(struct ocfs2_super *osb, 34int ocfs2_write_block(struct ocfs2_super *osb,
35 struct buffer_head *bh, 35 struct buffer_head *bh,
36 struct inode *inode); 36 struct ocfs2_caching_info *ci);
37int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, 37int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
38 unsigned int nr, struct buffer_head *bhs[]); 38 unsigned int nr, struct buffer_head *bhs[]);
39 39
@@ -44,7 +44,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
44 * be set even for a READAHEAD call, as it marks the buffer for later 44 * be set even for a READAHEAD call, as it marks the buffer for later
45 * validation. 45 * validation.
46 */ 46 */
47int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, 47int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
48 struct buffer_head *bhs[], int flags, 48 struct buffer_head *bhs[], int flags,
49 int (*validate)(struct super_block *sb, 49 int (*validate)(struct super_block *sb,
50 struct buffer_head *bh)); 50 struct buffer_head *bh));
@@ -55,7 +55,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
55#define OCFS2_BH_IGNORE_CACHE 1 55#define OCFS2_BH_IGNORE_CACHE 1
56#define OCFS2_BH_READAHEAD 8 56#define OCFS2_BH_READAHEAD 8
57 57
58static inline int ocfs2_read_block(struct inode *inode, u64 off, 58static inline int ocfs2_read_block(struct ocfs2_caching_info *ci, u64 off,
59 struct buffer_head **bh, 59 struct buffer_head **bh,
60 int (*validate)(struct super_block *sb, 60 int (*validate)(struct super_block *sb,
61 struct buffer_head *bh)) 61 struct buffer_head *bh))
@@ -68,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
68 goto bail; 68 goto bail;
69 } 69 }
70 70
71 status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate); 71 status = ocfs2_read_blocks(ci, off, 1, bh, 0, validate);
72 72
73bail: 73bail:
74 return status; 74 return status;
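The ocfs2_read_block() wrapper above keeps a single code path for validation and caching: a one-block read is simply the N-block read with nr == 1. A toy version of the same delegation:

#include <stdio.h>

static int read_blocks(long start, int nr, int *bufs)
{
	for (int i = 0; i < nr; i++)
		bufs[i] = (int)(start + i);	/* pretend disk read */
	return 0;
}

static int read_block(long blkno, int *buf)
{
	/* Delegate to the batch helper; no duplicated logic. */
	return read_blocks(blkno, 1, buf);
}

int main(void)
{
	int b;

	if (read_block(7, &b) == 0)
		printf("block 7 -> %d\n", b);
	return 0;
}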
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 96df5416993e..1cd2934de615 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -111,6 +111,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
111 define_mask(EXPORT), 111 define_mask(EXPORT),
112 define_mask(XATTR), 112 define_mask(XATTR),
113 define_mask(QUOTA), 113 define_mask(QUOTA),
114 define_mask(REFCOUNT),
114 define_mask(ERROR), 115 define_mask(ERROR),
115 define_mask(NOTICE), 116 define_mask(NOTICE),
116 define_mask(KTHREAD), 117 define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 696c32e50716..9b4d11726cf2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
113#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ 113#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ 114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ 115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
116/* bits that are infrequently given and frequently matched in the high word */ 117/* bits that are infrequently given and frequently matched in the high word */
117#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ 118#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
118#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */ 119#define ML_NOTICE 0x0000000200000000ULL /* sent to KERN_NOTICE */
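ML_REFCOUNT claims the next free bit in the low word of the log mask; a message is emitted only when its mask bits intersect the enabled set. A compact sketch of that filter, reusing the constants above but with a simplified stand-in for mlog():

#include <stdio.h>
#include <stdint.h>

#define ML_QUOTA    0x0000000040000000ULL
#define ML_REFCOUNT 0x0000000080000000ULL	/* the new bit */

static uint64_t enabled_mask = ML_REFCOUNT;

#define my_mlog(mask, msg) \
	do { \
		if ((mask) & enabled_mask) \
			printf("%s", msg); \
	} while (0)

int main(void)
{
	my_mlog(ML_REFCOUNT, "refcount tree op\n");	/* printed */
	my_mlog(ML_QUOTA, "quota op\n");		/* filtered out */
	return 0;
}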
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index f8424874fa07..cfb2be708abe 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -163,7 +163,7 @@ static void nst_seq_stop(struct seq_file *seq, void *v)
163{ 163{
164} 164}
165 165
166static struct seq_operations nst_seq_ops = { 166static const struct seq_operations nst_seq_ops = {
167 .start = nst_seq_start, 167 .start = nst_seq_start,
168 .next = nst_seq_next, 168 .next = nst_seq_next,
169 .stop = nst_seq_stop, 169 .stop = nst_seq_stop,
@@ -344,7 +344,7 @@ static void sc_seq_stop(struct seq_file *seq, void *v)
344{ 344{
345} 345}
346 346
347static struct seq_operations sc_seq_ops = { 347static const struct seq_operations sc_seq_ops = {
348 .start = sc_seq_start, 348 .start = sc_seq_start,
349 .next = sc_seq_next, 349 .next = sc_seq_next,
350 .stop = sc_seq_stop, 350 .stop = sc_seq_stop,
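Marking the seq_operations tables const is safe because they are fixed function-pointer tables never written after initialization, and it typically lets the compiler place them in read-only data. The same idiom in a self-contained form:

#include <stdio.h>

struct ops {
	void (*start)(void);
	void (*stop)(void);
};

static void my_start(void) { printf("start\n"); }
static void my_stop(void)  { printf("stop\n"); }

/* const: usually placed in .rodata, cannot be overwritten at run time */
static const struct ops seq_ops = {
	.start = my_start,
	.stop  = my_stop,
};

int main(void)
{
	seq_ops.start();
	seq_ops.stop();
	return 0;
}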
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index b358f3bf896d..28c3ec238796 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -176,7 +176,7 @@ static int ocfs2_dx_dir_link_trailer(struct inode *dir, handle_t *handle,
176 struct ocfs2_dx_root_block *dx_root; 176 struct ocfs2_dx_root_block *dx_root;
177 struct ocfs2_dir_block_trailer *trailer; 177 struct ocfs2_dir_block_trailer *trailer;
178 178
179 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 179 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
180 OCFS2_JOURNAL_ACCESS_WRITE); 180 OCFS2_JOURNAL_ACCESS_WRITE);
181 if (ret) { 181 if (ret) {
182 mlog_errno(ret); 182 mlog_errno(ret);
@@ -564,7 +564,8 @@ static int ocfs2_read_dir_block_direct(struct inode *dir, u64 phys,
564 int ret; 564 int ret;
565 struct buffer_head *tmp = *bh; 565 struct buffer_head *tmp = *bh;
566 566
567 ret = ocfs2_read_block(dir, phys, &tmp, ocfs2_validate_dir_block); 567 ret = ocfs2_read_block(INODE_CACHE(dir), phys, &tmp,
568 ocfs2_validate_dir_block);
568 if (ret) { 569 if (ret) {
569 mlog_errno(ret); 570 mlog_errno(ret);
570 goto out; 571 goto out;
@@ -622,7 +623,8 @@ static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di,
622 u64 blkno = le64_to_cpu(di->i_dx_root); 623 u64 blkno = le64_to_cpu(di->i_dx_root);
623 struct buffer_head *tmp = *dx_root_bh; 624 struct buffer_head *tmp = *dx_root_bh;
624 625
625 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_root); 626 ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
627 ocfs2_validate_dx_root);
626 628
627 /* If ocfs2_read_block() got us a new bh, pass it up. */ 629 /* If ocfs2_read_block() got us a new bh, pass it up. */
628 if (!ret && !*dx_root_bh) 630 if (!ret && !*dx_root_bh)
@@ -662,7 +664,8 @@ static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno,
662 int ret; 664 int ret;
663 struct buffer_head *tmp = *dx_leaf_bh; 665 struct buffer_head *tmp = *dx_leaf_bh;
664 666
665 ret = ocfs2_read_block(dir, blkno, &tmp, ocfs2_validate_dx_leaf); 667 ret = ocfs2_read_block(INODE_CACHE(dir), blkno, &tmp,
668 ocfs2_validate_dx_leaf);
666 669
667 /* If ocfs2_read_block() got us a new bh, pass it up. */ 670 /* If ocfs2_read_block() got us a new bh, pass it up. */
668 if (!ret && !*dx_leaf_bh) 671 if (!ret && !*dx_leaf_bh)
@@ -680,7 +683,7 @@ static int ocfs2_read_dx_leaves(struct inode *dir, u64 start, int num,
680{ 683{
681 int ret; 684 int ret;
682 685
683 ret = ocfs2_read_blocks(dir, start, num, dx_leaf_bhs, 0, 686 ret = ocfs2_read_blocks(INODE_CACHE(dir), start, num, dx_leaf_bhs, 0,
684 ocfs2_validate_dx_leaf); 687 ocfs2_validate_dx_leaf);
685 if (ret) 688 if (ret)
686 mlog_errno(ret); 689 mlog_errno(ret);
@@ -802,7 +805,8 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
802 struct ocfs2_extent_rec *rec = NULL; 805 struct ocfs2_extent_rec *rec = NULL;
803 806
804 if (el->l_tree_depth) { 807 if (el->l_tree_depth) {
805 ret = ocfs2_find_leaf(inode, el, major_hash, &eb_bh); 808 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, major_hash,
809 &eb_bh);
806 if (ret) { 810 if (ret) {
807 mlog_errno(ret); 811 mlog_errno(ret);
808 goto out; 812 goto out;
@@ -1133,7 +1137,8 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
1133 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1137 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1134 access = ocfs2_journal_access_di; 1138 access = ocfs2_journal_access_di;
1135 1139
1136 ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE); 1140 ret = access(handle, INODE_CACHE(dir), de_bh,
1141 OCFS2_JOURNAL_ACCESS_WRITE);
1137 if (ret) { 1142 if (ret) {
1138 mlog_errno(ret); 1143 mlog_errno(ret);
1139 goto out; 1144 goto out;
@@ -1176,7 +1181,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1176 goto bail; 1181 goto bail;
1177 } 1182 }
1178 if (de == de_del) { 1183 if (de == de_del) {
1179 status = access(handle, dir, bh, 1184 status = access(handle, INODE_CACHE(dir), bh,
1180 OCFS2_JOURNAL_ACCESS_WRITE); 1185 OCFS2_JOURNAL_ACCESS_WRITE);
1181 if (status < 0) { 1186 if (status < 0) {
1182 status = -EIO; 1187 status = -EIO;
@@ -1326,7 +1331,7 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1326 * the entry count needs to be updated. Also, we might be 1331 * the entry count needs to be updated. Also, we might be
1327 * adding to the start of the free list. 1332 * adding to the start of the free list.
1328 */ 1333 */
1329 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 1334 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
1330 OCFS2_JOURNAL_ACCESS_WRITE); 1335 OCFS2_JOURNAL_ACCESS_WRITE);
1331 if (ret) { 1336 if (ret) {
1332 mlog_errno(ret); 1337 mlog_errno(ret);
@@ -1334,7 +1339,7 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1334 } 1339 }
1335 1340
1336 if (!ocfs2_dx_root_inline(dx_root)) { 1341 if (!ocfs2_dx_root_inline(dx_root)) {
1337 ret = ocfs2_journal_access_dl(handle, dir, 1342 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
1338 lookup->dl_dx_leaf_bh, 1343 lookup->dl_dx_leaf_bh,
1339 OCFS2_JOURNAL_ACCESS_WRITE); 1344 OCFS2_JOURNAL_ACCESS_WRITE);
1340 if (ret) { 1345 if (ret) {
@@ -1493,7 +1498,7 @@ static int __ocfs2_dx_dir_leaf_insert(struct inode *dir, handle_t *handle,
1493 int ret; 1498 int ret;
1494 struct ocfs2_dx_leaf *dx_leaf; 1499 struct ocfs2_dx_leaf *dx_leaf;
1495 1500
1496 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, 1501 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
1497 OCFS2_JOURNAL_ACCESS_WRITE); 1502 OCFS2_JOURNAL_ACCESS_WRITE);
1498 if (ret) { 1503 if (ret) {
1499 mlog_errno(ret); 1504 mlog_errno(ret);
@@ -1523,7 +1528,7 @@ static int ocfs2_dx_dir_insert(struct inode *dir, handle_t *handle,
1523 struct ocfs2_dx_root_block *dx_root; 1528 struct ocfs2_dx_root_block *dx_root;
1524 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; 1529 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
1525 1530
1526 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 1531 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
1527 OCFS2_JOURNAL_ACCESS_WRITE); 1532 OCFS2_JOURNAL_ACCESS_WRITE);
1528 if (ret) { 1533 if (ret) {
1529 mlog_errno(ret); 1534 mlog_errno(ret);
@@ -1645,11 +1650,13 @@ int __ocfs2_add_entry(handle_t *handle,
1645 */ 1650 */
1646 if (ocfs2_free_list_at_root(lookup)) { 1651 if (ocfs2_free_list_at_root(lookup)) {
1647 bh = lookup->dl_dx_root_bh; 1652 bh = lookup->dl_dx_root_bh;
1648 retval = ocfs2_journal_access_dr(handle, dir, bh, 1653 retval = ocfs2_journal_access_dr(handle,
1654 INODE_CACHE(dir), bh,
1649 OCFS2_JOURNAL_ACCESS_WRITE); 1655 OCFS2_JOURNAL_ACCESS_WRITE);
1650 } else { 1656 } else {
1651 bh = lookup->dl_prev_leaf_bh; 1657 bh = lookup->dl_prev_leaf_bh;
1652 retval = ocfs2_journal_access_db(handle, dir, bh, 1658 retval = ocfs2_journal_access_db(handle,
1659 INODE_CACHE(dir), bh,
1653 OCFS2_JOURNAL_ACCESS_WRITE); 1660 OCFS2_JOURNAL_ACCESS_WRITE);
1654 } 1661 }
1655 if (retval) { 1662 if (retval) {
@@ -1700,11 +1707,13 @@ int __ocfs2_add_entry(handle_t *handle,
1700 } 1707 }
1701 1708
1702 if (insert_bh == parent_fe_bh) 1709 if (insert_bh == parent_fe_bh)
1703 status = ocfs2_journal_access_di(handle, dir, 1710 status = ocfs2_journal_access_di(handle,
1711 INODE_CACHE(dir),
1704 insert_bh, 1712 insert_bh,
1705 OCFS2_JOURNAL_ACCESS_WRITE); 1713 OCFS2_JOURNAL_ACCESS_WRITE);
1706 else { 1714 else {
1707 status = ocfs2_journal_access_db(handle, dir, 1715 status = ocfs2_journal_access_db(handle,
1716 INODE_CACHE(dir),
1708 insert_bh, 1717 insert_bh,
1709 OCFS2_JOURNAL_ACCESS_WRITE); 1718 OCFS2_JOURNAL_ACCESS_WRITE);
1710 1719
@@ -2280,7 +2289,7 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
2280 struct ocfs2_inline_data *data = &di->id2.i_data; 2289 struct ocfs2_inline_data *data = &di->id2.i_data;
2281 unsigned int size = le16_to_cpu(data->id_count); 2290 unsigned int size = le16_to_cpu(data->id_count);
2282 2291
2283 ret = ocfs2_journal_access_di(handle, inode, di_bh, 2292 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
2284 OCFS2_JOURNAL_ACCESS_WRITE); 2293 OCFS2_JOURNAL_ACCESS_WRITE);
2285 if (ret) { 2294 if (ret) {
2286 mlog_errno(ret); 2295 mlog_errno(ret);
@@ -2332,9 +2341,9 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2332 goto bail; 2341 goto bail;
2333 } 2342 }
2334 2343
2335 ocfs2_set_new_buffer_uptodate(inode, new_bh); 2344 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2336 2345
2337 status = ocfs2_journal_access_db(handle, inode, new_bh, 2346 status = ocfs2_journal_access_db(handle, INODE_CACHE(inode), new_bh,
2338 OCFS2_JOURNAL_ACCESS_CREATE); 2347 OCFS2_JOURNAL_ACCESS_CREATE);
2339 if (status < 0) { 2348 if (status < 0) {
2340 mlog_errno(status); 2349 mlog_errno(status);
@@ -2418,9 +2427,9 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2418 ret = -EIO; 2427 ret = -EIO;
2419 goto out; 2428 goto out;
2420 } 2429 }
2421 ocfs2_set_new_buffer_uptodate(dir, dx_root_bh); 2430 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh);
2422 2431
2423 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 2432 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
2424 OCFS2_JOURNAL_ACCESS_CREATE); 2433 OCFS2_JOURNAL_ACCESS_CREATE);
2425 if (ret < 0) { 2434 if (ret < 0) {
2426 mlog_errno(ret); 2435 mlog_errno(ret);
@@ -2454,7 +2463,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2454 if (ret) 2463 if (ret)
2455 mlog_errno(ret); 2464 mlog_errno(ret);
2456 2465
2457 ret = ocfs2_journal_access_di(handle, dir, di_bh, 2466 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
2458 OCFS2_JOURNAL_ACCESS_CREATE); 2467 OCFS2_JOURNAL_ACCESS_CREATE);
2459 if (ret) { 2468 if (ret) {
2460 mlog_errno(ret); 2469 mlog_errno(ret);
@@ -2495,9 +2504,9 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
2495 } 2504 }
2496 dx_leaves[i] = bh; 2505 dx_leaves[i] = bh;
2497 2506
2498 ocfs2_set_new_buffer_uptodate(dir, bh); 2507 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), bh);
2499 2508
2500 ret = ocfs2_journal_access_dl(handle, dir, bh, 2509 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), bh,
2501 OCFS2_JOURNAL_ACCESS_CREATE); 2510 OCFS2_JOURNAL_ACCESS_CREATE);
2502 if (ret < 0) { 2511 if (ret < 0) {
2503 mlog_errno(ret); 2512 mlog_errno(ret);
@@ -2582,7 +2591,6 @@ static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2582{ 2591{
2583 int ret; 2592 int ret;
2584 u64 phys_blkno; 2593 u64 phys_blkno;
2585 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2586 2594
2587 ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves, 2595 ret = __ocfs2_dx_dir_new_cluster(dir, cpos, handle, data_ac, dx_leaves,
2588 num_dx_leaves, &phys_blkno); 2596 num_dx_leaves, &phys_blkno);
@@ -2591,7 +2599,7 @@ static int ocfs2_dx_dir_new_cluster(struct inode *dir,
2591 goto out; 2599 goto out;
2592 } 2600 }
2593 2601
2594 ret = ocfs2_insert_extent(osb, handle, dir, et, cpos, phys_blkno, 1, 0, 2602 ret = ocfs2_insert_extent(handle, et, cpos, phys_blkno, 1, 0,
2595 meta_ac); 2603 meta_ac);
2596 if (ret) 2604 if (ret)
2597 mlog_errno(ret); 2605 mlog_errno(ret);
@@ -2895,7 +2903,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
2895 struct ocfs2_extent_tree dx_et; 2903 struct ocfs2_extent_tree dx_et;
2896 int did_quota = 0, bytes_allocated = 0; 2904 int did_quota = 0, bytes_allocated = 0;
2897 2905
2898 ocfs2_init_dinode_extent_tree(&et, dir, di_bh); 2906 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), di_bh);
2899 2907
2900 alloc = ocfs2_clusters_for_bytes(sb, bytes); 2908 alloc = ocfs2_clusters_for_bytes(sb, bytes);
2901 dx_alloc = 0; 2909 dx_alloc = 0;
@@ -3005,9 +3013,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3005 goto out_commit; 3013 goto out_commit;
3006 } 3014 }
3007 3015
3008 ocfs2_set_new_buffer_uptodate(dir, dirdata_bh); 3016 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dirdata_bh);
3009 3017
3010 ret = ocfs2_journal_access_db(handle, dir, dirdata_bh, 3018 ret = ocfs2_journal_access_db(handle, INODE_CACHE(dir), dirdata_bh,
3011 OCFS2_JOURNAL_ACCESS_CREATE); 3019 OCFS2_JOURNAL_ACCESS_CREATE);
3012 if (ret) { 3020 if (ret) {
3013 mlog_errno(ret); 3021 mlog_errno(ret);
@@ -3060,7 +3068,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3060 * We let the later dirent insert modify c/mtime - to the user 3068 * We let the later dirent insert modify c/mtime - to the user
3061 * the data hasn't changed. 3069 * the data hasn't changed.
3062 */ 3070 */
3063 ret = ocfs2_journal_access_di(handle, dir, di_bh, 3071 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
3064 OCFS2_JOURNAL_ACCESS_CREATE); 3072 OCFS2_JOURNAL_ACCESS_CREATE);
3065 if (ret) { 3073 if (ret) {
3066 mlog_errno(ret); 3074 mlog_errno(ret);
@@ -3085,7 +3093,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3085 * This should never fail as our extent list is empty and all 3093 * This should never fail as our extent list is empty and all
3086 * related blocks have been journaled already. 3094 * related blocks have been journaled already.
3087 */ 3095 */
3088 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len, 3096 ret = ocfs2_insert_extent(handle, &et, 0, blkno, len,
3089 0, NULL); 3097 0, NULL);
3090 if (ret) { 3098 if (ret) {
3091 mlog_errno(ret); 3099 mlog_errno(ret);
@@ -3117,8 +3125,10 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3117 ocfs2_dx_dir_index_root_block(dir, dx_root_bh, 3125 ocfs2_dx_dir_index_root_block(dir, dx_root_bh,
3118 dirdata_bh); 3126 dirdata_bh);
3119 } else { 3127 } else {
3120 ocfs2_init_dx_root_extent_tree(&dx_et, dir, dx_root_bh); 3128 ocfs2_init_dx_root_extent_tree(&dx_et,
3121 ret = ocfs2_insert_extent(osb, handle, dir, &dx_et, 0, 3129 INODE_CACHE(dir),
3130 dx_root_bh);
3131 ret = ocfs2_insert_extent(handle, &dx_et, 0,
3122 dx_insert_blkno, 1, 0, NULL); 3132 dx_insert_blkno, 1, 0, NULL);
3123 if (ret) 3133 if (ret)
3124 mlog_errno(ret); 3134 mlog_errno(ret);
@@ -3138,7 +3148,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3138 } 3148 }
3139 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); 3149 blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
3140 3150
3141 ret = ocfs2_insert_extent(osb, handle, dir, &et, 1, 3151 ret = ocfs2_insert_extent(handle, &et, 1,
3142 blkno, len, 0, NULL); 3152 blkno, len, 0, NULL);
3143 if (ret) { 3153 if (ret) {
3144 mlog_errno(ret); 3154 mlog_errno(ret);
@@ -3337,8 +3347,9 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3337 spin_lock(&OCFS2_I(dir)->ip_lock); 3347 spin_lock(&OCFS2_I(dir)->ip_lock);
3338 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { 3348 if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
3339 spin_unlock(&OCFS2_I(dir)->ip_lock); 3349 spin_unlock(&OCFS2_I(dir)->ip_lock);
3340 ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh); 3350 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
3341 num_free_extents = ocfs2_num_free_extents(osb, dir, &et); 3351 parent_fe_bh);
3352 num_free_extents = ocfs2_num_free_extents(osb, &et);
3342 if (num_free_extents < 0) { 3353 if (num_free_extents < 0) {
3343 status = num_free_extents; 3354 status = num_free_extents;
3344 mlog_errno(status); 3355 mlog_errno(status);
@@ -3387,9 +3398,9 @@ do_extend:
3387 goto bail; 3398 goto bail;
3388 } 3399 }
3389 3400
3390 ocfs2_set_new_buffer_uptodate(dir, new_bh); 3401 ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), new_bh);
3391 3402
3392 status = ocfs2_journal_access_db(handle, dir, new_bh, 3403 status = ocfs2_journal_access_db(handle, INODE_CACHE(dir), new_bh,
3393 OCFS2_JOURNAL_ACCESS_CREATE); 3404 OCFS2_JOURNAL_ACCESS_CREATE);
3394 if (status < 0) { 3405 if (status < 0) {
3395 mlog_errno(status); 3406 mlog_errno(status);
@@ -3829,7 +3840,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3829 (unsigned long long)OCFS2_I(dir)->ip_blkno, 3840 (unsigned long long)OCFS2_I(dir)->ip_blkno,
3830 (unsigned long long)leaf_blkno, insert_hash); 3841 (unsigned long long)leaf_blkno, insert_hash);
3831 3842
3832 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); 3843 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
3833 3844
3834 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data; 3845 dx_root = (struct ocfs2_dx_root_block *)dx_root_bh->b_data;
3835 /* 3846 /*
@@ -3885,7 +3896,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3885 } 3896 }
3886 did_quota = 1; 3897 did_quota = 1;
3887 3898
3888 ret = ocfs2_journal_access_dl(handle, dir, dx_leaf_bh, 3899 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir), dx_leaf_bh,
3889 OCFS2_JOURNAL_ACCESS_WRITE); 3900 OCFS2_JOURNAL_ACCESS_WRITE);
3890 if (ret) { 3901 if (ret) {
3891 mlog_errno(ret); 3902 mlog_errno(ret);
@@ -3949,7 +3960,8 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3949 } 3960 }
3950 3961
3951 for (i = 0; i < num_dx_leaves; i++) { 3962 for (i = 0; i < num_dx_leaves; i++) {
3952 ret = ocfs2_journal_access_dl(handle, dir, orig_dx_leaves[i], 3963 ret = ocfs2_journal_access_dl(handle, INODE_CACHE(dir),
3964 orig_dx_leaves[i],
3953 OCFS2_JOURNAL_ACCESS_WRITE); 3965 OCFS2_JOURNAL_ACCESS_WRITE);
3954 if (ret) { 3966 if (ret) {
3955 mlog_errno(ret); 3967 mlog_errno(ret);
@@ -4165,7 +4177,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4165 * failure to add the dx_root_bh to the journal won't result 4177 * failure to add the dx_root_bh to the journal won't result
4166 * in us losing clusters. 4178 * in us losing clusters.
4167 */ 4179 */
4168 ret = ocfs2_journal_access_dr(handle, dir, dx_root_bh, 4180 ret = ocfs2_journal_access_dr(handle, INODE_CACHE(dir), dx_root_bh,
4169 OCFS2_JOURNAL_ACCESS_WRITE); 4181 OCFS2_JOURNAL_ACCESS_WRITE);
4170 if (ret) { 4182 if (ret) {
4171 mlog_errno(ret); 4183 mlog_errno(ret);
@@ -4207,9 +4219,8 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
4207 4219
4208 /* This should never fail considering we start with an empty 4220 /* This should never fail considering we start with an empty
4209 * dx_root. */ 4221 * dx_root. */
4210 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); 4222 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
4211 ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, 4223 ret = ocfs2_insert_extent(handle, &et, 0, insert_blkno, 1, 0, NULL);
4212 insert_blkno, 1, 0, NULL);
4213 if (ret) 4224 if (ret)
4214 mlog_errno(ret); 4225 mlog_errno(ret);
4215 did_quota = 0; 4226 did_quota = 0;
@@ -4469,7 +4480,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
4469 goto out_unlock; 4480 goto out_unlock;
4470 } 4481 }
4471 4482
4472 ret = ocfs2_journal_access_di(handle, dir, di_bh, 4483 ret = ocfs2_journal_access_di(handle, INODE_CACHE(dir), di_bh,
4473 OCFS2_JOURNAL_ACCESS_WRITE); 4484 OCFS2_JOURNAL_ACCESS_WRITE);
4474 if (ret) { 4485 if (ret) {
4475 mlog_errno(ret); 4486 mlog_errno(ret);
@@ -4532,7 +4543,7 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
4532 if (ocfs2_dx_root_inline(dx_root)) 4543 if (ocfs2_dx_root_inline(dx_root))
4533 goto remove_index; 4544 goto remove_index;
4534 4545
4535 ocfs2_init_dx_root_extent_tree(&et, dir, dx_root_bh); 4546 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
4536 4547
4537 /* XXX: What if dr_clusters is too large? */ 4548 /* XXX: What if dr_clusters is too large? */
4538 while (le32_to_cpu(dx_root->dr_clusters)) { 4549 while (le32_to_cpu(dx_root->dr_clusters)) {
@@ -4565,7 +4576,7 @@ remove_index:
4565 goto out; 4576 goto out;
4566 } 4577 }
4567 4578
4568 ocfs2_remove_from_cache(dir, dx_root_bh); 4579 ocfs2_remove_from_cache(INODE_CACHE(dir), dx_root_bh);
4569out: 4580out:
4570 ocfs2_schedule_truncate_log_flush(osb, 1); 4581 ocfs2_schedule_truncate_log_flush(osb, 1);
4571 ocfs2_run_deallocs(osb, &dealloc); 4582 ocfs2_run_deallocs(osb, &dealloc);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 81eff8e58322..01cf8cc3d286 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 75997b4deaf3..ca96bce50e18 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index df52f706f669..ca46002ec10e 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -27,7 +27,6 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/utsname.h>
31#include <linux/sysctl.h> 30#include <linux/sysctl.h>
32#include <linux/spinlock.h> 31#include <linux/spinlock.h>
33#include <linux/debugfs.h> 32#include <linux/debugfs.h>
@@ -683,7 +682,7 @@ static int lockres_seq_show(struct seq_file *s, void *v)
683 return 0; 682 return 0;
684} 683}
685 684
686static struct seq_operations debug_lockres_ops = { 685static const struct seq_operations debug_lockres_ops = {
687 .start = lockres_seq_start, 686 .start = lockres_seq_start,
688 .stop = lockres_seq_stop, 687 .stop = lockres_seq_stop,
689 .next = lockres_seq_next, 688 .next = lockres_seq_next,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 4d9e6b288dd8..0334000676d3 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -28,7 +28,6 @@
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/utsname.h>
32#include <linux/init.h> 31#include <linux/init.h>
33#include <linux/spinlock.h> 32#include <linux/spinlock.h>
34#include <linux/delay.h> 33#include <linux/delay.h>
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 83a9f2972ac8..437698e9465f 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index f8b653fcd4dd..83bcaf266b35 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 43e6e3280569..d9fa3d22e17c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d490b66ad9d7..52ec020ea78b 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
@@ -212,14 +211,18 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
212 spin_lock(&dlm->spinlock); 211 spin_lock(&dlm->spinlock);
213 } 212 }
214 213
214 spin_lock(&res->spinlock);
215 if (!list_empty(&res->purge)) { 215 if (!list_empty(&res->purge)) {
216 mlog(0, "removing lockres %.*s:%p from purgelist, " 216 mlog(0, "removing lockres %.*s:%p from purgelist, "
217 "master = %d\n", res->lockname.len, res->lockname.name, 217 "master = %d\n", res->lockname.len, res->lockname.name,
218 res, master); 218 res, master);
219 list_del_init(&res->purge); 219 list_del_init(&res->purge);
220 spin_unlock(&res->spinlock);
220 dlm_lockres_put(res); 221 dlm_lockres_put(res);
221 dlm->purge_count--; 222 dlm->purge_count--;
222 } 223 } else
224 spin_unlock(&res->spinlock);
225
223 __dlm_unhash_lockres(res); 226 __dlm_unhash_lockres(res);
224 227
225 /* lockres is not in the hash now. drop the flag and wake up 228 /* lockres is not in the hash now. drop the flag and wake up
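The dlmthread.c fix above takes res->spinlock around the purge-list test and removal, releasing it on both branches, so the list_empty() check and list_del_init() can no longer race with another CPU touching the list. A user-space model of the corrected locking, with a mutex standing in for the spinlock and simplified types:

#include <pthread.h>
#include <stdio.h>

struct lockres {
	pthread_mutex_t lock;	/* stand-in for res->spinlock */
	int on_purge_list;
};

static void purge(struct lockres *res, int *purge_count)
{
	pthread_mutex_lock(&res->lock);
	if (res->on_purge_list) {
		res->on_purge_list = 0;	/* models list_del_init() */
		pthread_mutex_unlock(&res->lock);
		(*purge_count)--;	/* bookkeeping outside the lock */
	} else {
		pthread_mutex_unlock(&res->lock);
	}
}

int main(void)
{
	struct lockres res = { PTHREAD_MUTEX_INITIALIZER, 1 };
	int purge_count = 1;

	purge(&res, &purge_count);
	printf("purge_count=%d on_list=%d\n", purge_count,
	       res.on_purge_list);
	return 0;
}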
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 756f5b0998e0..00f53b2aea76 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -30,7 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/highmem.h> 32#include <linux/highmem.h>
33#include <linux/utsname.h>
34#include <linux/init.h> 33#include <linux/init.h>
35#include <linux/sysctl.h> 34#include <linux/sysctl.h>
36#include <linux/random.h> 35#include <linux/random.h>
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 110bb57c46ab..0d38d67194cb 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -53,6 +53,7 @@
53#include "super.h" 53#include "super.h"
54#include "uptodate.h" 54#include "uptodate.h"
55#include "quota.h" 55#include "quota.h"
56#include "refcounttree.h"
56 57
57#include "buffer_head_io.h" 58#include "buffer_head_io.h"
58 59
@@ -110,6 +111,11 @@ static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
110 111
111static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres); 112static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
112 113
114static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
115 int new_level);
116static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
117 int blocking);
118
113#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) 119#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
114 120
115/* This aids in debugging situations where a bad LVB might be involved. */ 121/* This aids in debugging situations where a bad LVB might be involved. */
@@ -278,6 +284,12 @@ static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
278 .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB, 284 .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
279}; 285};
280 286
287static struct ocfs2_lock_res_ops ocfs2_refcount_block_lops = {
288 .check_downconvert = ocfs2_check_refcount_downconvert,
289 .downconvert_worker = ocfs2_refcount_convert_worker,
290 .flags = 0,
291};
292
281static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 293static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
282{ 294{
283 return lockres->l_type == OCFS2_LOCK_TYPE_META || 295 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -306,6 +318,12 @@ static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_re
 	return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
 }
 
+static inline struct ocfs2_refcount_tree *
+ocfs2_lock_res_refcount_tree(struct ocfs2_lock_res *res)
+{
+	return container_of(res, struct ocfs2_refcount_tree, rf_lockres);
+}
+
 static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
 {
 	if (lockres->l_ops->get_osb)
@@ -693,6 +711,17 @@ void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
 				   info);
 }
 
+void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
+				  struct ocfs2_super *osb, u64 ref_blkno,
+				  unsigned int generation)
+{
+	ocfs2_lock_res_init_once(lockres);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_REFCOUNT, ref_blkno,
+			      generation, lockres->l_name);
+	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_REFCOUNT,
+				   &ocfs2_refcount_block_lops, osb);
+}
+
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 {
 	mlog_entry_void();
@@ -1548,8 +1577,10 @@ int ocfs2_rw_lock(struct inode *inode, int write)
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     write ? "EXMODE" : "PRMODE");
 
-	if (ocfs2_mount_local(osb))
+	if (ocfs2_mount_local(osb)) {
+		mlog_exit(0);
 		return 0;
+	}
 
 	lockres = &OCFS2_I(inode)->ip_rw_lockres;
 
@@ -2127,7 +2158,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 
 	/* This will discard any caching information we might have had
 	 * for the inode metadata. */
-	ocfs2_metadata_cache_purge(inode);
+	ocfs2_metadata_cache_purge(INODE_CACHE(inode));
 
 	ocfs2_extent_map_trunc(inode, 0);
 
@@ -3009,6 +3040,7 @@ static void ocfs2_unlock_ast(void *opaque, int error)
3009 "unlock_action %d\n", error, lockres->l_name, 3040 "unlock_action %d\n", error, lockres->l_name,
3010 lockres->l_unlock_action); 3041 lockres->l_unlock_action);
3011 spin_unlock_irqrestore(&lockres->l_lock, flags); 3042 spin_unlock_irqrestore(&lockres->l_lock, flags);
3043 mlog_exit_void();
3012 return; 3044 return;
3013 } 3045 }
3014 3046
@@ -3495,11 +3527,11 @@ out:
 	return UNBLOCK_CONTINUE;
 }
 
-static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
-					int new_level)
+static int ocfs2_ci_checkpointed(struct ocfs2_caching_info *ci,
+				 struct ocfs2_lock_res *lockres,
+				 int new_level)
 {
-	struct inode *inode = ocfs2_lock_res_inode(lockres);
-	int checkpointed = ocfs2_inode_fully_checkpointed(inode);
+	int checkpointed = ocfs2_ci_fully_checkpointed(ci);
 
 	BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
 	BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
@@ -3507,10 +3539,18 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
 	if (checkpointed)
 		return 1;
 
-	ocfs2_start_checkpoint(OCFS2_SB(inode->i_sb));
+	ocfs2_start_checkpoint(OCFS2_SB(ocfs2_metadata_cache_get_super(ci)));
 	return 0;
 }
 
+static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
+					int new_level)
+{
+	struct inode *inode = ocfs2_lock_res_inode(lockres);
+
+	return ocfs2_ci_checkpointed(INODE_CACHE(inode), lockres, new_level);
+}
+
 static void ocfs2_set_meta_lvb(struct ocfs2_lock_res *lockres)
 {
 	struct inode *inode = ocfs2_lock_res_inode(lockres);
@@ -3640,6 +3680,26 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 	return UNBLOCK_CONTINUE_POST;
 }
 
+static int ocfs2_check_refcount_downconvert(struct ocfs2_lock_res *lockres,
+					    int new_level)
+{
+	struct ocfs2_refcount_tree *tree =
+				ocfs2_lock_res_refcount_tree(lockres);
+
+	return ocfs2_ci_checkpointed(&tree->rf_ci, lockres, new_level);
+}
+
+static int ocfs2_refcount_convert_worker(struct ocfs2_lock_res *lockres,
+					 int blocking)
+{
+	struct ocfs2_refcount_tree *tree =
+				ocfs2_lock_res_refcount_tree(lockres);
+
+	ocfs2_metadata_cache_purge(&tree->rf_ci);
+
+	return UNBLOCK_CONTINUE;
+}
+
 static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
 {
 	struct ocfs2_qinfo_lvb *lvb;
@@ -3752,6 +3812,37 @@ bail:
 	return status;
 }
 
+int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex)
+{
+	int status;
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
+	struct ocfs2_super *osb = lockres->l_priv;
+
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+	return status;
+}
+
+void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex)
+{
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	struct ocfs2_lock_res *lockres = &ref_tree->rf_lockres;
+	struct ocfs2_super *osb = lockres->l_priv;
+
+	if (!ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres, level);
+}
+
 /*
  * This is the filesystem locking protocol. It provides the lock handling
  * hooks for the underlying DLM. It has a maximum version number.
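
The ocfs2_refcount_lock()/ocfs2_refcount_unlock() pair added above follows the same conventions as the other cluster-lock wrappers in dlmglue.c: bail out early on hard-readonly and local mounts, EX for writers, PR for readers. A hedged sketch of a caller (the function below is illustrative, not part of the patch):

/* Illustrative caller; error paths trimmed. */
static int example_read_refcount_records(struct ocfs2_refcount_tree *tree)
{
	int ret;

	ret = ocfs2_refcount_lock(tree, 0);	/* 0 => DLM_LOCK_PR, shared */
	if (ret < 0)
		return ret;			/* e.g. -EROFS when hard-readonly */

	/* ... read refcount records under the PR lock ... */

	ocfs2_refcount_unlock(tree, 0);		/* must match the lock level */
	return 0;
}
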
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 7553836931de..d1ce48e1b3d6 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -101,6 +101,9 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 struct ocfs2_mem_dqinfo;
 void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
 			       struct ocfs2_mem_dqinfo *info);
+void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
+				  struct ocfs2_super *osb, u64 ref_blkno,
+				  unsigned int generation);
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
@@ -148,6 +151,9 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock);
 void ocfs2_file_unlock(struct file *file);
 int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
 void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+struct ocfs2_refcount_tree;
+int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
+void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
 
 
 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index f2bb1a04d253..843db64e9d4a 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 
-	ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
+	ret = ocfs2_read_extent_block(INODE_CACHE(inode), last_eb_blk, &eb_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -353,11 +353,11 @@ static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
  * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
  * containing el.
  */
-static int ocfs2_figure_hole_clusters(struct inode *inode,
-				      struct ocfs2_extent_list *el,
-				      struct buffer_head *eb_bh,
-				      u32 v_cluster,
-				      u32 *num_clusters)
+int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
+			       struct ocfs2_extent_list *el,
+			       struct buffer_head *eb_bh,
+			       u32 v_cluster,
+			       u32 *num_clusters)
 {
 	int ret, i;
 	struct buffer_head *next_eb_bh = NULL;
@@ -375,7 +375,7 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
 		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
 			goto no_more_extents;
 
-		ret = ocfs2_read_extent_block(inode,
+		ret = ocfs2_read_extent_block(ci,
 					      le64_to_cpu(eb->h_next_leaf_blk),
 					      &next_eb_bh);
 		if (ret) {
@@ -428,7 +428,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
 	tree_height = le16_to_cpu(el->l_tree_depth);
 
 	if (tree_height > 0) {
-		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
+				      &eb_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -455,7 +456,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
 	 * field.
 	 */
 	if (hole_len) {
-		ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
+		ret = ocfs2_figure_hole_clusters(INODE_CACHE(inode),
+						 el, eb_bh,
 						 v_cluster, &len);
 		if (ret) {
 			mlog_errno(ret);
@@ -539,7 +541,8 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb,
 
 int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
 			     u32 *p_cluster, u32 *num_clusters,
-			     struct ocfs2_extent_list *el)
+			     struct ocfs2_extent_list *el,
+			     unsigned int *extent_flags)
 {
 	int ret = 0, i;
 	struct buffer_head *eb_bh = NULL;
@@ -548,7 +551,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
 	u32 coff;
 
 	if (el->l_tree_depth) {
-		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+		ret = ocfs2_find_leaf(INODE_CACHE(inode), el, v_cluster,
+				      &eb_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -590,6 +594,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
 		*p_cluster = *p_cluster + coff;
 		if (num_clusters)
 			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+
+		if (extent_flags)
+			*extent_flags = rec->e_flags;
 	}
 out:
 	if (eb_bh)
@@ -862,8 +869,8 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
 			BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
 	}
 
-	rc = ocfs2_read_blocks(inode, p_block, count, bhs + done,
-			       flags, validate);
+	rc = ocfs2_read_blocks(INODE_CACHE(inode), p_block, count,
+			       bhs + done, flags, validate);
 	if (rc) {
 		mlog_errno(rc);
 		break;
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index b7dd9731b462..e79d41c2c909 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -55,12 +55,18 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 
 int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
 			     u32 *p_cluster, u32 *num_clusters,
-			     struct ocfs2_extent_list *el);
+			     struct ocfs2_extent_list *el,
+			     unsigned int *extent_flags);
 
 int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
 			   struct buffer_head *bhs[], int flags,
 			   int (*validate)(struct super_block *sb,
 					   struct buffer_head *bh));
+int ocfs2_figure_hole_clusters(struct ocfs2_caching_info *ci,
+			       struct ocfs2_extent_list *el,
+			       struct buffer_head *eb_bh,
+			       u32 v_cluster,
+			       u32 *num_clusters);
 static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
 					struct buffer_head **bh,
 					int (*validate)(struct super_block *sb,
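
The new extent_flags out-parameter threaded through ocfs2_xattr_get_clusters() above copies rec->e_flags back to the caller, which is what lets xattr code notice refcounted extents. A hedged usage sketch (the caller below is illustrative, not part of the patch):

/* Returns 1 if the extent covering v_cluster is refcounted, 0 if not,
 * or a negative error. */
static int example_is_xattr_extent_refcounted(struct inode *inode,
					      u32 v_cluster,
					      struct ocfs2_extent_list *el)
{
	u32 p_cluster, num_clusters;
	unsigned int flags = 0;
	int ret;

	ret = ocfs2_xattr_get_clusters(inode, v_cluster, &p_cluster,
				       &num_clusters, el, &flags);
	if (ret)
		return ret;

	/* rec->e_flags was copied out, so OCFS2_EXT_REFCOUNTED is visible */
	return (flags & OCFS2_EXT_REFCOUNTED) ? 1 : 0;
}
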
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 221c5e98957b..89fc8ee1f5a5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -59,6 +59,7 @@
 #include "xattr.h"
 #include "acl.h"
 #include "quota.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -259,7 +260,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access_di(handle, inode, bh,
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
@@ -334,6 +335,39 @@ out:
 	return ret;
 }
 
+static int ocfs2_cow_file_pos(struct inode *inode,
+			      struct buffer_head *fe_bh,
+			      u64 offset)
+{
+	int status;
+	u32 phys, cpos = offset >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
+
+	/*
+	 * If the new offset is aligned to a cluster boundary, there is
+	 * no space for ocfs2_zero_range_for_truncate to fill, so no need
+	 * to CoW either.
+	 */
+	if ((offset & (OCFS2_SB(inode->i_sb)->s_clustersize - 1)) == 0)
+		return 0;
+
+	status = ocfs2_get_clusters(inode, cpos, &phys,
+				    &num_clusters, &ext_flags);
+	if (status) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
+		goto out;
+
+	return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1);
+
+out:
+	return status;
+}
+
 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 				     struct inode *inode,
 				     struct buffer_head *fe_bh,
@@ -346,6 +380,17 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 
 	mlog_entry_void();
 
+	/*
+	 * We need to CoW the cluster that contains the offset if it is
+	 * reflinked, since ocfs2_zero_range_for_truncate will later
+	 * write zeros from the offset to the end of the cluster.
+	 */
+	status = ocfs2_cow_file_pos(inode, fe_bh, new_i_size);
+	if (status) {
+		mlog_errno(status);
+		return status;
+	}
+
 	/* TODO: This needs to actually orphan the inode in this
 	 * transaction. */
 
@@ -356,7 +401,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 		goto out;
 	}
 
-	status = ocfs2_journal_access_di(handle, inode, fe_bh,
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -486,6 +531,8 @@ bail_unlock_sem:
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
 bail:
+	if (!status && OCFS2_I(inode)->ip_clusters == 0)
+		status = ocfs2_try_remove_refcount_tree(inode, di_bh);
 
 	mlog_exit(status);
 	return status;
@@ -515,11 +562,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
 	int ret;
 	struct ocfs2_extent_tree et;
 
-	ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
-	ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
-					  clusters_to_add, mark_unwritten,
-					  &et, handle,
-					  data_ac, meta_ac, reason_ret);
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
+	ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
+					  clusters_to_add, mark_unwritten,
+					  data_ac, meta_ac, reason_ret);
 
 	return ret;
 }
@@ -564,7 +610,7 @@ restart_all:
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
 	     clusters_to_add);
-	ocfs2_init_dinode_extent_tree(&et, inode, bh);
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
 	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
 				       &data_ac, &meta_ac);
 	if (status) {
@@ -593,7 +639,7 @@ restarted_transaction:
 	/* reserve a write to the file entry early on - that way if we
 	 * run out of credits in the allocation path, we can still
 	 * update i_size. */
-	status = ocfs2_journal_access_di(handle, inode, bh,
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1131,7 +1177,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access_di(handle, inode, bh,
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -1395,7 +1441,7 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 	struct address_space *mapping = inode->i_mapping;
 	struct ocfs2_extent_tree et;
 
-	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
+	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
 	ocfs2_init_dealloc_ctxt(&dealloc);
 
 	if (byte_len == 0)
@@ -1657,6 +1703,70 @@ static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset,
 					 OCFS2_IOC_RESVSP64, &sr, change_size);
 }
 
+int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
+				   size_t count)
+{
+	int ret = 0;
+	unsigned int extent_flags;
+	u32 cpos, clusters, extent_len, phys_cpos;
+	struct super_block *sb = inode->i_sb;
+
+	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
+	    !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
+		return 0;
+
+	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+	while (clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+					 &extent_flags);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (phys_cpos && (extent_flags & OCFS2_EXT_REFCOUNTED)) {
+			ret = 1;
+			break;
+		}
+
+		if (extent_len > clusters)
+			extent_len = clusters;
+
+		clusters -= extent_len;
+		cpos += extent_len;
+	}
+out:
+	return ret;
+}
+
+static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
+					    loff_t pos, size_t count,
+					    int *meta_level)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	u32 clusters =
+		ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*meta_level = 1;
+
+	ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
+	if (ret)
+		mlog_errno(ret);
+out:
+	brelse(di_bh);
+	return ret;
+}
+
 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 					 loff_t *ppos,
 					 size_t count,
@@ -1713,6 +1823,22 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 
 	end = saved_pos + count;
 
+	ret = ocfs2_check_range_for_refcount(inode, saved_pos, count);
+	if (ret == 1) {
+		ocfs2_inode_unlock(inode, meta_level);
+		meta_level = -1;
+
+		ret = ocfs2_prepare_inode_for_refcount(inode,
+						       saved_pos,
+						       count,
+						       &meta_level);
+	}
+
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_unlock;
+	}
+
 	/*
 	 * Skip the O_DIRECT checks if we don't need
 	 * them.
@@ -1759,7 +1885,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 	*ppos = saved_pos;
 
 out_unlock:
-	ocfs2_inode_unlock(inode, meta_level);
+	if (meta_level >= 0)
+		ocfs2_inode_unlock(inode, meta_level);
 
 out:
 	return ret;
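
The write-path checks added to file.c above convert a byte range into a cluster range before walking extents: cpos is the first cluster touched and the count comes from rounding the end offset up, as ocfs2_clusters_for_bytes() does. A tiny worked example of that arithmetic, assuming 4 KB clusters (s_clustersize_bits == 12):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	const unsigned bits = 12;	/* assumed cluster size: 4096 bytes */
	uint64_t pos = 5000, count = 10000;

	uint32_t cpos = pos >> bits;				  /* = 1 */
	uint32_t end = (pos + count + (1u << bits) - 1) >> bits; /* = 4, rounded up */
	uint32_t clusters = end - cpos;				  /* = 3 clusters to check */

	assert(cpos == 1 && clusters == 3);
	return 0;
}
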
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 172f9fbc9fc7..d66cf4f7c70e 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -69,4 +69,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
 int ocfs2_change_file_space(struct file *file, unsigned int cmd,
 			    struct ocfs2_space_resv *sr);
 
+int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
+				   size_t count);
 #endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 4dc8890ba316..0297fb8982b8 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -53,6 +53,7 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "xattr.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -562,7 +563,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 		goto out;
 	}
 
-	status = ocfs2_journal_access_di(handle, inode, fe_bh,
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
+					 fe_bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -646,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 	}
 
 	/* set the inode's dtime */
-	status = ocfs2_journal_access_di(handle, inode, di_bh,
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -662,7 +664,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 		goto bail_commit;
 	}
 
-	ocfs2_remove_from_cache(inode, di_bh);
+	ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh);
 	vfs_dq_free_inode(inode);
 
 	status = ocfs2_free_dinode(handle, inode_alloc_inode,
@@ -781,6 +783,12 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		goto bail_unlock_dir;
 	}
 
+	status = ocfs2_remove_refcount_tree(inode, di_bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail_unlock_dir;
+	}
+
 	status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
 				    orphan_dir_bh);
 	if (status < 0)
@@ -1112,13 +1120,14 @@ void ocfs2_clear_inode(struct inode *inode)
 	ocfs2_lock_res_free(&oi->ip_inode_lockres);
 	ocfs2_lock_res_free(&oi->ip_open_lockres);
 
-	ocfs2_metadata_cache_purge(inode);
+	ocfs2_metadata_cache_exit(INODE_CACHE(inode));
 
-	mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached,
+	mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached,
 			"Clear inode of %llu, inode has %u cache items\n",
-			(unsigned long long)oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached);
+			(unsigned long long)oi->ip_blkno,
+			INODE_CACHE(inode)->ci_num_cached);
 
-	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
+	mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE),
 			"Clear inode of %llu, inode has a bad flag\n",
 			(unsigned long long)oi->ip_blkno);
 
@@ -1145,9 +1154,7 @@ void ocfs2_clear_inode(struct inode *inode)
 			(unsigned long long)oi->ip_blkno, oi->ip_open_count);
 
 	/* Clear all other flags. */
-	oi->ip_flags = OCFS2_INODE_CACHE_INLINE;
-	oi->ip_created_trans = 0;
-	oi->ip_last_trans = 0;
+	oi->ip_flags = 0;
 	oi->ip_dir_start_lookup = 0;
 	oi->ip_blkno = 0ULL;
 
@@ -1239,7 +1246,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	mlog_entry("(inode %llu)\n",
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	status = ocfs2_journal_access_di(handle, inode, bh,
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1380,8 +1387,8 @@ int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
 	int rc;
 	struct buffer_head *tmp = *bh;
 
-	rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp,
-			       flags, ocfs2_validate_inode_block);
+	rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno,
+			       1, &tmp, flags, ocfs2_validate_inode_block);
 
 	/* If ocfs2_read_blocks() got us a new bh, pass it up. */
 	if (!rc && !*bh)
@@ -1394,3 +1401,56 @@ int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
 {
 	return ocfs2_read_inode_block_full(inode, bh, 0);
 }
+
+
+static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+	return oi->ip_blkno;
+}
+
+static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+	return oi->vfs_inode.i_sb;
+}
+
+static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+	spin_lock(&oi->ip_lock);
+}
+
+static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+	spin_unlock(&oi->ip_lock);
+}
+
+static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+	mutex_lock(&oi->ip_io_mutex);
+}
+
+static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+	struct ocfs2_inode_info *oi = cache_info_to_inode(ci);
+
+	mutex_unlock(&oi->ip_io_mutex);
+}
+
+const struct ocfs2_caching_operations ocfs2_inode_caching_ops = {
+	.co_owner		= ocfs2_inode_cache_owner,
+	.co_get_super		= ocfs2_inode_cache_get_super,
+	.co_cache_lock		= ocfs2_inode_cache_lock,
+	.co_cache_unlock	= ocfs2_inode_cache_unlock,
+	.co_io_lock		= ocfs2_inode_cache_io_lock,
+	.co_io_unlock		= ocfs2_inode_cache_io_unlock,
+};
+
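
The ocfs2_inode_caching_ops vector defined above is what lets the metadata-cache code stop assuming its owner is an inode: each owner supplies identity and locking callbacks, and generic code only dispatches through ci->ci_ops. A sketch of the generic side with a plausible body (the real helpers live in uptodate.c and may differ):

/* Plausible generic-side dispatcher, shown for illustration only. */
static inline void example_metadata_cache_io_lock(struct ocfs2_caching_info *ci)
{
	/* For inodes this lands in ocfs2_inode_cache_io_lock() above,
	 * i.e. mutex_lock(&oi->ip_io_mutex); refcount trees plug in
	 * their own callback instead. */
	ci->ci_ops->co_io_lock(ci);
}
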
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ea71525aad41..ba4fe07b293c 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -60,12 +60,6 @@ struct ocfs2_inode_info
 
 	u32 ip_dir_start_lookup;
 
-	/* next two are protected by trans_inc_lock */
-	/* which transaction were we created on? Zero if none. */
-	unsigned long ip_created_trans;
-	/* last transaction we were a part of. */
-	unsigned long ip_last_trans;
-
 	struct ocfs2_caching_info ip_metadata_cache;
 
 	struct ocfs2_extent_map ip_extent_map;
@@ -106,8 +100,6 @@ struct ocfs2_inode_info
 #define OCFS2_INODE_MAYBE_ORPHANED	0x00000020
 /* Does someone have the file open O_DIRECT */
 #define OCFS2_INODE_OPEN_DIRECT		0x00000040
-/* Indicates that the metadata cache should be used as an array. */
-#define OCFS2_INODE_CACHE_INLINE	0x00000080
 
 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 {
@@ -120,6 +112,12 @@ static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 extern struct kmem_cache *ocfs2_inode_cache;
 
 extern const struct address_space_operations ocfs2_aops;
+extern const struct ocfs2_caching_operations ocfs2_inode_caching_ops;
+
+static inline struct ocfs2_caching_info *INODE_CACHE(struct inode *inode)
+{
+	return &OCFS2_I(inode)->ip_metadata_cache;
+}
 
 void ocfs2_clear_inode(struct inode *inode);
 void ocfs2_delete_inode(struct inode *inode);
@@ -172,4 +170,10 @@ int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
 /* The same, but can be passed OCFS2_BH_* flags */
 int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
 				int flags);
+
+static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_info *ci)
+{
+	return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache);
+}
+
 #endif /* OCFS2_INODE_H */
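
INODE_CACHE() and cache_info_to_inode() added above are inverses by construction, since container_of() just undoes the member-address step. An illustrative check (not part of the patch):

/* Illustrative sanity check only. */
static void example_round_trip(struct inode *inode)
{
	struct ocfs2_caching_info *ci = INODE_CACHE(inode);

	/* container_of() exactly undoes &OCFS2_I(inode)->ip_metadata_cache */
	BUG_ON(cache_info_to_inode(ci) != OCFS2_I(inode));
}
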
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 467b413bec21..31fbb0619510 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -21,6 +21,7 @@
 #include "ocfs2_fs.h"
 #include "ioctl.h"
 #include "resize.h"
+#include "refcounttree.h"
 
 #include <linux/ext2_fs.h>
 
@@ -115,6 +116,9 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	int status;
 	struct ocfs2_space_resv sr;
 	struct ocfs2_new_group_input input;
+	struct reflink_arguments args;
+	const char *old_path, *new_path;
+	bool preserve;
 
 	switch (cmd) {
 	case OCFS2_IOC_GETFLAGS:
@@ -160,6 +164,15 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			return -EFAULT;
 
 		return ocfs2_group_add(inode, &input);
+	case OCFS2_IOC_REFLINK:
+		if (copy_from_user(&args, (struct reflink_arguments *)arg,
+				   sizeof(args)))
+			return -EFAULT;
+		old_path = (const char *)(unsigned long)args.old_path;
+		new_path = (const char *)(unsigned long)args.new_path;
+		preserve = (args.preserve != 0);
+
+		return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
 	default:
 		return -ENOTTY;
 	}
@@ -182,6 +195,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case OCFS2_IOC_GROUP_EXTEND:
 	case OCFS2_IOC_GROUP_ADD:
 	case OCFS2_IOC_GROUP_ADD64:
+	case OCFS2_IOC_REFLINK:
 		break;
 	default:
 		return -ENOIOCTLCMD;
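
From user space, OCFS2_IOC_REFLINK takes two path strings passed as 64-bit integers, so one structure layout serves 32- and 64-bit callers; that is also why the compat handler above can simply whitelist the command. A hedged sketch of a caller — the structure mirrors the fields the handler reads, but the canonical definitions of both the struct and OCFS2_IOC_REFLINK live in ocfs2_fs.h:

/* Hypothetical user-space caller; error handling trimmed. */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>

struct reflink_arguments {
	uint64_t old_path;	/* user pointer, cast to a fixed-width int */
	uint64_t new_path;
	uint64_t preserve;	/* non-zero: preserve security attributes */
};

static int do_reflink(int fd_on_ocfs2, const char *src, const char *dst,
		      unsigned long reflink_cmd /* OCFS2_IOC_REFLINK */)
{
	struct reflink_arguments args;

	memset(&args, 0, sizeof(args));
	args.old_path = (uint64_t)(uintptr_t)src;
	args.new_path = (uint64_t)(uintptr_t)dst;
	args.preserve = 1;

	return ioctl(fd_on_ocfs2, reflink_cmd, &args);
}
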
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index c48b93ac6b65..54c16b66327e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -48,6 +48,7 @@
 #include "slot_map.h"
 #include "super.h"
 #include "sysfile.h"
+#include "uptodate.h"
 #include "quota.h"
 
 #include "buffer_head_io.h"
@@ -554,6 +555,14 @@ static struct ocfs2_triggers eb_triggers = {
 	.ot_offset	= offsetof(struct ocfs2_extent_block, h_check),
 };
 
+static struct ocfs2_triggers rb_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_refcount_block, rf_check),
+};
+
 static struct ocfs2_triggers gd_triggers = {
 	.ot_triggers = {
 		.t_commit = ocfs2_commit_trigger,
@@ -601,14 +610,16 @@ static struct ocfs2_triggers dl_triggers = {
 };
 
 static int __ocfs2_journal_access(handle_t *handle,
-				  struct inode *inode,
+				  struct ocfs2_caching_info *ci,
 				  struct buffer_head *bh,
 				  struct ocfs2_triggers *triggers,
 				  int type)
 {
 	int status;
+	struct ocfs2_super *osb =
+		OCFS2_SB(ocfs2_metadata_cache_get_super(ci));
 
-	BUG_ON(!inode);
+	BUG_ON(!ci || !ci->ci_ops);
 	BUG_ON(!handle);
 	BUG_ON(!bh);
 
@@ -627,15 +638,15 @@ static int __ocfs2_journal_access(handle_t *handle,
 		BUG();
 	}
 
-	/* Set the current transaction information on the inode so
+	/* Set the current transaction information on the ci so
 	 * that the locking code knows whether it can drop its locks
-	 * on this inode or not. We're protected from the commit
+	 * on this ci or not. We're protected from the commit
 	 * thread updating the current transaction id until
 	 * ocfs2_commit_trans() because ocfs2_start_trans() took
 	 * j_trans_barrier for us. */
-	ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
+	ocfs2_set_ci_lock_trans(osb->journal, ci);
 
-	mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+	ocfs2_metadata_cache_io_lock(ci);
 	switch (type) {
 	case OCFS2_JOURNAL_ACCESS_CREATE:
 	case OCFS2_JOURNAL_ACCESS_WRITE:
@@ -650,9 +661,9 @@ static int __ocfs2_journal_access(handle_t *handle,
 		status = -EINVAL;
 		mlog(ML_ERROR, "Unknown access type!\n");
 	}
-	if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers)
+	if (!status && ocfs2_meta_ecc(osb) && triggers)
 		jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
-	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+	ocfs2_metadata_cache_io_unlock(ci);
 
 	if (status < 0)
 		mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
@@ -662,66 +673,65 @@ static int __ocfs2_journal_access(handle_t *handle,
 	return status;
 }
 
-int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type)
 {
-	return __ocfs2_journal_access(handle, inode, bh, &di_triggers,
-				      type);
+	return __ocfs2_journal_access(handle, ci, bh, &di_triggers, type);
 }
 
-int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type)
 {
-	return __ocfs2_journal_access(handle, inode, bh, &eb_triggers,
-				      type);
+	return __ocfs2_journal_access(handle, ci, bh, &eb_triggers, type);
 }
 
-int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type)
 {
-	return __ocfs2_journal_access(handle, inode, bh, &gd_triggers,
+	return __ocfs2_journal_access(handle, ci, bh, &rb_triggers,
 				      type);
 }
 
-int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type)
 {
-	return __ocfs2_journal_access(handle, inode, bh, &db_triggers,
-				      type);
+	return __ocfs2_journal_access(handle, ci, bh, &gd_triggers, type);
 }
 
-int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type)
 {
-	return __ocfs2_journal_access(handle, inode, bh, &xb_triggers,
-				      type);
+	return __ocfs2_journal_access(handle, ci, bh, &db_triggers, type);
 }
 
-int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type)
 {
-	return __ocfs2_journal_access(handle, inode, bh, &dq_triggers,
-				      type);
+	return __ocfs2_journal_access(handle, ci, bh, &xb_triggers, type);
 }
 
-int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type)
 {
-	return __ocfs2_journal_access(handle, inode, bh, &dr_triggers,
-				      type);
+	return __ocfs2_journal_access(handle, ci, bh, &dq_triggers, type);
 }
 
-int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type)
 {
-	return __ocfs2_journal_access(handle, inode, bh, &dl_triggers,
-				      type);
+	return __ocfs2_journal_access(handle, ci, bh, &dr_triggers, type);
+}
+
+int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, ci, bh, &dl_triggers, type);
 }
 
-int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
 			 struct buffer_head *bh, int type)
 {
-	return __ocfs2_journal_access(handle, inode, bh, NULL, type);
+	return __ocfs2_journal_access(handle, ci, bh, NULL, type);
 }
 
 int ocfs2_journal_dirty(handle_t *handle,
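
After this refactor every journal-access caller hands in a struct ocfs2_caching_info instead of an inode, which is why so much of the rest of this diff is mechanical INODE_CACHE(inode) conversion; non-inode owners such as refcount trees pass their own ci. A minimal before/after sketch (the caller is illustrative):

/* Illustrative caller; handle and di_bh are assumed valid. */
static int example_dirty_dinode(handle_t *handle, struct inode *inode,
				struct buffer_head *di_bh)
{
	/* old: ocfs2_journal_access_di(handle, inode, di_bh, ...);   */
	/* new: pass the owner's caching info instead of the inode:   */
	return ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				       OCFS2_JOURNAL_ACCESS_WRITE);
}
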
@@ -898,7 +908,7 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 		ocfs2_bump_recovery_generation(fe);
 
 	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
-	status = ocfs2_write_block(osb, bh, journal->j_inode);
+	status = ocfs2_write_block(osb, bh, INODE_CACHE(journal->j_inode));
 	if (status < 0)
 		mlog_errno(status);
 
@@ -1642,7 +1652,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 		ocfs2_get_recovery_generation(fe);
 
 	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
-	status = ocfs2_write_block(osb, bh, inode);
+	status = ocfs2_write_block(osb, bh, INODE_CACHE(inode));
 	if (status < 0)
 		mlog_errno(status);
 
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 2c3222aec622..3f74e09b0d80 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -90,56 +90,66 @@ static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
 	return old_id;
 }
 
-static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
-					      struct inode *inode)
+static inline void ocfs2_set_ci_lock_trans(struct ocfs2_journal *journal,
+					   struct ocfs2_caching_info *ci)
 {
 	spin_lock(&trans_inc_lock);
-	OCFS2_I(inode)->ip_last_trans = journal->j_trans_id;
+	ci->ci_last_trans = journal->j_trans_id;
 	spin_unlock(&trans_inc_lock);
 }
 
 /* Used to figure out whether it's safe to drop a metadata lock on an
- * inode. Returns true if all the inodes changes have been
+ * cached object. Returns true if all the object's changes have been
  * checkpointed to disk. You should be holding the spinlock on the
  * metadata lock while calling this to be sure that nobody can take
  * the lock and put it on another transaction. */
-static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
+static inline int ocfs2_ci_fully_checkpointed(struct ocfs2_caching_info *ci)
 {
 	int ret;
-	struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal;
+	struct ocfs2_journal *journal =
+		OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
 
 	spin_lock(&trans_inc_lock);
-	ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans);
+	ret = time_after(journal->j_trans_id, ci->ci_last_trans);
 	spin_unlock(&trans_inc_lock);
 	return ret;
 }
 
-/* convenience function to check if an inode is still new (has never
- * hit disk) Will do you a favor and set created_trans = 0 when you've
- * been checkpointed. returns '1' if the inode is still new. */
-static inline int ocfs2_inode_is_new(struct inode *inode)
+/* convenience function to check if an object backed by struct
+ * ocfs2_caching_info is still new (has never hit disk) Will do you a
+ * favor and set created_trans = 0 when you've
+ * been checkpointed. returns '1' if the ci is still new. */
+static inline int ocfs2_ci_is_new(struct ocfs2_caching_info *ci)
 {
 	int ret;
+	struct ocfs2_journal *journal =
+		OCFS2_SB(ocfs2_metadata_cache_get_super(ci))->journal;
 
+	spin_lock(&trans_inc_lock);
+	ret = !(time_after(journal->j_trans_id, ci->ci_created_trans));
+	if (!ret)
+		ci->ci_created_trans = 0;
+	spin_unlock(&trans_inc_lock);
+	return ret;
+}
+
+/* Wrapper for inodes so we can check system files */
+static inline int ocfs2_inode_is_new(struct inode *inode)
+{
 	/* System files are never "new" as they're written out by
 	 * mkfs. This helps us early during mount, before we have the
 	 * journal open and j_trans_id could be junk. */
 	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
 		return 0;
-	spin_lock(&trans_inc_lock);
-	ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
-			   OCFS2_I(inode)->ip_created_trans));
-	if (!ret)
-		OCFS2_I(inode)->ip_created_trans = 0;
-	spin_unlock(&trans_inc_lock);
-	return ret;
+
+	return ocfs2_ci_is_new(INODE_CACHE(inode));
 }
 
-static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
-				       struct inode *inode)
+static inline void ocfs2_ci_set_new(struct ocfs2_super *osb,
+				    struct ocfs2_caching_info *ci)
 {
 	spin_lock(&trans_inc_lock);
-	OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
+	ci->ci_created_trans = osb->journal->j_trans_id;
 	spin_unlock(&trans_inc_lock);
 }
 
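
The checkpoint test above is a generation-counter comparison: an object records the journal transaction id it was last dirtied in (ci_last_trans), and it is fully checkpointed once the journal's current id has advanced past it; time_after() keeps the compare wraparound-safe. A self-contained demo of the arithmetic (the macro body matches linux/jiffies.h):

#include <assert.h>

#define time_after(a, b) ((long)((b) - (a)) < 0)	/* as in linux/jiffies.h */

int main(void)
{
	unsigned long j_trans_id = 42;		/* journal's current transaction   */
	unsigned long ci_last_trans = 41;	/* object last dirtied in t41      */

	assert(time_after(j_trans_id, ci_last_trans));	  /* checkpointed    */
	assert(!time_after(ci_last_trans, ci_last_trans)); /* same id: still
							    * in the journal  */
	return 0;
}
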
@@ -200,7 +210,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
 	if (ocfs2_mount_local(osb))
 		return;
 
-	if (!ocfs2_inode_fully_checkpointed(inode)) {
+	if (!ocfs2_ci_fully_checkpointed(INODE_CACHE(inode))) {
 		/* WARNING: This only kicks off a single
 		 * checkpoint. If someone races you and adds more
 		 * metadata to the journal, you won't know, and will
@@ -210,7 +220,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
 		ocfs2_start_checkpoint(osb);
 
 		wait_event(osb->journal->j_checkpointed,
-			   ocfs2_inode_fully_checkpointed(inode));
+			   ocfs2_ci_fully_checkpointed(INODE_CACHE(inode)));
 	}
 }
 
@@ -266,31 +276,34 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks);
 
 
 /* ocfs2_inode */
-int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_di(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type);
 /* ocfs2_extent_block */
-int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_eb(handle_t *handle, struct ocfs2_caching_info *ci,
+			    struct buffer_head *bh, int type);
+/* ocfs2_refcount_block */
+int ocfs2_journal_access_rb(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type);
 /* ocfs2_group_desc */
-int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_gd(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type);
 /* ocfs2_xattr_block */
-int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_xb(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type);
 /* quota blocks */
-int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dq(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type);
 /* dirblock */
-int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_db(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type);
 /* ocfs2_dx_root_block */
-int ocfs2_journal_access_dr(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dr(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type);
 /* ocfs2_dx_leaf */
-int ocfs2_journal_access_dl(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access_dl(handle_t *handle, struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh, int type);
 /* Anything that has no ecc */
-int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+int ocfs2_journal_access(handle_t *handle, struct ocfs2_caching_info *ci,
 			 struct buffer_head *bh, int type);
 
 /*
@@ -477,6 +490,23 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
 	return credits;
 }
 
+/* inode update, new refcount block and its allocation credits. */
+#define OCFS2_REFCOUNT_TREE_CREATE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1 \
+					    + OCFS2_SUBALLOC_ALLOC)
+
+/* inode and the refcount block update. */
+#define OCFS2_REFCOUNT_TREE_SET_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/*
+ * inode and the refcount block update.
+ * It doesn't include the credits for sub alloc change.
+ * So if we need to free the bit, OCFS2_SUBALLOC_FREE needs to be added.
+ */
+#define OCFS2_REFCOUNT_TREE_REMOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/* 2 metadata alloc, 2 new blocks and root refcount block */
+#define OCFS2_EXPAND_REFCOUNT_TREE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + 3)
+
 /*
  * Please note that the caller must make sure that root_el is the root
  * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
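
The credit macros above are plain sums of the blocks a transaction may dirty. A hedged example of reserving them; the numeric expansion assumes OCFS2_INODE_UPDATE_CREDITS == 1 and OCFS2_SUBALLOC_ALLOC == 3, which should be verified against the definitions earlier in this header:

/* Illustrative only: reserves OCFS2_REFCOUNT_TREE_CREATE_CREDITS blocks,
 * i.e. inode update + new refcount block + suballocator metadata
 * (1 + 1 + 3 = 5 under the assumed values above). */
static handle_t *example_start_refcount_create(struct ocfs2_super *osb)
{
	return ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
}
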
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index bac7e6abaf47..ac10f83edb95 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -297,8 +297,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	}
 	memcpy(alloc_copy, alloc, bh->b_size);
 
-	status = ocfs2_journal_access_di(handle, local_alloc_inode, bh,
-					 OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(local_alloc_inode),
+					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_commit;
@@ -392,7 +392,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 	ocfs2_clear_local_alloc(alloc);
 
 	ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
-	status = ocfs2_write_block(osb, alloc_bh, inode);
+	status = ocfs2_write_block(osb, alloc_bh, INODE_CACHE(inode));
 	if (status < 0)
 		mlog_errno(status);
 
@@ -678,7 +678,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 	 * delete bits from it! */
 	*num_bits = bits_wanted;
 
-	status = ocfs2_journal_access_di(handle, local_alloc_inode,
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(local_alloc_inode),
 					 osb->local_alloc_bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
@@ -1156,7 +1157,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 	}
 	memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
 
-	status = ocfs2_journal_access_di(handle, local_alloc_inode,
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(local_alloc_inode),
 					 osb->local_alloc_bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8601f934010b..f010b22b1c44 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -69,7 +69,6 @@
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode *dir,
 			      struct inode *inode,
-			      struct dentry *dentry,
 			      dev_t dev,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head *parent_fe_bh,
@@ -78,7 +77,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 
 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
 				    struct inode **ret_orphan_dir,
-				    struct inode *inode,
+				    u64 blkno,
 				    char *name,
 				    struct ocfs2_dir_lookup_result *lookup);
 
@@ -358,8 +357,12 @@ static int ocfs2_mknod(struct inode *dir,
 	}
 	did_quota_inode = 1;
 
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
+		   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
+		   dentry->d_name.name);
+
 	/* do the real work now. */
-	status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
+	status = ocfs2_mknod_locked(osb, dir, inode, dev,
 				    &new_fe_bh, parent_fe_bh, handle,
 				    inode_ac);
 	if (status < 0) {
@@ -375,7 +378,8 @@ static int ocfs2_mknod(struct inode *dir,
375 goto leave; 378 goto leave;
376 } 379 }
377 380
378 status = ocfs2_journal_access_di(handle, dir, parent_fe_bh, 381 status = ocfs2_journal_access_di(handle, INODE_CACHE(dir),
382 parent_fe_bh,
379 OCFS2_JOURNAL_ACCESS_WRITE); 383 OCFS2_JOURNAL_ACCESS_WRITE);
380 if (status < 0) { 384 if (status < 0) {
381 mlog_errno(status); 385 mlog_errno(status);
@@ -465,7 +469,6 @@ leave:
465static int ocfs2_mknod_locked(struct ocfs2_super *osb, 469static int ocfs2_mknod_locked(struct ocfs2_super *osb,
466 struct inode *dir, 470 struct inode *dir,
467 struct inode *inode, 471 struct inode *inode,
468 struct dentry *dentry,
469 dev_t dev, 472 dev_t dev,
470 struct buffer_head **new_fe_bh, 473 struct buffer_head **new_fe_bh,
471 struct buffer_head *parent_fe_bh, 474 struct buffer_head *parent_fe_bh,
@@ -479,10 +482,6 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
479 u16 suballoc_bit; 482 u16 suballoc_bit;
480 u16 feat; 483 u16 feat;
481 484
482 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
483 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
484 dentry->d_name.name);
485
486 *new_fe_bh = NULL; 485 *new_fe_bh = NULL;
487 486
488 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh, 487 status = ocfs2_claim_new_inode(osb, handle, dir, parent_fe_bh,
@@ -507,9 +506,10 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
507 mlog_errno(status); 506 mlog_errno(status);
508 goto leave; 507 goto leave;
509 } 508 }
510 ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); 509 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), *new_fe_bh);
511 510
512 status = ocfs2_journal_access_di(handle, inode, *new_fe_bh, 511 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
512 *new_fe_bh,
513 OCFS2_JOURNAL_ACCESS_CREATE); 513 OCFS2_JOURNAL_ACCESS_CREATE);
514 if (status < 0) { 514 if (status < 0) {
515 mlog_errno(status); 515 mlog_errno(status);
@@ -565,7 +565,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
565 } 565 }
566 566
567 ocfs2_populate_inode(inode, fe, 1); 567 ocfs2_populate_inode(inode, fe, 1);
568 ocfs2_inode_set_new(osb, inode); 568 ocfs2_ci_set_new(osb, INODE_CACHE(inode));
569 if (!ocfs2_mount_local(osb)) { 569 if (!ocfs2_mount_local(osb)) {
570 status = ocfs2_create_new_inode_locks(inode); 570 status = ocfs2_create_new_inode_locks(inode);
571 if (status < 0) 571 if (status < 0)
@@ -682,7 +682,7 @@ static int ocfs2_link(struct dentry *old_dentry,
682 goto out_unlock_inode; 682 goto out_unlock_inode;
683 } 683 }
684 684
685 err = ocfs2_journal_access_di(handle, inode, fe_bh, 685 err = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
686 OCFS2_JOURNAL_ACCESS_WRITE); 686 OCFS2_JOURNAL_ACCESS_WRITE);
687 if (err < 0) { 687 if (err < 0) {
688 mlog_errno(err); 688 mlog_errno(err);
@@ -850,7 +850,8 @@ static int ocfs2_unlink(struct inode *dir,
850 } 850 }
851 851
852 if (inode_is_unlinkable(inode)) { 852 if (inode_is_unlinkable(inode)) {
853 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, inode, 853 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
854 OCFS2_I(inode)->ip_blkno,
854 orphan_name, &orphan_insert); 855 orphan_name, &orphan_insert);
855 if (status < 0) { 856 if (status < 0) {
856 mlog_errno(status); 857 mlog_errno(status);
@@ -866,7 +867,7 @@ static int ocfs2_unlink(struct inode *dir,
866 goto leave; 867 goto leave;
867 } 868 }
868 869
869 status = ocfs2_journal_access_di(handle, inode, fe_bh, 870 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), fe_bh,
870 OCFS2_JOURNAL_ACCESS_WRITE); 871 OCFS2_JOURNAL_ACCESS_WRITE);
871 if (status < 0) { 872 if (status < 0) {
872 mlog_errno(status); 873 mlog_errno(status);
@@ -1241,9 +1242,8 @@ static int ocfs2_rename(struct inode *old_dir,
1241 1242
1242 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { 1243 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
1243 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 1244 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
1244 new_inode, 1245 OCFS2_I(new_inode)->ip_blkno,
1245 orphan_name, 1246 orphan_name, &orphan_insert);
1246 &orphan_insert);
1247 if (status < 0) { 1247 if (status < 0) {
1248 mlog_errno(status); 1248 mlog_errno(status);
1249 goto bail; 1249 goto bail;
@@ -1284,7 +1284,8 @@ static int ocfs2_rename(struct inode *old_dir,
1284 goto bail; 1284 goto bail;
1285 } 1285 }
1286 } 1286 }
1287 status = ocfs2_journal_access_di(handle, new_inode, newfe_bh, 1287 status = ocfs2_journal_access_di(handle, INODE_CACHE(new_inode),
1288 newfe_bh,
1288 OCFS2_JOURNAL_ACCESS_WRITE); 1289 OCFS2_JOURNAL_ACCESS_WRITE);
1289 if (status < 0) { 1290 if (status < 0) {
1290 mlog_errno(status); 1291 mlog_errno(status);
@@ -1331,7 +1332,8 @@ static int ocfs2_rename(struct inode *old_dir,
1331 old_inode->i_ctime = CURRENT_TIME; 1332 old_inode->i_ctime = CURRENT_TIME;
1332 mark_inode_dirty(old_inode); 1333 mark_inode_dirty(old_inode);
1333 1334
1334 status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh, 1335 status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode),
1336 old_inode_bh,
1335 OCFS2_JOURNAL_ACCESS_WRITE); 1337 OCFS2_JOURNAL_ACCESS_WRITE);
1336 if (status >= 0) { 1338 if (status >= 0) {
1337 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; 1339 old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
@@ -1407,9 +1409,10 @@ static int ocfs2_rename(struct inode *old_dir,
1407 (int)old_dir_nlink, old_dir->i_nlink); 1409 (int)old_dir_nlink, old_dir->i_nlink);
1408 } else { 1410 } else {
1409 struct ocfs2_dinode *fe; 1411 struct ocfs2_dinode *fe;
1410 status = ocfs2_journal_access_di(handle, old_dir, 1412 status = ocfs2_journal_access_di(handle,
1411 old_dir_bh, 1413 INODE_CACHE(old_dir),
1412 OCFS2_JOURNAL_ACCESS_WRITE); 1414 old_dir_bh,
1415 OCFS2_JOURNAL_ACCESS_WRITE);
1413 fe = (struct ocfs2_dinode *) old_dir_bh->b_data; 1416 fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
1414 ocfs2_set_links_count(fe, old_dir->i_nlink); 1417 ocfs2_set_links_count(fe, old_dir->i_nlink);
1415 status = ocfs2_journal_dirty(handle, old_dir_bh); 1418 status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1527,9 +1530,11 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1527 mlog_errno(status); 1530 mlog_errno(status);
1528 goto bail; 1531 goto bail;
1529 } 1532 }
1530 ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]); 1533 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode),
1534 bhs[virtual]);
1531 1535
1532 status = ocfs2_journal_access(handle, inode, bhs[virtual], 1536 status = ocfs2_journal_access(handle, INODE_CACHE(inode),
1537 bhs[virtual],
1533 OCFS2_JOURNAL_ACCESS_CREATE); 1538 OCFS2_JOURNAL_ACCESS_CREATE);
1534 if (status < 0) { 1539 if (status < 0) {
1535 mlog_errno(status); 1540 mlog_errno(status);
@@ -1692,7 +1697,11 @@ static int ocfs2_symlink(struct inode *dir,
1692 } 1697 }
1693 did_quota_inode = 1; 1698 did_quota_inode = 1;
1694 1699
1695 status = ocfs2_mknod_locked(osb, dir, inode, dentry, 1700 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
1701 inode->i_mode, dentry->d_name.len,
1702 dentry->d_name.name);
1703
1704 status = ocfs2_mknod_locked(osb, dir, inode,
1696 0, &new_fe_bh, parent_fe_bh, handle, 1705 0, &new_fe_bh, parent_fe_bh, handle,
1697 inode_ac); 1706 inode_ac);
1698 if (status < 0) { 1707 if (status < 0) {
@@ -1842,7 +1851,7 @@ bail:
1842 1851
1843static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, 1852static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1844 struct inode **ret_orphan_dir, 1853 struct inode **ret_orphan_dir,
1845 struct inode *inode, 1854 u64 blkno,
1846 char *name, 1855 char *name,
1847 struct ocfs2_dir_lookup_result *lookup) 1856 struct ocfs2_dir_lookup_result *lookup)
1848{ 1857{
@@ -1850,7 +1859,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
1850 struct buffer_head *orphan_dir_bh = NULL; 1859 struct buffer_head *orphan_dir_bh = NULL;
1851 int status = 0; 1860 int status = 0;
1852 1861
1853 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); 1862 status = ocfs2_blkno_stringify(blkno, name);
1854 if (status < 0) { 1863 if (status < 0) {
1855 mlog_errno(status); 1864 mlog_errno(status);
1856 return status; 1865 return status;
@@ -1917,7 +1926,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1917 goto leave; 1926 goto leave;
1918 } 1927 }
1919 1928
1920 status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh, 1929 status = ocfs2_journal_access_di(handle,
1930 INODE_CACHE(orphan_dir_inode),
1931 orphan_dir_bh,
1921 OCFS2_JOURNAL_ACCESS_WRITE); 1932 OCFS2_JOURNAL_ACCESS_WRITE);
1922 if (status < 0) { 1933 if (status < 0) {
1923 mlog_errno(status); 1934 mlog_errno(status);
@@ -2002,7 +2013,9 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2002 goto leave; 2013 goto leave;
2003 } 2014 }
2004 2015
2005 status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh, 2016 status = ocfs2_journal_access_di(handle,
2017 INODE_CACHE(orphan_dir_inode),
2018 orphan_dir_bh,
2006 OCFS2_JOURNAL_ACCESS_WRITE); 2019 OCFS2_JOURNAL_ACCESS_WRITE);
2007 if (status < 0) { 2020 if (status < 0) {
2008 mlog_errno(status); 2021 mlog_errno(status);
@@ -2028,6 +2041,274 @@ leave:
2028 return status; 2041 return status;
2029} 2042}
2030 2043
2044int ocfs2_create_inode_in_orphan(struct inode *dir,
2045 int mode,
2046 struct inode **new_inode)
2047{
2048 int status, did_quota_inode = 0;
2049 struct inode *inode = NULL;
2050 struct inode *orphan_dir = NULL;
2051 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2052 struct ocfs2_dinode *di = NULL;
2053 handle_t *handle = NULL;
2054 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
2055 struct buffer_head *parent_di_bh = NULL;
2056 struct buffer_head *new_di_bh = NULL;
2057 struct ocfs2_alloc_context *inode_ac = NULL;
2058 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
2059
2060 status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
2061 if (status < 0) {
2062 if (status != -ENOENT)
2063 mlog_errno(status);
2064 return status;
2065 }
2066
2067 /*
2068 * We give the orphan dir the root blkno to fake an orphan name,
2069 * and allocate enough space for our insertion.
2070 */
2071 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
2072 osb->root_blkno,
2073 orphan_name, &orphan_insert);
2074 if (status < 0) {
2075 mlog_errno(status);
2076 goto leave;
2077 }
2078
2079 /* reserve an inode spot */
2080 status = ocfs2_reserve_new_inode(osb, &inode_ac);
2081 if (status < 0) {
2082 if (status != -ENOSPC)
2083 mlog_errno(status);
2084 goto leave;
2085 }
2086
2087 inode = ocfs2_get_init_inode(dir, mode);
2088 if (!inode) {
2089 status = -ENOMEM;
2090 mlog_errno(status);
2091 goto leave;
2092 }
2093
2094 handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, 0, 0));
2095 if (IS_ERR(handle)) {
2096 status = PTR_ERR(handle);
2097 handle = NULL;
2098 mlog_errno(status);
2099 goto leave;
2100 }
2101
2102	/* We don't use the standard VFS wrapper because we don't want
2103	 * vfs_dq_init to be called. */
2104 if (sb_any_quota_active(osb->sb) &&
2105 osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
2106 status = -EDQUOT;
2107 goto leave;
2108 }
2109 did_quota_inode = 1;
2110
2111 /* do the real work now. */
2112 status = ocfs2_mknod_locked(osb, dir, inode,
2113 0, &new_di_bh, parent_di_bh, handle,
2114 inode_ac);
2115 if (status < 0) {
2116 mlog_errno(status);
2117 goto leave;
2118 }
2119
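	/*
	 * Note: the orphan dir slot was reserved above with a name faked
	 * from the root blkno; now that the inode has a real block number,
	 * regenerate the orphan name before the actual insert below.
	 */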
2120 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name);
2121 if (status < 0) {
2122 mlog_errno(status);
2123 goto leave;
2124 }
2125
2126 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2127 status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name,
2128 &orphan_insert, orphan_dir);
2129 if (status < 0) {
2130 mlog_errno(status);
2131 goto leave;
2132 }
2133
2134	/* get open lock so that other nodes can't remove it from the orphan dir. */
2135 status = ocfs2_open_lock(inode);
2136 if (status < 0)
2137 mlog_errno(status);
2138
2139leave:
2140 if (status < 0 && did_quota_inode)
2141 vfs_dq_free_inode(inode);
2142 if (handle)
2143 ocfs2_commit_trans(osb, handle);
2144
2145 if (orphan_dir) {
2146 /* This was locked for us in ocfs2_prepare_orphan_dir() */
2147 ocfs2_inode_unlock(orphan_dir, 1);
2148 mutex_unlock(&orphan_dir->i_mutex);
2149 iput(orphan_dir);
2150 }
2151
2152 if (status == -ENOSPC)
2153 mlog(0, "Disk is full\n");
2154
2155 if ((status < 0) && inode) {
2156 clear_nlink(inode);
2157 iput(inode);
2158 }
2159
2160 if (inode_ac)
2161 ocfs2_free_alloc_context(inode_ac);
2162
2163 brelse(new_di_bh);
2164
2165 if (!status)
2166 *new_inode = inode;
2167
2168 ocfs2_free_dir_lookup_result(&orphan_insert);
2169
2170 ocfs2_inode_unlock(dir, 1);
2171 brelse(parent_di_bh);
2172 return status;
2173}
2174
2175int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2176 struct inode *inode,
2177 struct dentry *dentry)
2178{
2179 int status = 0;
2180 struct buffer_head *parent_di_bh = NULL;
2181 handle_t *handle = NULL;
2182 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2183 struct ocfs2_dinode *dir_di, *di;
2184 struct inode *orphan_dir_inode = NULL;
2185 struct buffer_head *orphan_dir_bh = NULL;
2186 struct buffer_head *di_bh = NULL;
2187 struct ocfs2_dir_lookup_result lookup = { NULL, };
2188
2189	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
2190 dentry->d_name.len, dentry->d_name.name);
2191
2192 status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
2193 if (status < 0) {
2194 if (status != -ENOENT)
2195 mlog_errno(status);
2196 return status;
2197 }
2198
2199 dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data;
2200 if (!dir_di->i_links_count) {
2201 /* can't make a file in a deleted directory. */
2202 status = -ENOENT;
2203 goto leave;
2204 }
2205
2206 status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
2207 dentry->d_name.len);
2208 if (status)
2209 goto leave;
2210
2211 /* get a spot inside the dir. */
2212 status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh,
2213 dentry->d_name.name,
2214 dentry->d_name.len, &lookup);
2215 if (status < 0) {
2216 mlog_errno(status);
2217 goto leave;
2218 }
2219
2220 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2221 ORPHAN_DIR_SYSTEM_INODE,
2222 osb->slot_num);
2223 if (!orphan_dir_inode) {
2224 status = -EEXIST;
2225 mlog_errno(status);
2226 goto leave;
2227 }
2228
2229 mutex_lock(&orphan_dir_inode->i_mutex);
2230
2231 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
2232 if (status < 0) {
2233 mlog_errno(status);
2234 mutex_unlock(&orphan_dir_inode->i_mutex);
2235 iput(orphan_dir_inode);
2236 goto leave;
2237 }
2238
2239 status = ocfs2_read_inode_block(inode, &di_bh);
2240 if (status < 0) {
2241 mlog_errno(status);
2242 goto orphan_unlock;
2243 }
2244
2245 handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
2246 if (IS_ERR(handle)) {
2247 status = PTR_ERR(handle);
2248 handle = NULL;
2249 mlog_errno(status);
2250 goto orphan_unlock;
2251 }
2252
2253 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
2254 di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
2255 if (status < 0) {
2256 mlog_errno(status);
2257 goto out_commit;
2258 }
2259
2260 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
2261 orphan_dir_bh);
2262 if (status < 0) {
2263 mlog_errno(status);
2264 goto out_commit;
2265 }
2266
2267 di = (struct ocfs2_dinode *)di_bh->b_data;
2268 le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
2269 di->i_orphaned_slot = 0;
2270 ocfs2_journal_dirty(handle, di_bh);
2271
2272 status = ocfs2_add_entry(handle, dentry, inode,
2273 OCFS2_I(inode)->ip_blkno, parent_di_bh,
2274 &lookup);
2275 if (status < 0) {
2276 mlog_errno(status);
2277 goto out_commit;
2278 }
2279
2280 status = ocfs2_dentry_attach_lock(dentry, inode,
2281 OCFS2_I(dir)->ip_blkno);
2282 if (status) {
2283 mlog_errno(status);
2284 goto out_commit;
2285 }
2286
2287 insert_inode_hash(inode);
2288 dentry->d_op = &ocfs2_dentry_ops;
2289 d_instantiate(dentry, inode);
2290 status = 0;
2291out_commit:
2292 ocfs2_commit_trans(osb, handle);
2293orphan_unlock:
2294 ocfs2_inode_unlock(orphan_dir_inode, 1);
2295 mutex_unlock(&orphan_dir_inode->i_mutex);
2296 iput(orphan_dir_inode);
2297leave:
2298
2299 ocfs2_inode_unlock(dir, 1);
2300
2301 brelse(di_bh);
2302 brelse(parent_di_bh);
2303 brelse(orphan_dir_bh);
2304
2305 ocfs2_free_dir_lookup_result(&lookup);
2306
2307 mlog_exit(status);
2308
2309 return status;
2310}
2311
2031const struct inode_operations ocfs2_dir_iops = { 2312const struct inode_operations ocfs2_dir_iops = {
2032 .create = ocfs2_create, 2313 .create = ocfs2_create,
2033 .lookup = ocfs2_lookup, 2314 .lookup = ocfs2_lookup,
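
The two functions added above are intended to be used as a pair by the forthcoming reflink code: create the target inode directly in the orphan directory so a crash mid-operation leaves no half-linked file, populate it, then move it to its final name. A minimal sketch of such a caller follows; the helper name example_reflink and the extent-sharing placeholder are illustrative, not part of this patch, and error handling and cluster locking are elided.

	static int example_reflink(struct inode *dir, struct dentry *new_dentry,
				   struct inode *src)
	{
		struct inode *new_inode = NULL;
		int ret;

		/* 1. Allocate the new inode in the orphan dir. */
		ret = ocfs2_create_inode_in_orphan(dir, src->i_mode, &new_inode);
		if (ret)
			return ret;

		/* 2. Share extents / copy attributes here (placeholder). */

		/* 3. Move the fully built inode out of the orphan dir and
		 *    attach it to its final dentry; on success the dentry
		 *    consumes our inode reference via d_instantiate(). */
		ret = ocfs2_mv_orphaned_inode_to_new(dir, new_inode, new_dentry);
		if (ret)
			iput(new_inode);
		return ret;
	}
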
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index 688aef64c879..e5d059d4f115 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -35,5 +35,11 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
35 struct inode *orphan_dir_inode, 35 struct inode *orphan_dir_inode,
36 struct inode *inode, 36 struct inode *inode,
37 struct buffer_head *orphan_dir_bh); 37 struct buffer_head *orphan_dir_bh);
38int ocfs2_create_inode_in_orphan(struct inode *dir,
39 int mode,
40 struct inode **new_inode);
41int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
42 struct inode *new_inode,
43 struct dentry *new_dentry);
38 44
39#endif /* OCFS2_NAMEI_H */ 45#endif /* OCFS2_NAMEI_H */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 39e1d5a39505..eae404602424 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -51,20 +51,51 @@
51/* For struct ocfs2_blockcheck_stats */ 51/* For struct ocfs2_blockcheck_stats */
52#include "blockcheck.h" 52#include "blockcheck.h"
53 53
54
55/* Caching of metadata buffers */
56
54/* Most user visible OCFS2 inodes will have very few pieces of 57/* Most user visible OCFS2 inodes will have very few pieces of
55 * metadata, but larger files (including bitmaps, etc) must be taken 58 * metadata, but larger files (including bitmaps, etc) must be taken
56 * into account when designing an access scheme. We allow a small 59 * into account when designing an access scheme. We allow a small
57 * amount of inlined blocks to be stored on an array and grow the 60 * amount of inlined blocks to be stored on an array and grow the
58 * structure into a rb tree when necessary. */ 61 * structure into a rb tree when necessary. */
59#define OCFS2_INODE_MAX_CACHE_ARRAY 2 62#define OCFS2_CACHE_INFO_MAX_ARRAY 2
63
64/* Flags for ocfs2_caching_info */
65
66enum ocfs2_caching_info_flags {
67 /* Indicates that the metadata cache is using the inline array */
68 OCFS2_CACHE_FL_INLINE = 1<<1,
69};
60 70
71struct ocfs2_caching_operations;
61struct ocfs2_caching_info { 72struct ocfs2_caching_info {
73 /*
74 * The parent structure provides the locks, but because the
75 * parent structure can differ, it provides locking operations
76 * to struct ocfs2_caching_info.
77 */
78 const struct ocfs2_caching_operations *ci_ops;
79
80 /* next two are protected by trans_inc_lock */
81 /* which transaction were we created on? Zero if none. */
82 unsigned long ci_created_trans;
83 /* last transaction we were a part of. */
84 unsigned long ci_last_trans;
85
86 /* Cache structures */
87 unsigned int ci_flags;
62 unsigned int ci_num_cached; 88 unsigned int ci_num_cached;
63 union { 89 union {
64 sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY]; 90 sector_t ci_array[OCFS2_CACHE_INFO_MAX_ARRAY];
65 struct rb_root ci_tree; 91 struct rb_root ci_tree;
66 } ci_cache; 92 } ci_cache;
67}; 93};
94/*
95 * Need this prototype here instead of in uptodate.h because journal.h
96 * uses it.
97 */
98struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci);
68 99
69/* this limits us to 256 nodes 100/* this limits us to 256 nodes
70 * if we need more, we can do a kmalloc for the map */ 101 * if we need more, we can do a kmalloc for the map */
@@ -377,12 +408,17 @@ struct ocfs2_super
377 408
378 /* the group we used to allocate inodes. */ 409 /* the group we used to allocate inodes. */
379 u64 osb_inode_alloc_group; 410 u64 osb_inode_alloc_group;
411
412 /* rb tree root for refcount lock. */
413 struct rb_root osb_rf_lock_tree;
414 struct ocfs2_refcount_tree *osb_ref_tree_lru;
380}; 415};
381 416
382#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 417#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
383 418
384/* Useful typedef for passing around journal access functions */ 419/* Useful typedef for passing around journal access functions */
385typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode, 420typedef int (*ocfs2_journal_access_func)(handle_t *handle,
421 struct ocfs2_caching_info *ci,
386 struct buffer_head *bh, int type); 422 struct buffer_head *bh, int type);
387 423
388static inline int ocfs2_should_order_data(struct inode *inode) 424static inline int ocfs2_should_order_data(struct inode *inode)
@@ -480,6 +516,13 @@ static inline void ocfs2_add_links_count(struct ocfs2_dinode *di, int n)
480 ocfs2_set_links_count(di, links); 516 ocfs2_set_links_count(di, links);
481} 517}
482 518
519static inline int ocfs2_refcount_tree(struct ocfs2_super *osb)
520{
521 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
522 return 1;
523 return 0;
524}
525
483/* set / clear functions because cluster events can make these happen 526/* set / clear functions because cluster events can make these happen
484 * in parallel so we want the transitions to be atomic. this also 527 * in parallel so we want the transitions to be atomic. this also
485 * means that any future flags osb_flags must be protected by spinlock 528 * means that any future flags osb_flags must be protected by spinlock
@@ -578,6 +621,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
578#define OCFS2_IS_VALID_DX_LEAF(ptr) \ 621#define OCFS2_IS_VALID_DX_LEAF(ptr) \
579 (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE)) 622 (!strcmp((ptr)->dl_signature, OCFS2_DX_LEAF_SIGNATURE))
580 623
624#define OCFS2_IS_VALID_REFCOUNT_BLOCK(ptr) \
625 (!strcmp((ptr)->rf_signature, OCFS2_REFCOUNT_BLOCK_SIGNATURE))
626
581static inline unsigned long ino_from_blkno(struct super_block *sb, 627static inline unsigned long ino_from_blkno(struct super_block *sb,
582 u64 blkno) 628 u64 blkno)
583{ 629{
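
The new signature of ocfs2_journal_access_func is the core of this patch: journalling now keys off a metadata cache rather than an inode, so any cache owner (an inode or a refcount tree) can go through the same path. A condensed before/after sketch, assuming INODE_CACHE() (defined elsewhere in the tree, not in this hunk) returns the caching info embedded in an ocfs2 inode:

	/* before: the journal code took the inode itself */
	status = ocfs2_journal_access_di(handle, inode, bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);

	/* after: only the metadata cache is passed in */
	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);

The ocfs2_refcount_tree() helper above gives callers a one-line feature guard; a hypothetical use:

	if (!ocfs2_refcount_tree(osb))
		return -EOPNOTSUPP;	/* no INCOMPAT_REFCOUNT_TREE bit */
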
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 7ab6e9e5e77c..e9431e4a5e7c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -68,6 +68,7 @@
68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" 68#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
69#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01" 69#define OCFS2_DX_ROOT_SIGNATURE "DXDIR01"
70#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1" 70#define OCFS2_DX_LEAF_SIGNATURE "DXLEAF1"
71#define OCFS2_REFCOUNT_BLOCK_SIGNATURE "REFCNT1"
71 72
72/* Compatibility flags */ 73/* Compatibility flags */
73#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ 74#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -98,7 +99,8 @@
98 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ 99 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
99 | OCFS2_FEATURE_INCOMPAT_XATTR \ 100 | OCFS2_FEATURE_INCOMPAT_XATTR \
100 | OCFS2_FEATURE_INCOMPAT_META_ECC \ 101 | OCFS2_FEATURE_INCOMPAT_META_ECC \
101 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS) 102 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
103 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE)
102#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 104#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
103 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 105 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
104 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 106 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
@@ -160,6 +162,9 @@
160/* Metadata checksum and error correction */ 162/* Metadata checksum and error correction */
161#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 163#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
162 164
165/* Refcount tree support */
166#define OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE 0x1000
167
163/* 168/*
164 * backup superblock flag is used to indicate that this volume 169 * backup superblock flag is used to indicate that this volume
165 * has backup superblocks. 170 * has backup superblocks.
@@ -223,6 +228,7 @@
223#define OCFS2_HAS_XATTR_FL (0x0002) 228#define OCFS2_HAS_XATTR_FL (0x0002)
224#define OCFS2_INLINE_XATTR_FL (0x0004) 229#define OCFS2_INLINE_XATTR_FL (0x0004)
225#define OCFS2_INDEXED_DIR_FL (0x0008) 230#define OCFS2_INDEXED_DIR_FL (0x0008)
231#define OCFS2_HAS_REFCOUNT_FL (0x0010)
226 232
227/* Inode attributes, keep in sync with EXT2 */ 233/* Inode attributes, keep in sync with EXT2 */
228#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */ 234#define OCFS2_SECRM_FL (0x00000001) /* Secure deletion */
@@ -241,8 +247,11 @@
241/* 247/*
242 * Extent record flags (e_node.leaf.flags) 248 * Extent record flags (e_node.leaf.flags)
243 */ 249 */
244#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but 250#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but
245 * unwritten */ 251 * unwritten */
252#define OCFS2_EXT_REFCOUNTED (0x02) /* Extent is reference
253 * counted in an associated
254 * refcount tree */
246 255
247/* 256/*
248 * ioctl commands 257 * ioctl commands
@@ -292,6 +301,15 @@ struct ocfs2_new_group_input {
292#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input) 301#define OCFS2_IOC_GROUP_ADD _IOW('o', 2,struct ocfs2_new_group_input)
293#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input) 302#define OCFS2_IOC_GROUP_ADD64 _IOW('o', 3,struct ocfs2_new_group_input)
294 303
304/* Used to pass 2 file names to reflink. */
305struct reflink_arguments {
306 __u64 old_path;
307 __u64 new_path;
308 __u64 preserve;
309};
310#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)
311
312
295/* 313/*
296 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) 314 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
297 */ 315 */
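
For reference, OCFS2_IOC_REFLINK is driven from user space with two path pointers packed into __u64 fields plus a preserve flag. A minimal user-space sketch; the meaning of preserve (keep attributes on the clone) and the choice of fd are assumptions here, not spelled out by this hunk:

	#include <stdint.h>
	#include <sys/ioctl.h>

	/* mirrors the kernel's struct reflink_arguments above */
	struct reflink_arguments {
		uint64_t old_path;	/* pointer to source path string */
		uint64_t new_path;	/* pointer to destination path string */
		uint64_t preserve;	/* nonzero: preserve attributes (assumed) */
	};
	#define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments)

	static int do_reflink(int fd, const char *src, const char *dst)
	{
		struct reflink_arguments args = {
			.old_path = (uint64_t)(uintptr_t)src,
			.new_path = (uint64_t)(uintptr_t)dst,
			.preserve = 1,
		};

		/* fd: any open file on the ocfs2 mount (assumption) */
		return ioctl(fd, OCFS2_IOC_REFLINK, &args);
	}
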
@@ -717,7 +735,8 @@ struct ocfs2_dinode {
717 __le64 i_xattr_loc; 735 __le64 i_xattr_loc;
718/*80*/ struct ocfs2_block_check i_check; /* Error checking */ 736/*80*/ struct ocfs2_block_check i_check; /* Error checking */
719/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */ 737/*88*/ __le64 i_dx_root; /* Pointer to dir index root block */
720 __le64 i_reserved2[5]; 738/*90*/ __le64 i_refcount_loc;
739 __le64 i_reserved2[4];
721/*B8*/ union { 740/*B8*/ union {
722 __le64 i_pad1; /* Generic way to refer to this 741 __le64 i_pad1; /* Generic way to refer to this
723 64bit union */ 742 64bit union */
@@ -901,6 +920,60 @@ struct ocfs2_group_desc
901/*40*/ __u8 bg_bitmap[0]; 920/*40*/ __u8 bg_bitmap[0];
902}; 921};
903 922
923struct ocfs2_refcount_rec {
924/*00*/ __le64 r_cpos; /* Physical offset, in clusters */
925 __le32 r_clusters; /* Clusters covered by this extent */
926 __le32 r_refcount; /* Reference count of this extent */
927/*10*/
928};
929#define OCFS2_32BIT_POS_MASK (0xffffffffULL)
930
931#define OCFS2_REFCOUNT_LEAF_FL (0x00000001)
932#define OCFS2_REFCOUNT_TREE_FL (0x00000002)
933
934struct ocfs2_refcount_list {
935/*00*/ __le16 rl_count; /* Maximum number of entries possible
936					   in rl_recs */
937 __le16 rl_used; /* Current number of used records */
938 __le32 rl_reserved2;
939	__le64 rl_reserved1;	/* Pad to sizeof(ocfs2_refcount_rec) */
940/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */
941};
942
943
944struct ocfs2_refcount_block {
945/*00*/ __u8 rf_signature[8]; /* Signature for verification */
946 __le16 rf_suballoc_slot; /* Slot suballocator this block
947 belongs to */
948 __le16 rf_suballoc_bit; /* Bit offset in suballocator
949 block group */
950 __le32 rf_fs_generation; /* Must match superblock */
951/*10*/ __le64 rf_blkno; /* Offset on disk, in blocks */
952 __le64 rf_parent; /* Parent block, only valid if
953 OCFS2_REFCOUNT_LEAF_FL is set in
954 rf_flags */
955/*20*/ struct ocfs2_block_check rf_check; /* Error checking */
956 __le64 rf_last_eb_blk; /* Pointer to last extent block */
957/*30*/ __le32 rf_count; /* Number of inodes sharing this
958 refcount tree */
959 __le32 rf_flags; /* See the flags above */
960 __le32 rf_clusters; /* clusters covered by refcount tree. */
961 __le32 rf_cpos; /* cluster offset in refcount tree.*/
962/*40*/	__le32 rf_generation;	/* generation number. Must be the same
963					 * for all blocks of a refcount tree. */
964 __le32 rf_reserved0;
965 __le64 rf_reserved1[7];
966/*80*/ union {
967 struct ocfs2_refcount_list rf_records; /* List of refcount
968 records */
969 struct ocfs2_extent_list rf_list; /* Extent record list,
970 only valid if
971 OCFS2_REFCOUNT_TREE_FL
972 is set in rf_flags */
973 };
974/* Actual on-disk size is one block */
975};
976
904/* 977/*
905 * On disk extended attribute structure for OCFS2. 978 * On disk extended attribute structure for OCFS2.
906 */ 979 */
@@ -1312,6 +1385,32 @@ static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
1312 1385
1313 return size / sizeof(struct ocfs2_extent_rec); 1386 return size / sizeof(struct ocfs2_extent_rec);
1314} 1387}
1388
1389static inline u16 ocfs2_extent_recs_per_rb(struct super_block *sb)
1390{
1391 int size;
1392
1393 size = sb->s_blocksize -
1394 offsetof(struct ocfs2_refcount_block, rf_list.l_recs);
1395
1396 return size / sizeof(struct ocfs2_extent_rec);
1397}
1398
1399static inline u16 ocfs2_refcount_recs_per_rb(struct super_block *sb)
1400{
1401 int size;
1402
1403 size = sb->s_blocksize -
1404 offsetof(struct ocfs2_refcount_block, rf_records.rl_recs);
1405
1406 return size / sizeof(struct ocfs2_refcount_rec);
1407}
1408
1409static inline u32
1410ocfs2_get_ref_rec_low_cpos(const struct ocfs2_refcount_rec *rec)
1411{
1412 return le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
1413}
1315#else 1414#else
1316static inline int ocfs2_fast_symlink_chars(int blocksize) 1415static inline int ocfs2_fast_symlink_chars(int blocksize)
1317{ 1416{
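
Worked example for the helpers above: rf_records.rl_recs starts at byte offset 0x90 (the union sits at 0x80 and the struct ocfs2_refcount_list header is 0x10 bytes), and each ocfs2_refcount_rec is 16 bytes, so on a volume with 4 KB blocks:

	ocfs2_refcount_recs_per_rb(sb)
		= (4096 - 144) / sizeof(struct ocfs2_refcount_rec)
		= 3952 / 16
		= 247 refcount records per block
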
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index c212cf5a2bdf..d277aabf5dfb 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -49,6 +49,7 @@ enum ocfs2_lock_type {
49 OCFS2_LOCK_TYPE_QINFO, 49 OCFS2_LOCK_TYPE_QINFO,
50 OCFS2_LOCK_TYPE_NFS_SYNC, 50 OCFS2_LOCK_TYPE_NFS_SYNC,
51 OCFS2_LOCK_TYPE_ORPHAN_SCAN, 51 OCFS2_LOCK_TYPE_ORPHAN_SCAN,
52 OCFS2_LOCK_TYPE_REFCOUNT,
52 OCFS2_NUM_LOCK_TYPES 53 OCFS2_NUM_LOCK_TYPES
53}; 54};
54 55
@@ -89,6 +90,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
89 case OCFS2_LOCK_TYPE_ORPHAN_SCAN: 90 case OCFS2_LOCK_TYPE_ORPHAN_SCAN:
90 c = 'P'; 91 c = 'P';
91 break; 92 break;
93 case OCFS2_LOCK_TYPE_REFCOUNT:
94 c = 'T';
95 break;
92 default: 96 default:
93 c = '\0'; 97 c = '\0';
94 } 98 }
@@ -110,6 +114,7 @@ static char *ocfs2_lock_type_strings[] = {
110 [OCFS2_LOCK_TYPE_QINFO] = "Quota", 114 [OCFS2_LOCK_TYPE_QINFO] = "Quota",
111 [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", 115 [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
112 [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", 116 [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
117 [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount",
113}; 118};
114 119
115static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 120static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 3fb96fcd4c81..e5df9d170b0c 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -109,7 +109,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
109int ocfs2_read_quota_block(struct inode *inode, u64 v_block, 109int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
110 struct buffer_head **bh); 110 struct buffer_head **bh);
111 111
112extern struct dquot_operations ocfs2_quota_operations; 112extern const struct dquot_operations ocfs2_quota_operations;
113extern struct quota_format_type ocfs2_quota_format; 113extern struct quota_format_type ocfs2_quota_format;
114 114
115int ocfs2_quota_setup(void); 115int ocfs2_quota_setup(void);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 44f2a5e1d042..b437dc0c4cad 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -154,7 +154,7 @@ static int ocfs2_get_quota_block(struct inode *inode, int block,
154 err = -EIO; 154 err = -EIO;
155 mlog_errno(err); 155 mlog_errno(err);
156 } 156 }
157 return err;; 157 return err;
158} 158}
159 159
160/* Read data from global quotafile - avoid pagecache and such because we cannot 160/* Read data from global quotafile - avoid pagecache and such because we cannot
@@ -253,8 +253,9 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
253 flush_dcache_page(bh->b_page); 253 flush_dcache_page(bh->b_page);
254 set_buffer_uptodate(bh); 254 set_buffer_uptodate(bh);
255 unlock_buffer(bh); 255 unlock_buffer(bh);
256 ocfs2_set_buffer_uptodate(gqinode, bh); 256 ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh);
257 err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type); 257 err = ocfs2_journal_access_dq(handle, INODE_CACHE(gqinode), bh,
258 ja_type);
258 if (err < 0) { 259 if (err < 0) {
259 brelse(bh); 260 brelse(bh);
260 goto out; 261 goto out;
@@ -849,7 +850,7 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
849 kmem_cache_free(ocfs2_dquot_cachep, dquot); 850 kmem_cache_free(ocfs2_dquot_cachep, dquot);
850} 851}
851 852
852struct dquot_operations ocfs2_quota_operations = { 853const struct dquot_operations ocfs2_quota_operations = {
853 .initialize = dquot_initialize, 854 .initialize = dquot_initialize,
854 .drop = dquot_drop, 855 .drop = dquot_drop,
855 .alloc_space = dquot_alloc_space, 856 .alloc_space = dquot_alloc_space,
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index bdb09cb6e1fe..1a2c50a759fa 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -108,7 +108,7 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
108 mlog_errno(status); 108 mlog_errno(status);
109 return status; 109 return status;
110 } 110 }
111 status = ocfs2_journal_access_dq(handle, inode, bh, 111 status = ocfs2_journal_access_dq(handle, INODE_CACHE(inode), bh,
112 OCFS2_JOURNAL_ACCESS_WRITE); 112 OCFS2_JOURNAL_ACCESS_WRITE);
113 if (status < 0) { 113 if (status < 0) {
114 mlog_errno(status); 114 mlog_errno(status);
@@ -510,7 +510,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
510 goto out_commit; 510 goto out_commit;
511 } 511 }
512 /* Release local quota file entry */ 512 /* Release local quota file entry */
513 status = ocfs2_journal_access_dq(handle, lqinode, 513 status = ocfs2_journal_access_dq(handle,
514 INODE_CACHE(lqinode),
514 qbh, OCFS2_JOURNAL_ACCESS_WRITE); 515 qbh, OCFS2_JOURNAL_ACCESS_WRITE);
515 if (status < 0) { 516 if (status < 0) {
516 mlog_errno(status); 517 mlog_errno(status);
@@ -619,7 +620,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
619 mlog_errno(status); 620 mlog_errno(status);
620 goto out_bh; 621 goto out_bh;
621 } 622 }
622 status = ocfs2_journal_access_dq(handle, lqinode, bh, 623 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
624 bh,
623 OCFS2_JOURNAL_ACCESS_WRITE); 625 OCFS2_JOURNAL_ACCESS_WRITE);
624 if (status < 0) { 626 if (status < 0) {
625 mlog_errno(status); 627 mlog_errno(status);
@@ -993,8 +995,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
993 goto out_trans; 995 goto out_trans;
994 } 996 }
995 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; 997 dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
996 ocfs2_set_new_buffer_uptodate(lqinode, bh); 998 ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh);
997 status = ocfs2_journal_access_dq(handle, lqinode, bh, 999 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh,
998 OCFS2_JOURNAL_ACCESS_CREATE); 1000 OCFS2_JOURNAL_ACCESS_CREATE);
999 if (status < 0) { 1001 if (status < 0) {
1000 mlog_errno(status); 1002 mlog_errno(status);
@@ -1027,8 +1029,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
1027 mlog_errno(status); 1029 mlog_errno(status);
1028 goto out_trans; 1030 goto out_trans;
1029 } 1031 }
1030 ocfs2_set_new_buffer_uptodate(lqinode, dbh); 1032 ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), dbh);
1031 status = ocfs2_journal_access_dq(handle, lqinode, dbh, 1033 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), dbh,
1032 OCFS2_JOURNAL_ACCESS_CREATE); 1034 OCFS2_JOURNAL_ACCESS_CREATE);
1033 if (status < 0) { 1035 if (status < 0) {
1034 mlog_errno(status); 1036 mlog_errno(status);
@@ -1131,7 +1133,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1131 mlog_errno(status); 1133 mlog_errno(status);
1132 goto out; 1134 goto out;
1133 } 1135 }
1134 ocfs2_set_new_buffer_uptodate(lqinode, bh); 1136 ocfs2_set_new_buffer_uptodate(INODE_CACHE(lqinode), bh);
1135 1137
1136 /* Local quota info, chunk header and the new block we initialize */ 1138 /* Local quota info, chunk header and the new block we initialize */
1137 handle = ocfs2_start_trans(OCFS2_SB(sb), 1139 handle = ocfs2_start_trans(OCFS2_SB(sb),
@@ -1143,7 +1145,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1143 goto out; 1145 goto out;
1144 } 1146 }
1145 /* Zero created block */ 1147 /* Zero created block */
1146 status = ocfs2_journal_access_dq(handle, lqinode, bh, 1148 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode), bh,
1147 OCFS2_JOURNAL_ACCESS_CREATE); 1149 OCFS2_JOURNAL_ACCESS_CREATE);
1148 if (status < 0) { 1150 if (status < 0) {
1149 mlog_errno(status); 1151 mlog_errno(status);
@@ -1158,7 +1160,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
1158 goto out_trans; 1160 goto out_trans;
1159 } 1161 }
1160 /* Update chunk header */ 1162 /* Update chunk header */
1161 status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, 1163 status = ocfs2_journal_access_dq(handle, INODE_CACHE(lqinode),
1164 chunk->qc_headerbh,
1162 OCFS2_JOURNAL_ACCESS_WRITE); 1165 OCFS2_JOURNAL_ACCESS_WRITE);
1163 if (status < 0) { 1166 if (status < 0) {
1164 mlog_errno(status); 1167 mlog_errno(status);
@@ -1292,7 +1295,8 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
1292 goto out; 1295 goto out;
1293 } 1296 }
1294 1297
1295 status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type], 1298 status = ocfs2_journal_access_dq(handle,
1299 INODE_CACHE(sb_dqopt(sb)->files[type]),
1296 od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); 1300 od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
1297 if (status < 0) { 1301 if (status < 0) {
1298 mlog_errno(status); 1302 mlog_errno(status);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
new file mode 100644
index 000000000000..60287fc56bcb
--- /dev/null
+++ b/fs/ocfs2/refcounttree.c
@@ -0,0 +1,4313 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * refcounttree.c
5 *
6 * Copyright (C) 2009 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17
18#include <linux/sort.h>
19#define MLOG_MASK_PREFIX ML_REFCOUNT
20#include <cluster/masklog.h>
21#include "ocfs2.h"
22#include "inode.h"
23#include "alloc.h"
24#include "suballoc.h"
25#include "journal.h"
26#include "uptodate.h"
27#include "super.h"
28#include "buffer_head_io.h"
29#include "blockcheck.h"
30#include "refcounttree.h"
31#include "sysfile.h"
32#include "dlmglue.h"
33#include "extent_map.h"
34#include "aops.h"
35#include "xattr.h"
36#include "namei.h"
37
38#include <linux/bio.h>
39#include <linux/blkdev.h>
40#include <linux/gfp.h>
41#include <linux/slab.h>
42#include <linux/writeback.h>
43#include <linux/pagevec.h>
44#include <linux/swap.h>
45#include <linux/security.h>
46#include <linux/fsnotify.h>
47#include <linux/quotaops.h>
48#include <linux/namei.h>
49#include <linux/mount.h>
50
51struct ocfs2_cow_context {
52 struct inode *inode;
53 u32 cow_start;
54 u32 cow_len;
55 struct ocfs2_extent_tree data_et;
56 struct ocfs2_refcount_tree *ref_tree;
57 struct buffer_head *ref_root_bh;
58 struct ocfs2_alloc_context *meta_ac;
59 struct ocfs2_alloc_context *data_ac;
60 struct ocfs2_cached_dealloc_ctxt dealloc;
61 void *cow_object;
62 struct ocfs2_post_refcount *post_refcount;
63 int extra_credits;
64 int (*get_clusters)(struct ocfs2_cow_context *context,
65 u32 v_cluster, u32 *p_cluster,
66 u32 *num_clusters,
67 unsigned int *extent_flags);
68 int (*cow_duplicate_clusters)(handle_t *handle,
69 struct ocfs2_cow_context *context,
70 u32 cpos, u32 old_cluster,
71 u32 new_cluster, u32 new_len);
72};
73
74static inline struct ocfs2_refcount_tree *
75cache_info_to_refcount(struct ocfs2_caching_info *ci)
76{
77 return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
78}
79
80static int ocfs2_validate_refcount_block(struct super_block *sb,
81 struct buffer_head *bh)
82{
83 int rc;
84 struct ocfs2_refcount_block *rb =
85 (struct ocfs2_refcount_block *)bh->b_data;
86
87 mlog(0, "Validating refcount block %llu\n",
88 (unsigned long long)bh->b_blocknr);
89
90 BUG_ON(!buffer_uptodate(bh));
91
92 /*
93 * If the ecc fails, we return the error but otherwise
94 * leave the filesystem running. We know any error is
95 * local to this block.
96 */
97 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
98 if (rc) {
99 mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
100 (unsigned long long)bh->b_blocknr);
101 return rc;
102 }
103
104
105 if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
106 ocfs2_error(sb,
107 "Refcount block #%llu has bad signature %.*s",
108 (unsigned long long)bh->b_blocknr, 7,
109 rb->rf_signature);
110 return -EINVAL;
111 }
112
113 if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
114 ocfs2_error(sb,
115 "Refcount block #%llu has an invalid rf_blkno "
116 "of %llu",
117 (unsigned long long)bh->b_blocknr,
118 (unsigned long long)le64_to_cpu(rb->rf_blkno));
119 return -EINVAL;
120 }
121
122 if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
123 ocfs2_error(sb,
124 "Refcount block #%llu has an invalid "
125 "rf_fs_generation of #%u",
126 (unsigned long long)bh->b_blocknr,
127 le32_to_cpu(rb->rf_fs_generation));
128 return -EINVAL;
129 }
130
131 return 0;
132}
133
134static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
135 u64 rb_blkno,
136 struct buffer_head **bh)
137{
138 int rc;
139 struct buffer_head *tmp = *bh;
140
141 rc = ocfs2_read_block(ci, rb_blkno, &tmp,
142 ocfs2_validate_refcount_block);
143
144 /* If ocfs2_read_block() got us a new bh, pass it up. */
145 if (!rc && !*bh)
146 *bh = tmp;
147
148 return rc;
149}
150
151static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
152{
153 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
154
155 return rf->rf_blkno;
156}
157
158static struct super_block *
159ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
160{
161 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
162
163 return rf->rf_sb;
164}
165
166static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
167{
168 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
169
170 spin_lock(&rf->rf_lock);
171}
172
173static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
174{
175 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
176
177 spin_unlock(&rf->rf_lock);
178}
179
180static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
181{
182 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
183
184 mutex_lock(&rf->rf_io_mutex);
185}
186
187static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
188{
189 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
190
191 mutex_unlock(&rf->rf_io_mutex);
192}
193
194static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
195 .co_owner = ocfs2_refcount_cache_owner,
196 .co_get_super = ocfs2_refcount_cache_get_super,
197 .co_cache_lock = ocfs2_refcount_cache_lock,
198 .co_cache_unlock = ocfs2_refcount_cache_unlock,
199 .co_io_lock = ocfs2_refcount_cache_io_lock,
200 .co_io_unlock = ocfs2_refcount_cache_io_unlock,
201};
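/*
 * With these callbacks a refcount tree participates in the generic
 * metadata cache exactly as an inode does: the cache code resolves the
 * owner and super_block through co_owner/co_get_super, and serializes
 * cache updates and block I/O through rf_lock and rf_io_mutex.
 */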
202
203static struct ocfs2_refcount_tree *
204ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
205{
206 struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
207 struct ocfs2_refcount_tree *tree = NULL;
208
209 while (n) {
210 tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
211
212 if (blkno < tree->rf_blkno)
213 n = n->rb_left;
214 else if (blkno > tree->rf_blkno)
215 n = n->rb_right;
216 else
217 return tree;
218 }
219
220 return NULL;
221}
222
223/* osb_lock is already locked. */
224static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
225 struct ocfs2_refcount_tree *new)
226{
227 u64 rf_blkno = new->rf_blkno;
228 struct rb_node *parent = NULL;
229 struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
230 struct ocfs2_refcount_tree *tmp;
231
232 while (*p) {
233 parent = *p;
234
235 tmp = rb_entry(parent, struct ocfs2_refcount_tree,
236 rf_node);
237
238 if (rf_blkno < tmp->rf_blkno)
239 p = &(*p)->rb_left;
240 else if (rf_blkno > tmp->rf_blkno)
241 p = &(*p)->rb_right;
242 else {
243 /* This should never happen! */
244 mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
245 (unsigned long long)rf_blkno);
246 BUG();
247 }
248 }
249
250 rb_link_node(&new->rf_node, parent, p);
251 rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
252}
253
254static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
255{
256 ocfs2_metadata_cache_exit(&tree->rf_ci);
257 ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
258 ocfs2_lock_res_free(&tree->rf_lockres);
259 kfree(tree);
260}
261
262static inline void
263ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
264 struct ocfs2_refcount_tree *tree)
265{
266 rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
267 if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
268 osb->osb_ref_tree_lru = NULL;
269}
270
271static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
272 struct ocfs2_refcount_tree *tree)
273{
274 spin_lock(&osb->osb_lock);
275 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
276 spin_unlock(&osb->osb_lock);
277}
278
279void ocfs2_kref_remove_refcount_tree(struct kref *kref)
280{
281 struct ocfs2_refcount_tree *tree =
282 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
283
284 ocfs2_free_refcount_tree(tree);
285}
286
287static inline void
288ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
289{
290 kref_get(&tree->rf_getcnt);
291}
292
293static inline void
294ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
295{
296 kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
297}
298
299static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
300 struct super_block *sb)
301{
302 ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
303 mutex_init(&new->rf_io_mutex);
304 new->rf_sb = sb;
305 spin_lock_init(&new->rf_lock);
306}
307
308static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
309 struct ocfs2_refcount_tree *new,
310 u64 rf_blkno, u32 generation)
311{
312 init_rwsem(&new->rf_sem);
313 ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
314 rf_blkno, generation);
315}
316
317static struct ocfs2_refcount_tree*
318ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
319{
320 struct ocfs2_refcount_tree *new;
321
322 new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
323 if (!new)
324 return NULL;
325
326 new->rf_blkno = rf_blkno;
327 kref_init(&new->rf_getcnt);
328 ocfs2_init_refcount_tree_ci(new, osb->sb);
329
330 return new;
331}
332
333static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
334 struct ocfs2_refcount_tree **ret_tree)
335{
336 int ret = 0;
337 struct ocfs2_refcount_tree *tree, *new = NULL;
338 struct buffer_head *ref_root_bh = NULL;
339 struct ocfs2_refcount_block *ref_rb;
340
341 spin_lock(&osb->osb_lock);
342 if (osb->osb_ref_tree_lru &&
343 osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
344 tree = osb->osb_ref_tree_lru;
345 else
346 tree = ocfs2_find_refcount_tree(osb, rf_blkno);
347 if (tree)
348 goto out;
349
350 spin_unlock(&osb->osb_lock);
351
352 new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
353 if (!new) {
354 ret = -ENOMEM;
355 mlog_errno(ret);
356 return ret;
357 }
358 /*
359	 * We need the generation to create the refcount tree lock, and since
360	 * it doesn't change while the tree is modified, it is safe to read it
361	 * here without protection.
362	 * We also have to purge the cache after we create the lock, since the
363	 * refcount block may contain stale data. It can only be trusted once
364	 * we hold the refcount lock.
365 */
366 ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
367 if (ret) {
368 mlog_errno(ret);
369 ocfs2_metadata_cache_exit(&new->rf_ci);
370 kfree(new);
371 return ret;
372 }
373
374 ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
375 new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
376 ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
377 new->rf_generation);
378 ocfs2_metadata_cache_purge(&new->rf_ci);
379
380 spin_lock(&osb->osb_lock);
381 tree = ocfs2_find_refcount_tree(osb, rf_blkno);
382 if (tree)
383 goto out;
384
385 ocfs2_insert_refcount_tree(osb, new);
386
387 tree = new;
388 new = NULL;
389
390out:
391 *ret_tree = tree;
392
393 osb->osb_ref_tree_lru = tree;
394
395 spin_unlock(&osb->osb_lock);
396
397 if (new)
398 ocfs2_free_refcount_tree(new);
399
400 brelse(ref_root_bh);
401 return ret;
402}
403
404static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
405{
406 int ret;
407 struct buffer_head *di_bh = NULL;
408 struct ocfs2_dinode *di;
409
410 ret = ocfs2_read_inode_block(inode, &di_bh);
411 if (ret) {
412 mlog_errno(ret);
413 goto out;
414 }
415
416 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
417
418 di = (struct ocfs2_dinode *)di_bh->b_data;
419 *ref_blkno = le64_to_cpu(di->i_refcount_loc);
420 brelse(di_bh);
421out:
422 return ret;
423}
424
425static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
426 struct ocfs2_refcount_tree *tree, int rw)
427{
428 int ret;
429
430 ret = ocfs2_refcount_lock(tree, rw);
431 if (ret) {
432 mlog_errno(ret);
433 goto out;
434 }
435
436 if (rw)
437 down_write(&tree->rf_sem);
438 else
439 down_read(&tree->rf_sem);
440
441out:
442 return ret;
443}
444
445/*
446 * Lock the refcount tree pointed to by ref_blkno and return the tree.
447 * In most cases we lock the tree and then read the refcount block,
448 * so read it here if the caller really needs it.
449 *
450 * If the tree has been re-created by another node, free the
451 * old one and re-create it.
452 */
453int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
454 u64 ref_blkno, int rw,
455 struct ocfs2_refcount_tree **ret_tree,
456 struct buffer_head **ref_bh)
457{
458 int ret, delete_tree = 0;
459 struct ocfs2_refcount_tree *tree = NULL;
460 struct buffer_head *ref_root_bh = NULL;
461 struct ocfs2_refcount_block *rb;
462
463again:
464 ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
465 if (ret) {
466 mlog_errno(ret);
467 return ret;
468 }
469
470 ocfs2_refcount_tree_get(tree);
471
472 ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
473 if (ret) {
474 mlog_errno(ret);
475 ocfs2_refcount_tree_put(tree);
476 goto out;
477 }
478
479 ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
480 &ref_root_bh);
481 if (ret) {
482 mlog_errno(ret);
483 ocfs2_unlock_refcount_tree(osb, tree, rw);
484 ocfs2_refcount_tree_put(tree);
485 goto out;
486 }
487
488 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
489 /*
490 * If the refcount block has been freed and re-created, we may need
491 * to recreate the refcount tree also.
492 *
493 * Here we just remove the tree from the rb-tree, and the last
494 * kref holder will unlock and delete this refcount_tree.
495 * Then we goto "again" and ocfs2_get_refcount_tree will create
496 * the new refcount tree for us.
497 */
498 if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
499 if (!tree->rf_removed) {
500 ocfs2_erase_refcount_tree_from_list(osb, tree);
501 tree->rf_removed = 1;
502 delete_tree = 1;
503 }
504
505 ocfs2_unlock_refcount_tree(osb, tree, rw);
506 /*
507 * We get an extra reference when we create the refcount
508 * tree, so another put will destroy it.
509 */
510 if (delete_tree)
511 ocfs2_refcount_tree_put(tree);
512 brelse(ref_root_bh);
513 ref_root_bh = NULL;
514 goto again;
515 }
516
517 *ret_tree = tree;
518 if (ref_bh) {
519 *ref_bh = ref_root_bh;
520 ref_root_bh = NULL;
521 }
522out:
523 brelse(ref_root_bh);
524 return ret;
525}
526
527int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw,
528 struct ocfs2_refcount_tree **ret_tree,
529 struct buffer_head **ref_bh)
530{
531 int ret;
532 u64 ref_blkno;
533
534 ret = ocfs2_get_refcount_block(inode, &ref_blkno);
535 if (ret) {
536 mlog_errno(ret);
537 return ret;
538 }
539
540 return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno,
541 rw, ret_tree, ref_bh);
542}
543
544void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
545 struct ocfs2_refcount_tree *tree, int rw)
546{
547 if (rw)
548 up_write(&tree->rf_sem);
549 else
550 up_read(&tree->rf_sem);
551
552 ocfs2_refcount_unlock(tree, rw);
553 ocfs2_refcount_tree_put(tree);
554}
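/*
 * Typical pairing, as in ocfs2_set_refcount_tree() below: take the
 * lock with rw=1 for modification, use the root buffer, then unlock,
 * which also drops the kref the lock call took:
 *
 *	ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1,
 *				       &ref_tree, &ref_root_bh);
 *	...modify under the tree lock...
 *	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
 *	brelse(ref_root_bh);
 */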
555
556void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
557{
558 struct rb_node *node;
559 struct ocfs2_refcount_tree *tree;
560 struct rb_root *root = &osb->osb_rf_lock_tree;
561
562 while ((node = rb_last(root)) != NULL) {
563 tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
564
565 mlog(0, "Purge tree %llu\n",
566 (unsigned long long) tree->rf_blkno);
567
568 rb_erase(&tree->rf_node, root);
569 ocfs2_free_refcount_tree(tree);
570 }
571}
572
573/*
574 * Create a refcount tree for an inode.
575 * We assume the inode is already locked by the caller.
576 */
577static int ocfs2_create_refcount_tree(struct inode *inode,
578 struct buffer_head *di_bh)
579{
580 int ret;
581 handle_t *handle = NULL;
582 struct ocfs2_alloc_context *meta_ac = NULL;
583 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
584 struct ocfs2_inode_info *oi = OCFS2_I(inode);
585 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
586 struct buffer_head *new_bh = NULL;
587 struct ocfs2_refcount_block *rb;
588 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
589 u16 suballoc_bit_start;
590 u32 num_got;
591 u64 first_blkno;
592
593 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
594
595 mlog(0, "create tree for inode %lu\n", inode->i_ino);
596
597 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
598 if (ret) {
599 mlog_errno(ret);
600 goto out;
601 }
602
603 handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
604 if (IS_ERR(handle)) {
605 ret = PTR_ERR(handle);
606 mlog_errno(ret);
607 goto out;
608 }
609
610 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
611 OCFS2_JOURNAL_ACCESS_WRITE);
612 if (ret) {
613 mlog_errno(ret);
614 goto out_commit;
615 }
616
617 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
618 &suballoc_bit_start, &num_got,
619 &first_blkno);
620 if (ret) {
621 mlog_errno(ret);
622 goto out_commit;
623 }
624
625 new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
626 if (!new_tree) {
627 ret = -ENOMEM;
628 mlog_errno(ret);
629 goto out_commit;
630 }
631
632 new_bh = sb_getblk(inode->i_sb, first_blkno);
633 ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
634
635 ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
636 OCFS2_JOURNAL_ACCESS_CREATE);
637 if (ret) {
638 mlog_errno(ret);
639 goto out_commit;
640 }
641
642 /* Initialize ocfs2_refcount_block. */
643 rb = (struct ocfs2_refcount_block *)new_bh->b_data;
644 memset(rb, 0, inode->i_sb->s_blocksize);
645 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
646 rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num);
647 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
648 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
649 rb->rf_blkno = cpu_to_le64(first_blkno);
650 rb->rf_count = cpu_to_le32(1);
651 rb->rf_records.rl_count =
652 cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
653 spin_lock(&osb->osb_lock);
654 rb->rf_generation = osb->s_next_generation++;
655 spin_unlock(&osb->osb_lock);
656
657 ocfs2_journal_dirty(handle, new_bh);
658
659 spin_lock(&oi->ip_lock);
660 oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
661 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
662 di->i_refcount_loc = cpu_to_le64(first_blkno);
663 spin_unlock(&oi->ip_lock);
664
665 mlog(0, "created tree for inode %lu, refblock %llu\n",
666 inode->i_ino, (unsigned long long)first_blkno);
667
668 ocfs2_journal_dirty(handle, di_bh);
669
670 /*
671 * We have to init the tree lock here since the lock name is
672 * constructed from the generation number.
673 */
674 new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
675 ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
676 new_tree->rf_generation);
677
678 spin_lock(&osb->osb_lock);
679 tree = ocfs2_find_refcount_tree(osb, first_blkno);
680
681 /*
682 * We've just created a new refcount tree in this block. If
683 * we found a refcount tree on the ocfs2_super, it must be
684 * one we just deleted. We free the old tree before
685 * inserting the new tree.
686 */
687 BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
688 if (tree)
689 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
690 ocfs2_insert_refcount_tree(osb, new_tree);
691 spin_unlock(&osb->osb_lock);
692 new_tree = NULL;
693 if (tree)
694 ocfs2_refcount_tree_put(tree);
695
696out_commit:
697 ocfs2_commit_trans(osb, handle);
698
699out:
700 if (new_tree) {
701 ocfs2_metadata_cache_exit(&new_tree->rf_ci);
702 kfree(new_tree);
703 }
704
705 brelse(new_bh);
706 if (meta_ac)
707 ocfs2_free_alloc_context(meta_ac);
708
709 return ret;
710}
711
712static int ocfs2_set_refcount_tree(struct inode *inode,
713 struct buffer_head *di_bh,
714 u64 refcount_loc)
715{
716 int ret;
717 handle_t *handle = NULL;
718 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
719 struct ocfs2_inode_info *oi = OCFS2_I(inode);
720 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
721 struct buffer_head *ref_root_bh = NULL;
722 struct ocfs2_refcount_block *rb;
723 struct ocfs2_refcount_tree *ref_tree;
724
725 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
726
727 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
728 &ref_tree, &ref_root_bh);
729 if (ret) {
730 mlog_errno(ret);
731 return ret;
732 }
733
734 handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
735 if (IS_ERR(handle)) {
736 ret = PTR_ERR(handle);
737 mlog_errno(ret);
738 goto out;
739 }
740
741 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
742 OCFS2_JOURNAL_ACCESS_WRITE);
743 if (ret) {
744 mlog_errno(ret);
745 goto out_commit;
746 }
747
748 ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
749 OCFS2_JOURNAL_ACCESS_WRITE);
750 if (ret) {
751 mlog_errno(ret);
752 goto out_commit;
753 }
754
755 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
756 le32_add_cpu(&rb->rf_count, 1);
757
758 ocfs2_journal_dirty(handle, ref_root_bh);
759
760 spin_lock(&oi->ip_lock);
761 oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
762 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
763 di->i_refcount_loc = cpu_to_le64(refcount_loc);
764 spin_unlock(&oi->ip_lock);
765 ocfs2_journal_dirty(handle, di_bh);
766
767out_commit:
768 ocfs2_commit_trans(osb, handle);
769out:
770 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
771 brelse(ref_root_bh);
772
773 return ret;
774}
775
776int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
777{
778 int ret, delete_tree = 0;
779 handle_t *handle = NULL;
780 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
781 struct ocfs2_inode_info *oi = OCFS2_I(inode);
782 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
783 struct ocfs2_refcount_block *rb;
784 struct inode *alloc_inode = NULL;
785 struct buffer_head *alloc_bh = NULL;
786 struct buffer_head *blk_bh = NULL;
787 struct ocfs2_refcount_tree *ref_tree;
788 int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
789 u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
790 u16 bit = 0;
791
792 if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
793 return 0;
794
795 BUG_ON(!ref_blkno);
796 ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
797 if (ret) {
798 mlog_errno(ret);
799 return ret;
800 }
801
802 rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
803
804 /*
805 * If we are the last user, we need to free the block,
806 * so lock the allocator ahead of time.
807 */
808 if (le32_to_cpu(rb->rf_count) == 1) {
809 blk = le64_to_cpu(rb->rf_blkno);
810 bit = le16_to_cpu(rb->rf_suballoc_bit);
811 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
812
813 alloc_inode = ocfs2_get_system_file_inode(osb,
814 EXTENT_ALLOC_SYSTEM_INODE,
815 le16_to_cpu(rb->rf_suballoc_slot));
816 if (!alloc_inode) {
817 ret = -ENOMEM;
818 mlog_errno(ret);
819 goto out;
820 }
821 mutex_lock(&alloc_inode->i_mutex);
822
823 ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
824 if (ret) {
825 mlog_errno(ret);
826 goto out_mutex;
827 }
828
829 credits += OCFS2_SUBALLOC_FREE;
830 }
831
832 handle = ocfs2_start_trans(osb, credits);
833 if (IS_ERR(handle)) {
834 ret = PTR_ERR(handle);
835 mlog_errno(ret);
836 goto out_unlock;
837 }
838
839 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
840 OCFS2_JOURNAL_ACCESS_WRITE);
841 if (ret) {
842 mlog_errno(ret);
843 goto out_commit;
844 }
845
846 ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
847 OCFS2_JOURNAL_ACCESS_WRITE);
848 if (ret) {
849 mlog_errno(ret);
850 goto out_commit;
851 }
852
853 spin_lock(&oi->ip_lock);
854 oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
855 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
856 di->i_refcount_loc = 0;
857 spin_unlock(&oi->ip_lock);
858 ocfs2_journal_dirty(handle, di_bh);
859
860 le32_add_cpu(&rb->rf_count, -1);
861 ocfs2_journal_dirty(handle, blk_bh);
862
863 if (!rb->rf_count) {
864 delete_tree = 1;
865 ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
866 ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
867 alloc_bh, bit, bg_blkno, 1);
868 if (ret)
869 mlog_errno(ret);
870 }
871
872out_commit:
873 ocfs2_commit_trans(osb, handle);
874out_unlock:
875 if (alloc_inode) {
876 ocfs2_inode_unlock(alloc_inode, 1);
877 brelse(alloc_bh);
878 }
879out_mutex:
880 if (alloc_inode) {
881 mutex_unlock(&alloc_inode->i_mutex);
882 iput(alloc_inode);
883 }
884out:
885 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
886 if (delete_tree)
887 ocfs2_refcount_tree_put(ref_tree);
888 brelse(blk_bh);
889
890 return ret;
891}
892
893static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
894 struct buffer_head *ref_leaf_bh,
895 u64 cpos, unsigned int len,
896 struct ocfs2_refcount_rec *ret_rec,
897 int *index)
898{
899 int i = 0;
900 struct ocfs2_refcount_block *rb =
901 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
902 struct ocfs2_refcount_rec *rec = NULL;
903
904 for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
905 rec = &rb->rf_records.rl_recs[i];
906
907 if (le64_to_cpu(rec->r_cpos) +
908 le32_to_cpu(rec->r_clusters) <= cpos)
909 continue;
910 else if (le64_to_cpu(rec->r_cpos) > cpos)
911 break;
912
913 /* ok, cpos falls in this rec. Just return. */
914 if (ret_rec)
915 *ret_rec = *rec;
916 goto out;
917 }
918
919 if (ret_rec) {
920 /* We hit a hole here, so fake the rec. */
921 ret_rec->r_cpos = cpu_to_le64(cpos);
922 ret_rec->r_refcount = 0;
923 if (i < le16_to_cpu(rb->rf_records.rl_used) &&
924 le64_to_cpu(rec->r_cpos) < cpos + len)
925 ret_rec->r_clusters =
926 cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
927 else
928 ret_rec->r_clusters = cpu_to_le32(len);
929 }
930
931out:
932 *index = i;
933}
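/*
 * Illustration with hypothetical records: given rl_recs =
 * { (r_cpos=0, r_clusters=4), (r_cpos=10, r_clusters=4) }, a lookup of
 * cpos=5, len=8 matches neither record, so we fake
 * (r_cpos=5, r_clusters=5, r_refcount=0): the fake rec is clamped at the
 * next record's start (10) rather than at cpos+len (13).
 */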
934
935/*
936 * Try to remove the refcount tree. The mechanism is:
937 * 1) Check whether i_clusters == 0; if not, exit.
938 * 2) Check whether we have i_xattr_loc in the dinode; if so, exit.
939 * 3) Check whether we have inline xattr values stored outside; if so, exit.
940 * 4) Remove the tree.
941 */
942int ocfs2_try_remove_refcount_tree(struct inode *inode,
943 struct buffer_head *di_bh)
944{
945 int ret;
946 struct ocfs2_inode_info *oi = OCFS2_I(inode);
947 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
948
949 down_write(&oi->ip_xattr_sem);
950 down_write(&oi->ip_alloc_sem);
951
952 if (oi->ip_clusters)
953 goto out;
954
955 if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc)
956 goto out;
957
958 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
959 ocfs2_has_inline_xattr_value_outside(inode, di))
960 goto out;
961
962 ret = ocfs2_remove_refcount_tree(inode, di_bh);
963 if (ret)
964 mlog_errno(ret);
965out:
966 up_write(&oi->ip_alloc_sem);
967 up_write(&oi->ip_xattr_sem);
968 return 0;
969}
970
971/*
972 * Given a cpos and len, try to find the refcount record which contains cpos.
973 * 1. If cpos can be found in one refcount record, return the record.
974 * 2. If cpos can't be found, return a fake record which starts from cpos
975 * and ends at a value between cpos+len and the start of the next record.
976 * This fake record has r_refcount = 0.
977 */
978static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
979 struct buffer_head *ref_root_bh,
980 u64 cpos, unsigned int len,
981 struct ocfs2_refcount_rec *ret_rec,
982 int *index,
983 struct buffer_head **ret_bh)
984{
985 int ret = 0, i, found;
986 u32 low_cpos;
987 struct ocfs2_extent_list *el;
988 struct ocfs2_extent_rec *tmp, *rec = NULL;
989 struct ocfs2_extent_block *eb;
990 struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
991 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
992 struct ocfs2_refcount_block *rb =
993 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
994
995 if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
996 ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
997 ret_rec, index);
998 *ret_bh = ref_root_bh;
999 get_bh(ref_root_bh);
1000 return 0;
1001 }
1002
1003 el = &rb->rf_list;
1004 low_cpos = cpos & OCFS2_32BIT_POS_MASK;
1005
1006 if (el->l_tree_depth) {
1007 ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
1008 if (ret) {
1009 mlog_errno(ret);
1010 goto out;
1011 }
1012
1013 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1014 el = &eb->h_list;
1015
1016 if (el->l_tree_depth) {
1017 ocfs2_error(sb,
1018 "refcount tree %llu has non zero tree "
1019 "depth in leaf btree tree block %llu\n",
1020 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1021 (unsigned long long)eb_bh->b_blocknr);
1022 ret = -EROFS;
1023 goto out;
1024 }
1025 }
1026
1027 found = 0;
1028 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
1029 rec = &el->l_recs[i];
1030
1031 if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
1032 found = 1;
1033 break;
1034 }
1035 }
1036
1037 /* adjust len when there is another ocfs2_extent_rec after it. */
1038 if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) {
1039 tmp = &el->l_recs[i+1];
1040
1041 if (le32_to_cpu(tmp->e_cpos) < cpos + len)
1042 len = le32_to_cpu(tmp->e_cpos) - cpos;
1043 }
1044
1045 ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
1046 &ref_leaf_bh);
1047 if (ret) {
1048 mlog_errno(ret);
1049 goto out;
1050 }
1051
1052 ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
1053 ret_rec, index);
1054 *ret_bh = ref_leaf_bh;
1055out:
1056 brelse(eb_bh);
1057 return ret;
1058}
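/*
 * Note on the 64/32-bit split above: a 64-bit cpos is looked up in the
 * extent b-tree by its low 32 bits only (low_cpos = cpos &
 * OCFS2_32BIT_POS_MASK), while the refcount list inside the leaf is then
 * searched with the full 64-bit cpos.
 */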
1059
1060enum ocfs2_ref_rec_contig {
1061 REF_CONTIG_NONE = 0,
1062 REF_CONTIG_LEFT,
1063 REF_CONTIG_RIGHT,
1064 REF_CONTIG_LEFTRIGHT,
1065};
1066
1067static enum ocfs2_ref_rec_contig
1068 ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
1069 int index)
1070{
1071 if ((rb->rf_records.rl_recs[index].r_refcount ==
1072 rb->rf_records.rl_recs[index + 1].r_refcount) &&
1073 (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
1074 le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
1075 le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
1076 return REF_CONTIG_RIGHT;
1077
1078 return REF_CONTIG_NONE;
1079}
1080
1081static enum ocfs2_ref_rec_contig
1082 ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
1083 int index)
1084{
1085 enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
1086
1087 if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
1088 ret = ocfs2_refcount_rec_adjacent(rb, index);
1089
1090 if (index > 0) {
1091 enum ocfs2_ref_rec_contig tmp;
1092
1093 tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
1094
1095 if (tmp == REF_CONTIG_RIGHT) {
1096 if (ret == REF_CONTIG_RIGHT)
1097 ret = REF_CONTIG_LEFTRIGHT;
1098 else
1099 ret = REF_CONTIG_LEFT;
1100 }
1101 }
1102
1103 return ret;
1104}
1105
1106static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
1107 int index)
1108{
1109 BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
1110 rb->rf_records.rl_recs[index+1].r_refcount);
1111
1112 le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
1113 le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
1114
1115 if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
1116 memmove(&rb->rf_records.rl_recs[index + 1],
1117 &rb->rf_records.rl_recs[index + 2],
1118 sizeof(struct ocfs2_refcount_rec) *
1119 (le16_to_cpu(rb->rf_records.rl_used) - index - 2));
1120
1121 memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
1122 0, sizeof(struct ocfs2_refcount_rec));
1123 le16_add_cpu(&rb->rf_records.rl_used, -1);
1124}
1125
1126/*
1127 * Merge the refcount rec if we are contiguous with the adjacent recs.
1128 */
1129static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
1130 int index)
1131{
1132 enum ocfs2_ref_rec_contig contig =
1133 ocfs2_refcount_rec_contig(rb, index);
1134
1135 if (contig == REF_CONTIG_NONE)
1136 return;
1137
1138 if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
1139 BUG_ON(index == 0);
1140 index--;
1141 }
1142
1143 ocfs2_rotate_refcount_rec_left(rb, index);
1144
1145 if (contig == REF_CONTIG_LEFTRIGHT)
1146 ocfs2_rotate_refcount_rec_left(rb, index);
1147}
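/*
 * Merge illustration with hypothetical records sharing one r_refcount:
 * given { (cpos=0, len=2), (cpos=2, len=3), (cpos=5, len=1) } and
 * index=1, ocfs2_refcount_rec_contig() reports REF_CONTIG_LEFTRIGHT, so
 * we rotate left twice at index 0 and end up with a single
 * (cpos=0, len=6) record.
 */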
1148
1149/*
1150 * Change the refcount indexed by "index" in ref_bh.
1151 * If refcount reaches 0, remove it.
1152 */
1153static int ocfs2_change_refcount_rec(handle_t *handle,
1154 struct ocfs2_caching_info *ci,
1155 struct buffer_head *ref_leaf_bh,
1156 int index, int merge, int change)
1157{
1158 int ret;
1159 struct ocfs2_refcount_block *rb =
1160 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1161 struct ocfs2_refcount_list *rl = &rb->rf_records;
1162 struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
1163
1164 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1165 OCFS2_JOURNAL_ACCESS_WRITE);
1166 if (ret) {
1167 mlog_errno(ret);
1168 goto out;
1169 }
1170
1171 mlog(0, "change index %d, old count %u, change %d\n", index,
1172 le32_to_cpu(rec->r_refcount), change);
1173 le32_add_cpu(&rec->r_refcount, change);
1174
1175 if (!rec->r_refcount) {
1176 if (index != le16_to_cpu(rl->rl_used) - 1) {
1177 memmove(rec, rec + 1,
1178 (le16_to_cpu(rl->rl_used) - index - 1) *
1179 sizeof(struct ocfs2_refcount_rec));
1180 memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
1181 0, sizeof(struct ocfs2_refcount_rec));
1182 }
1183
1184 le16_add_cpu(&rl->rl_used, -1);
1185 } else if (merge)
1186 ocfs2_refcount_rec_merge(rb, index);
1187
1188 ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
1189 if (ret)
1190 mlog_errno(ret);
1191out:
1192 return ret;
1193}
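/*
 * E.g., with hypothetical records { (0,4,ref=1), (4,4,ref=2) }, a change
 * of -1 at index 0 drops the first refcount to 0, so the second record
 * is memmove'd down, the freed slot is zeroed and rl_used shrinks to 1.
 */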
1194
1195static int ocfs2_expand_inline_ref_root(handle_t *handle,
1196 struct ocfs2_caching_info *ci,
1197 struct buffer_head *ref_root_bh,
1198 struct buffer_head **ref_leaf_bh,
1199 struct ocfs2_alloc_context *meta_ac)
1200{
1201 int ret;
1202 u16 suballoc_bit_start;
1203 u32 num_got;
1204 u64 blkno;
1205 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1206 struct buffer_head *new_bh = NULL;
1207 struct ocfs2_refcount_block *new_rb;
1208 struct ocfs2_refcount_block *root_rb =
1209 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1210
1211 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1212 OCFS2_JOURNAL_ACCESS_WRITE);
1213 if (ret) {
1214 mlog_errno(ret);
1215 goto out;
1216 }
1217
1218 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
1219 &suballoc_bit_start, &num_got,
1220 &blkno);
1221 if (ret) {
1222 mlog_errno(ret);
1223 goto out;
1224 }
1225
1226 new_bh = sb_getblk(sb, blkno);
1227 if (new_bh == NULL) {
1228 ret = -EIO;
1229 mlog_errno(ret);
1230 goto out;
1231 }
1232 ocfs2_set_new_buffer_uptodate(ci, new_bh);
1233
1234 ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1235 OCFS2_JOURNAL_ACCESS_CREATE);
1236 if (ret) {
1237 mlog_errno(ret);
1238 goto out;
1239 }
1240
1241 /*
1242 * Initialize ocfs2_refcount_block.
1243 * It should contain the same information as the old root,
1244 * so just memcpy it and change the corresponding fields.
1245 */
1246 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
1247
1248 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1249 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
1250 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1251 new_rb->rf_blkno = cpu_to_le64(blkno);
1252 new_rb->rf_cpos = cpu_to_le32(0);
1253 new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1254 new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1255 ocfs2_journal_dirty(handle, new_bh);
1256
1257 /* Now change the root. */
1258 memset(&root_rb->rf_list, 0, sb->s_blocksize -
1259 offsetof(struct ocfs2_refcount_block, rf_list));
1260 root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
1261 root_rb->rf_clusters = cpu_to_le32(1);
1262 root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
1263 root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
1264 root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
1265 root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
1266
1267 ocfs2_journal_dirty(handle, ref_root_bh);
1268
1269 mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno,
1270 le16_to_cpu(new_rb->rf_records.rl_used));
1271
1272 *ref_leaf_bh = new_bh;
1273 new_bh = NULL;
1274out:
1275 brelse(new_bh);
1276 return ret;
1277}
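/*
 * After this call the tree is one level deeper: the root keeps only an
 * extent list whose single rec, (cpos 0, 1 cluster, e_blkno = new leaf),
 * points at the new leaf block now carrying all the old refcount records.
 */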
1278
1279static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
1280 struct ocfs2_refcount_rec *next)
1281{
1282 if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
1283 ocfs2_get_ref_rec_low_cpos(next))
1284 return 1;
1285
1286 return 0;
1287}
1288
1289static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
1290{
1291 const struct ocfs2_refcount_rec *l = a, *r = b;
1292 u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
1293 u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
1294
1295 if (l_cpos > r_cpos)
1296 return 1;
1297 if (l_cpos < r_cpos)
1298 return -1;
1299 return 0;
1300}
1301
1302static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
1303{
1304 const struct ocfs2_refcount_rec *l = a, *r = b;
1305 u64 l_cpos = le64_to_cpu(l->r_cpos);
1306 u64 r_cpos = le64_to_cpu(r->r_cpos);
1307
1308 if (l_cpos > r_cpos)
1309 return 1;
1310 if (l_cpos < r_cpos)
1311 return -1;
1312 return 0;
1313}
1314
1315static void swap_refcount_rec(void *a, void *b, int size)
1316{
1317 struct ocfs2_refcount_rec *l = a, *r = b, tmp;
1318
1319 tmp = *(struct ocfs2_refcount_rec *)l;
1320 *(struct ocfs2_refcount_rec *)l =
1321 *(struct ocfs2_refcount_rec *)r;
1322 *(struct ocfs2_refcount_rec *)r = tmp;
1323}
1324
1325/*
1326 * The refcount records are ordered by their 64-bit cpos,
1327 * but we will use the low 32 bits as the e_cpos in the b-tree.
1328 * So we need to make sure that the split pos doesn't intersect with others.
1329 *
1330 * Note: The refcount records are already sorted by their low 32-bit cpos,
1331 * so just try the middle pos first; we will exit as soon as we find
1332 * a good position.
1333 */
1334static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
1335 u32 *split_pos, int *split_index)
1336{
1337 int num_used = le16_to_cpu(rl->rl_used);
1338 int delta, middle = num_used / 2;
1339
1340 for (delta = 0; delta < middle; delta++) {
1341 /* Let's check delta earlier than middle */
1342 if (ocfs2_refcount_rec_no_intersect(
1343 &rl->rl_recs[middle - delta - 1],
1344 &rl->rl_recs[middle - delta])) {
1345 *split_index = middle - delta;
1346 break;
1347 }
1348
1349 /* For even counts, don't walk off the end */
1350 if ((middle + delta + 1) == num_used)
1351 continue;
1352
1353 /* Now try delta past middle */
1354 if (ocfs2_refcount_rec_no_intersect(
1355 &rl->rl_recs[middle + delta],
1356 &rl->rl_recs[middle + delta + 1])) {
1357 *split_index = middle + delta + 1;
1358 break;
1359 }
1360 }
1361
1362 if (delta >= middle)
1363 return -ENOSPC;
1364
1365 *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
1366 return 0;
1367}
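/*
 * Example of the middle-out search above, with hypothetical low-32-bit
 * cpos ranges already sorted: rl_recs spanning { 0-4, 3-6, 6-8, 8-9 }
 * give num_used = 4 and middle = 2. At delta = 0 the pair (3-6, 6-8)
 * does not intersect, so split_index = 2 and *split_pos = 6.
 */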
1368
1369static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
1370 struct buffer_head *new_bh,
1371 u32 *split_cpos)
1372{
1373 int split_index = 0, num_moved, ret;
1374 u32 cpos = 0;
1375 struct ocfs2_refcount_block *rb =
1376 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1377 struct ocfs2_refcount_list *rl = &rb->rf_records;
1378 struct ocfs2_refcount_block *new_rb =
1379 (struct ocfs2_refcount_block *)new_bh->b_data;
1380 struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
1381
1382 mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n",
1383 (unsigned long long)ref_leaf_bh->b_blocknr,
1384 le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));
1385
1386 /*
1387 * XXX: Improvement later.
1388 * If we know all the high 32 bits of the cpos are the same, no need to sort.
1389 *
1390 * In order to make the whole process safe, we do:
1391 * 1. sort the entries by their low 32-bit cpos first so that we can
1392 * find the split cpos easily.
1393 * 2. call ocfs2_insert_extent to insert the new refcount block.
1394 * 3. move the refcount recs to the new block.
1395 * 4. sort the entries by their 64-bit cpos.
1396 * 5. dirty the new_rb and rb.
1397 */
1398 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1399 sizeof(struct ocfs2_refcount_rec),
1400 cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
1401
1402 ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
1403 if (ret) {
1404 mlog_errno(ret);
1405 return ret;
1406 }
1407
1408 new_rb->rf_cpos = cpu_to_le32(cpos);
1409
1410 /* move refcount records starting from split_index to the new block. */
1411 num_moved = le16_to_cpu(rl->rl_used) - split_index;
1412 memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
1413 num_moved * sizeof(struct ocfs2_refcount_rec));
1414
1415 /* ok, remove the entries we just moved over to the other block. */
1416 memset(&rl->rl_recs[split_index], 0,
1417 num_moved * sizeof(struct ocfs2_refcount_rec));
1418
1419 /* change old and new rl_used accordingly. */
1420 le16_add_cpu(&rl->rl_used, -num_moved);
1421 new_rl->rl_used = cpu_to_le16(num_moved);
1422
1423 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
1424 sizeof(struct ocfs2_refcount_rec),
1425 cmp_refcount_rec_by_cpos, swap_refcount_rec);
1426
1427 sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
1428 sizeof(struct ocfs2_refcount_rec),
1429 cmp_refcount_rec_by_cpos, swap_refcount_rec);
1430
1431 *split_cpos = cpos;
1432 return 0;
1433}
1434
1435static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1436 struct ocfs2_caching_info *ci,
1437 struct buffer_head *ref_root_bh,
1438 struct buffer_head *ref_leaf_bh,
1439 struct ocfs2_alloc_context *meta_ac)
1440{
1441 int ret;
1442 u16 suballoc_bit_start;
1443 u32 num_got, new_cpos;
1444 u64 blkno;
1445 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
1446 struct ocfs2_refcount_block *root_rb =
1447 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1448 struct buffer_head *new_bh = NULL;
1449 struct ocfs2_refcount_block *new_rb;
1450 struct ocfs2_extent_tree ref_et;
1451
1452 BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
1453
1454 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
1455 OCFS2_JOURNAL_ACCESS_WRITE);
1456 if (ret) {
1457 mlog_errno(ret);
1458 goto out;
1459 }
1460
1461 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1462 OCFS2_JOURNAL_ACCESS_WRITE);
1463 if (ret) {
1464 mlog_errno(ret);
1465 goto out;
1466 }
1467
1468 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
1469 &suballoc_bit_start, &num_got,
1470 &blkno);
1471 if (ret) {
1472 mlog_errno(ret);
1473 goto out;
1474 }
1475
1476 new_bh = sb_getblk(sb, blkno);
1477 if (new_bh == NULL) {
1478 ret = -EIO;
1479 mlog_errno(ret);
1480 goto out;
1481 }
1482 ocfs2_set_new_buffer_uptodate(ci, new_bh);
1483
1484 ret = ocfs2_journal_access_rb(handle, ci, new_bh,
1485 OCFS2_JOURNAL_ACCESS_CREATE);
1486 if (ret) {
1487 mlog_errno(ret);
1488 goto out;
1489 }
1490
1491 /* Initialize ocfs2_refcount_block. */
1492 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
1493 memset(new_rb, 0, sb->s_blocksize);
1494 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
1495 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
1496 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1497 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
1498 new_rb->rf_blkno = cpu_to_le64(blkno);
1499 new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
1500 new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
1501 new_rb->rf_records.rl_count =
1502 cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
1503 new_rb->rf_generation = root_rb->rf_generation;
1504
1505 ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
1506 if (ret) {
1507 mlog_errno(ret);
1508 goto out;
1509 }
1510
1511 ocfs2_journal_dirty(handle, ref_leaf_bh);
1512 ocfs2_journal_dirty(handle, new_bh);
1513
1514 ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
1515
1516 mlog(0, "insert new leaf block %llu at %u\n",
1517 (unsigned long long)new_bh->b_blocknr, new_cpos);
1518
1519 /* Insert the new leaf block with the specific offset cpos. */
1520 ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
1521 1, 0, meta_ac);
1522 if (ret)
1523 mlog_errno(ret);
1524
1525out:
1526 brelse(new_bh);
1527 return ret;
1528}
1529
1530static int ocfs2_expand_refcount_tree(handle_t *handle,
1531 struct ocfs2_caching_info *ci,
1532 struct buffer_head *ref_root_bh,
1533 struct buffer_head *ref_leaf_bh,
1534 struct ocfs2_alloc_context *meta_ac)
1535{
1536 int ret;
1537 struct buffer_head *expand_bh = NULL;
1538
1539 if (ref_root_bh == ref_leaf_bh) {
1540 /*
1541 * the old root bh hasn't been expanded to a b-tree,
1542 * so expand it first.
1543 */
1544 ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
1545 &expand_bh, meta_ac);
1546 if (ret) {
1547 mlog_errno(ret);
1548 goto out;
1549 }
1550 } else {
1551 expand_bh = ref_leaf_bh;
1552 get_bh(expand_bh);
1553 }
1554
1555
1556 /* Now add a new refcount block into the tree. */
1557 ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
1558 expand_bh, meta_ac);
1559 if (ret)
1560 mlog_errno(ret);
1561out:
1562 brelse(expand_bh);
1563 return ret;
1564}
1565
1566/*
1567 * Adjust the extent rec in b-tree representing ref_leaf_bh.
1568 *
1569 * Only called when we have inserted a new refcount rec at index 0
1570 * which means ocfs2_extent_rec.e_cpos may need some change.
1571 */
1572static int ocfs2_adjust_refcount_rec(handle_t *handle,
1573 struct ocfs2_caching_info *ci,
1574 struct buffer_head *ref_root_bh,
1575 struct buffer_head *ref_leaf_bh,
1576 struct ocfs2_refcount_rec *rec)
1577{
1578 int ret = 0, i;
1579 u32 new_cpos, old_cpos;
1580 struct ocfs2_path *path = NULL;
1581 struct ocfs2_extent_tree et;
1582 struct ocfs2_refcount_block *rb =
1583 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
1584 struct ocfs2_extent_list *el;
1585
1586 if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
1587 goto out;
1588
1589 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1590 old_cpos = le32_to_cpu(rb->rf_cpos);
1591 new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
1592 if (old_cpos <= new_cpos)
1593 goto out;
1594
1595 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
1596
1597 path = ocfs2_new_path_from_et(&et);
1598 if (!path) {
1599 ret = -ENOMEM;
1600 mlog_errno(ret);
1601 goto out;
1602 }
1603
1604 ret = ocfs2_find_path(ci, path, old_cpos);
1605 if (ret) {
1606 mlog_errno(ret);
1607 goto out;
1608 }
1609
1610 /*
1611 * 2 more credits, one for the leaf refcount block, one for
1612 * the extent block that contains the extent rec.
1613 */
1614 ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2);
1615 if (ret < 0) {
1616 mlog_errno(ret);
1617 goto out;
1618 }
1619
1620 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1621 OCFS2_JOURNAL_ACCESS_WRITE);
1622 if (ret < 0) {
1623 mlog_errno(ret);
1624 goto out;
1625 }
1626
1627 ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
1628 OCFS2_JOURNAL_ACCESS_WRITE);
1629 if (ret < 0) {
1630 mlog_errno(ret);
1631 goto out;
1632 }
1633
1634 /* change the leaf extent block first. */
1635 el = path_leaf_el(path);
1636
1637 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
1638 if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
1639 break;
1640
1641 BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
1642
1643 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1644
1645 /* change the r_cpos in the leaf block. */
1646 rb->rf_cpos = cpu_to_le32(new_cpos);
1647
1648 ocfs2_journal_dirty(handle, path_leaf_bh(path));
1649 ocfs2_journal_dirty(handle, ref_leaf_bh);
1650
1651out:
1652 ocfs2_free_path(path);
1653 return ret;
1654}
1655
1656static int ocfs2_insert_refcount_rec(handle_t *handle,
1657 struct ocfs2_caching_info *ci,
1658 struct buffer_head *ref_root_bh,
1659 struct buffer_head *ref_leaf_bh,
1660 struct ocfs2_refcount_rec *rec,
1661 int index, int merge,
1662 struct ocfs2_alloc_context *meta_ac)
1663{
1664 int ret;
1665 struct ocfs2_refcount_block *rb =
1666 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1667 struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1668 struct buffer_head *new_bh = NULL;
1669
1670 BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1671
1672 if (rf_list->rl_used == rf_list->rl_count) {
1673 u64 cpos = le64_to_cpu(rec->r_cpos);
1674 u32 len = le32_to_cpu(rec->r_clusters);
1675
1676 ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1677 ref_leaf_bh, meta_ac);
1678 if (ret) {
1679 mlog_errno(ret);
1680 goto out;
1681 }
1682
1683 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1684 cpos, len, NULL, &index,
1685 &new_bh);
1686 if (ret) {
1687 mlog_errno(ret);
1688 goto out;
1689 }
1690
1691 ref_leaf_bh = new_bh;
1692 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1693 rf_list = &rb->rf_records;
1694 }
1695
1696 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1697 OCFS2_JOURNAL_ACCESS_WRITE);
1698 if (ret) {
1699 mlog_errno(ret);
1700 goto out;
1701 }
1702
1703 if (index < le16_to_cpu(rf_list->rl_used))
1704 memmove(&rf_list->rl_recs[index + 1],
1705 &rf_list->rl_recs[index],
1706 (le16_to_cpu(rf_list->rl_used) - index) *
1707 sizeof(struct ocfs2_refcount_rec));
1708
1709 mlog(0, "insert refcount record start %llu, len %u, count %u "
1710 "to leaf block %llu at index %d\n",
1711 (unsigned long long)le64_to_cpu(rec->r_cpos),
1712 le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount),
1713 (unsigned long long)ref_leaf_bh->b_blocknr, index);
1714
1715 rf_list->rl_recs[index] = *rec;
1716
1717 le16_add_cpu(&rf_list->rl_used, 1);
1718
1719 if (merge)
1720 ocfs2_refcount_rec_merge(rb, index);
1721
1722 ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
1723 if (ret) {
1724 mlog_errno(ret);
1725 goto out;
1726 }
1727
1728 if (index == 0) {
1729 ret = ocfs2_adjust_refcount_rec(handle, ci,
1730 ref_root_bh,
1731 ref_leaf_bh, rec);
1732 if (ret)
1733 mlog_errno(ret);
1734 }
1735out:
1736 brelse(new_bh);
1737 return ret;
1738}
1739
1740/*
1741 * Split the refcount_rec indexed by "index" in ref_leaf_bh.
1742 * This is much simpler than our b-tree code.
1743 * split_rec is the new refcount rec we want to insert.
1744 * If split_rec->r_refcount > 0, we are changing the refcount (in case we
1745 * increase a refcount or decrease a refcount to non-zero).
1746 * If split_rec->r_refcount == 0, we are punching a hole in the current
1747 * refcount rec (in case we decrease a refcount to zero).
1748 */
1749static int ocfs2_split_refcount_rec(handle_t *handle,
1750 struct ocfs2_caching_info *ci,
1751 struct buffer_head *ref_root_bh,
1752 struct buffer_head *ref_leaf_bh,
1753 struct ocfs2_refcount_rec *split_rec,
1754 int index, int merge,
1755 struct ocfs2_alloc_context *meta_ac,
1756 struct ocfs2_cached_dealloc_ctxt *dealloc)
1757{
1758 int ret, recs_need;
1759 u32 len;
1760 struct ocfs2_refcount_block *rb =
1761 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1762 struct ocfs2_refcount_list *rf_list = &rb->rf_records;
1763 struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
1764 struct ocfs2_refcount_rec *tail_rec = NULL;
1765 struct buffer_head *new_bh = NULL;
1766
1767 BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1768
1769 mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n",
1770 le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters),
1771 le64_to_cpu(split_rec->r_cpos),
1772 le32_to_cpu(split_rec->r_clusters));
1773
1774 /*
1775 * If we just need to split off the header or tail clusters,
1776 * no more recs are needed; the split alone is OK.
1777 * Otherwise we need at least one new rec.
1778 */
1779 if (!split_rec->r_refcount &&
1780 (split_rec->r_cpos == orig_rec->r_cpos ||
1781 le64_to_cpu(split_rec->r_cpos) +
1782 le32_to_cpu(split_rec->r_clusters) ==
1783 le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1784 recs_need = 0;
1785 else
1786 recs_need = 1;
1787
1788 /*
1789 * We need one more rec if we split in the middle and the new rec has
1790 * some refcount in it.
1791 */
1792 if (split_rec->r_refcount &&
1793 (split_rec->r_cpos != orig_rec->r_cpos &&
1794 le64_to_cpu(split_rec->r_cpos) +
1795 le32_to_cpu(split_rec->r_clusters) !=
1796 le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
1797 recs_need++;
1798
1799 /* If the leaf block doesn't have enough records, expand it. */
1800 if (le16_to_cpu(rf_list->rl_used) + recs_need > le16_to_cpu(rf_list->rl_count)) {
1801 struct ocfs2_refcount_rec tmp_rec;
1802 u64 cpos = le64_to_cpu(orig_rec->r_cpos);
1803 len = le32_to_cpu(orig_rec->r_clusters);
1804 ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
1805 ref_leaf_bh, meta_ac);
1806 if (ret) {
1807 mlog_errno(ret);
1808 goto out;
1809 }
1810
1811 /*
1812 * We have to re-get it since cpos may now have moved to
1813 * another leaf block.
1814 */
1815 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1816 cpos, len, &tmp_rec, &index,
1817 &new_bh);
1818 if (ret) {
1819 mlog_errno(ret);
1820 goto out;
1821 }
1822
1823 ref_leaf_bh = new_bh;
1824 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
1825 rf_list = &rb->rf_records;
1826 orig_rec = &rf_list->rl_recs[index];
1827 }
1828
1829 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
1830 OCFS2_JOURNAL_ACCESS_WRITE);
1831 if (ret) {
1832 mlog_errno(ret);
1833 goto out;
1834 }
1835
1836 /*
1837 * We have calculated how many new records we need and stored
1838 * the count in recs_need, so make enough space first by moving
1839 * the records after "index" to the end.
1840 */
1841 if (index != le16_to_cpu(rf_list->rl_used) - 1)
1842 memmove(&rf_list->rl_recs[index + 1 + recs_need],
1843 &rf_list->rl_recs[index + 1],
1844 (le16_to_cpu(rf_list->rl_used) - index - 1) *
1845 sizeof(struct ocfs2_refcount_rec));
1846
1847 len = (le64_to_cpu(orig_rec->r_cpos) +
1848 le32_to_cpu(orig_rec->r_clusters)) -
1849 (le64_to_cpu(split_rec->r_cpos) +
1850 le32_to_cpu(split_rec->r_clusters));
1851
1852 /*
1853 * If we have "len", then we split off the tail and move it
1854 * to the end of the space we have just made.
1855 */
1856 if (len) {
1857 tail_rec = &rf_list->rl_recs[index + recs_need];
1858
1859 memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
1860 le64_add_cpu(&tail_rec->r_cpos,
1861 le32_to_cpu(tail_rec->r_clusters) - len);
1862 tail_rec->r_clusters = cpu_to_le32(len);
1863 }
1864
1865 /*
1866 * If the split pos isn't the same as the original one, we need to
1867 * split off the head.
1868 *
1869 * Note: It may happen that split_rec.r_refcount = 0, recs_need = 0
1870 * and len > 0, which means we just cut the head from orig_rec. In
1871 * that case orig_rec (== tail_rec) has already been modified above,
1872 * so the r_cpos check alone would be unreliable.
1873 */
1874 if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
1875 len = le64_to_cpu(split_rec->r_cpos) -
1876 le64_to_cpu(orig_rec->r_cpos);
1877 orig_rec->r_clusters = cpu_to_le32(len);
1878 index++;
1879 }
1880
1881 le16_add_cpu(&rf_list->rl_used, recs_need);
1882
1883 if (split_rec->r_refcount) {
1884 rf_list->rl_recs[index] = *split_rec;
1885 mlog(0, "insert refcount record start %llu, len %u, count %u "
1886 "to leaf block %llu at index %d\n",
1887 (unsigned long long)le64_to_cpu(split_rec->r_cpos),
1888 le32_to_cpu(split_rec->r_clusters),
1889 le32_to_cpu(split_rec->r_refcount),
1890 (unsigned long long)ref_leaf_bh->b_blocknr, index);
1891
1892 if (merge)
1893 ocfs2_refcount_rec_merge(rb, index);
1894 }
1895
1896 ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
1897 if (ret)
1898 mlog_errno(ret);
1899
1900out:
1901 brelse(new_bh);
1902 return ret;
1903}
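/*
 * Split example with hypothetical records: orig_rec = (r_cpos=0,
 * r_clusters=10, r_refcount=1), split_rec = (r_cpos=3, r_clusters=4,
 * r_refcount=2). Both a head and a tail survive, so recs_need = 2 and
 * the list ends up as { (0,3,ref=1), (3,4,ref=2), (7,3,ref=1) }.
 */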
1904
1905static int __ocfs2_increase_refcount(handle_t *handle,
1906 struct ocfs2_caching_info *ci,
1907 struct buffer_head *ref_root_bh,
1908 u64 cpos, u32 len, int merge,
1909 struct ocfs2_alloc_context *meta_ac,
1910 struct ocfs2_cached_dealloc_ctxt *dealloc)
1911{
1912 int ret = 0, index;
1913 struct buffer_head *ref_leaf_bh = NULL;
1914 struct ocfs2_refcount_rec rec;
1915 unsigned int set_len = 0;
1916
1917 mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n",
1918 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1919 (unsigned long long)cpos, len);
1920
1921 while (len) {
1922 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
1923 cpos, len, &rec, &index,
1924 &ref_leaf_bh);
1925 if (ret) {
1926 mlog_errno(ret);
1927 goto out;
1928 }
1929
1930 set_len = le32_to_cpu(rec.r_clusters);
1931
1932 /*
1933 * Here we may encounter 3 situations:
1934 *
1935 * 1. If we find an already existing record and the length
1936 * is the same, we just need to increase the r_refcount
1937 * and we are done.
1938 * 2. If we find a hole, just insert a rec with r_refcount = 1.
1939 * 3. If we are in the middle of one refcount record, split
1940 * it.
1941 */
1942 if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
1943 set_len <= len) {
1944 mlog(0, "increase refcount rec, start %llu, len %u, "
1945 "count %u\n", (unsigned long long)cpos, set_len,
1946 le32_to_cpu(rec.r_refcount));
1947 ret = ocfs2_change_refcount_rec(handle, ci,
1948 ref_leaf_bh, index,
1949 merge, 1);
1950 if (ret) {
1951 mlog_errno(ret);
1952 goto out;
1953 }
1954 } else if (!rec.r_refcount) {
1955 rec.r_refcount = cpu_to_le32(1);
1956
1957 mlog(0, "insert refcount rec, start %llu, len %u\n",
1958 (unsigned long long)le64_to_cpu(rec.r_cpos),
1959 set_len);
1960 ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
1961 ref_leaf_bh,
1962 &rec, index,
1963 merge, meta_ac);
1964 if (ret) {
1965 mlog_errno(ret);
1966 goto out;
1967 }
1968 } else {
1969 set_len = min((u64)(cpos + len),
1970 le64_to_cpu(rec.r_cpos) + set_len) - cpos;
1971 rec.r_cpos = cpu_to_le64(cpos);
1972 rec.r_clusters = cpu_to_le32(set_len);
1973 le32_add_cpu(&rec.r_refcount, 1);
1974
1975 mlog(0, "split refcount rec, start %llu, "
1976 "len %u, count %u\n",
1977 (unsigned long long)le64_to_cpu(rec.r_cpos),
1978 set_len, le32_to_cpu(rec.r_refcount));
1979 ret = ocfs2_split_refcount_rec(handle, ci,
1980 ref_root_bh, ref_leaf_bh,
1981 &rec, index, merge,
1982 meta_ac, dealloc);
1983 if (ret) {
1984 mlog_errno(ret);
1985 goto out;
1986 }
1987 }
1988
1989 cpos += set_len;
1990 len -= set_len;
1991 brelse(ref_leaf_bh);
1992 ref_leaf_bh = NULL;
1993 }
1994
1995out:
1996 brelse(ref_leaf_bh);
1997 return ret;
1998}
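/*
 * Sketch of the loop above on a hypothetical tree: increasing cpos=0,
 * len=10 over { (0,4,ref=1), hole at 4-6, (6,8,ref=1) } takes three
 * iterations: bump (0,4) to ref=2, insert (4,2,ref=1) for the hole,
 * then split (6,8) into (6,4,ref=2) + (10,4,ref=1).
 */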
1999
2000static int ocfs2_remove_refcount_extent(handle_t *handle,
2001 struct ocfs2_caching_info *ci,
2002 struct buffer_head *ref_root_bh,
2003 struct buffer_head *ref_leaf_bh,
2004 struct ocfs2_alloc_context *meta_ac,
2005 struct ocfs2_cached_dealloc_ctxt *dealloc)
2006{
2007 int ret;
2008 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2009 struct ocfs2_refcount_block *rb =
2010 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2011 struct ocfs2_extent_tree et;
2012
2013 BUG_ON(rb->rf_records.rl_used);
2014
2015 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2016 ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
2017 1, meta_ac, dealloc);
2018 if (ret) {
2019 mlog_errno(ret);
2020 goto out;
2021 }
2022
2023 ocfs2_remove_from_cache(ci, ref_leaf_bh);
2024
2025 /*
2026 * add the freed block to the dealloc so that it will be freed
2027 * when we run dealloc.
2028 */
2029 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
2030 le16_to_cpu(rb->rf_suballoc_slot),
2031 le64_to_cpu(rb->rf_blkno),
2032 le16_to_cpu(rb->rf_suballoc_bit));
2033 if (ret) {
2034 mlog_errno(ret);
2035 goto out;
2036 }
2037
2038 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
2039 OCFS2_JOURNAL_ACCESS_WRITE);
2040 if (ret) {
2041 mlog_errno(ret);
2042 goto out;
2043 }
2044
2045 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2046
2047 le32_add_cpu(&rb->rf_clusters, -1);
2048
2049 /*
2050 * Check whether we need to restore the root refcount block if
2051 * there is no leaf extent block at all.
2052 */
2053 if (!rb->rf_list.l_next_free_rec) {
2054 BUG_ON(rb->rf_clusters);
2055
2056 mlog(0, "reset refcount tree root %llu to be a record block.\n",
2057 (unsigned long long)ref_root_bh->b_blocknr);
2058
2059 rb->rf_flags = 0;
2060 rb->rf_parent = 0;
2061 rb->rf_cpos = 0;
2062 memset(&rb->rf_records, 0, sb->s_blocksize -
2063 offsetof(struct ocfs2_refcount_block, rf_records));
2064 rb->rf_records.rl_count =
2065 cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
2066 }
2067
2068 ocfs2_journal_dirty(handle, ref_root_bh);
2069
2070out:
2071 return ret;
2072}
2073
2074int ocfs2_increase_refcount(handle_t *handle,
2075 struct ocfs2_caching_info *ci,
2076 struct buffer_head *ref_root_bh,
2077 u64 cpos, u32 len,
2078 struct ocfs2_alloc_context *meta_ac,
2079 struct ocfs2_cached_dealloc_ctxt *dealloc)
2080{
2081 return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
2082 cpos, len, 1,
2083 meta_ac, dealloc);
2084}
2085
2086static int ocfs2_decrease_refcount_rec(handle_t *handle,
2087 struct ocfs2_caching_info *ci,
2088 struct buffer_head *ref_root_bh,
2089 struct buffer_head *ref_leaf_bh,
2090 int index, u64 cpos, unsigned int len,
2091 struct ocfs2_alloc_context *meta_ac,
2092 struct ocfs2_cached_dealloc_ctxt *dealloc)
2093{
2094 int ret;
2095 struct ocfs2_refcount_block *rb =
2096 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2097 struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
2098
2099 BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
2100 BUG_ON(cpos + len >
2101 le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
2102
2103 if (cpos == le64_to_cpu(rec->r_cpos) &&
2104 len == le32_to_cpu(rec->r_clusters))
2105 ret = ocfs2_change_refcount_rec(handle, ci,
2106 ref_leaf_bh, index, 1, -1);
2107 else {
2108 struct ocfs2_refcount_rec split = *rec;
2109 split.r_cpos = cpu_to_le64(cpos);
2110 split.r_clusters = cpu_to_le32(len);
2111
2112 le32_add_cpu(&split.r_refcount, -1);
2113
2114 mlog(0, "split refcount rec, start %llu, "
2115 "len %u, count %u, original start %llu, len %u\n",
2116 (unsigned long long)le64_to_cpu(split.r_cpos),
2117 len, le32_to_cpu(split.r_refcount),
2118 (unsigned long long)le64_to_cpu(rec->r_cpos),
2119 le32_to_cpu(rec->r_clusters));
2120 ret = ocfs2_split_refcount_rec(handle, ci,
2121 ref_root_bh, ref_leaf_bh,
2122 &split, index, 1,
2123 meta_ac, dealloc);
2124 }
2125
2126 if (ret) {
2127 mlog_errno(ret);
2128 goto out;
2129 }
2130
2131 /* Remove the leaf refcount block if it contains no refcount record. */
2132 if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
2133 ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
2134 ref_leaf_bh, meta_ac,
2135 dealloc);
2136 if (ret)
2137 mlog_errno(ret);
2138 }
2139
2140out:
2141 return ret;
2142}
2143
2144static int __ocfs2_decrease_refcount(handle_t *handle,
2145 struct ocfs2_caching_info *ci,
2146 struct buffer_head *ref_root_bh,
2147 u64 cpos, u32 len,
2148 struct ocfs2_alloc_context *meta_ac,
2149 struct ocfs2_cached_dealloc_ctxt *dealloc,
2150 int delete)
2151{
2152 int ret = 0, index = 0;
2153 struct ocfs2_refcount_rec rec;
2154 unsigned int r_count = 0, r_len;
2155 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2156 struct buffer_head *ref_leaf_bh = NULL;
2157
2158 mlog(0, "Tree owner %llu, decrease refcount start %llu, "
2159 "len %u, delete %u\n",
2160 (unsigned long long)ocfs2_metadata_cache_owner(ci),
2161 (unsigned long long)cpos, len, delete);
2162
2163 while (len) {
2164 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2165 cpos, len, &rec, &index,
2166 &ref_leaf_bh);
2167 if (ret) {
2168 mlog_errno(ret);
2169 goto out;
2170 }
2171
2172 r_count = le32_to_cpu(rec.r_refcount);
2173 BUG_ON(r_count == 0);
2174 if (!delete)
2175 BUG_ON(r_count > 1);
2176
2177 r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
2178 le32_to_cpu(rec.r_clusters)) - cpos;
2179
2180 ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
2181 ref_leaf_bh, index,
2182 cpos, r_len,
2183 meta_ac, dealloc);
2184 if (ret) {
2185 mlog_errno(ret);
2186 goto out;
2187 }
2188
2189 if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
2190 ret = ocfs2_cache_cluster_dealloc(dealloc,
2191 ocfs2_clusters_to_blocks(sb, cpos),
2192 r_len);
2193 if (ret) {
2194 mlog_errno(ret);
2195 goto out;
2196 }
2197 }
2198
2199 cpos += r_len;
2200 len -= r_len;
2201 brelse(ref_leaf_bh);
2202 ref_leaf_bh = NULL;
2203 }
2204
2205out:
2206 brelse(ref_leaf_bh);
2207 return ret;
2208}
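/*
 * E.g. (hypothetical): decreasing cpos=0, len=4 with delete=1 over a
 * record (0,4,ref=1) removes the record and queues clusters 0-3 on the
 * dealloc list; with ref=2 it would only drop the count to 1 and free
 * nothing.
 */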
2209
2210/* Caller must hold refcount tree lock. */
2211int ocfs2_decrease_refcount(struct inode *inode,
2212 handle_t *handle, u32 cpos, u32 len,
2213 struct ocfs2_alloc_context *meta_ac,
2214 struct ocfs2_cached_dealloc_ctxt *dealloc,
2215 int delete)
2216{
2217 int ret;
2218 u64 ref_blkno;
2219 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2220 struct buffer_head *ref_root_bh = NULL;
2221 struct ocfs2_refcount_tree *tree;
2222
2223 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2224
2225 ret = ocfs2_get_refcount_block(inode, &ref_blkno);
2226 if (ret) {
2227 mlog_errno(ret);
2228 goto out;
2229 }
2230
2231 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
2232 if (ret) {
2233 mlog_errno(ret);
2234 goto out;
2235 }
2236
2237 ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
2238 &ref_root_bh);
2239 if (ret) {
2240 mlog_errno(ret);
2241 goto out;
2242 }
2243
2244 ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
2245 cpos, len, meta_ac, dealloc, delete);
2246 if (ret)
2247 mlog_errno(ret);
2248out:
2249 brelse(ref_root_bh);
2250 return ret;
2251}
2252
2253/*
2254 * Mark the already-existing extent at cpos as refcounted for len clusters.
2255 * This adds the refcount extent flag.
2256 *
2257 * If the existing extent is larger than the request, initiate a
2258 * split. An attempt will be made at merging with adjacent extents.
2259 *
2260 * The caller is responsible for passing down meta_ac if we'll need it.
2261 */
2262static int ocfs2_mark_extent_refcounted(struct inode *inode,
2263 struct ocfs2_extent_tree *et,
2264 handle_t *handle, u32 cpos,
2265 u32 len, u32 phys,
2266 struct ocfs2_alloc_context *meta_ac,
2267 struct ocfs2_cached_dealloc_ctxt *dealloc)
2268{
2269 int ret;
2270
2271 mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n",
2272 inode->i_ino, cpos, len, phys);
2273
2274 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2275 ocfs2_error(inode->i_sb, "Inode %lu wants to use refcount "
2276 "tree, but the feature bit is not set in the "
2277 "super block.", inode->i_ino);
2278 ret = -EROFS;
2279 goto out;
2280 }
2281
2282 ret = ocfs2_change_extent_flag(handle, et, cpos,
2283 len, phys, meta_ac, dealloc,
2284 OCFS2_EXT_REFCOUNTED, 0);
2285 if (ret)
2286 mlog_errno(ret);
2287
2288out:
2289 return ret;
2290}
2291
2292/*
2293 * Given some contiguous physical clusters, calculate what we need
2294 * for modifying their refcount.
2295 */
2296static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2297 struct ocfs2_caching_info *ci,
2298 struct buffer_head *ref_root_bh,
2299 u64 start_cpos,
2300 u32 clusters,
2301 int *meta_add,
2302 int *credits)
2303{
2304 int ret = 0, index, ref_blocks = 0, recs_add = 0;
2305 u64 cpos = start_cpos;
2306 struct ocfs2_refcount_block *rb;
2307 struct ocfs2_refcount_rec rec;
2308 struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
2309 u32 len;
2310
2311 mlog(0, "start_cpos %llu, clusters %u\n",
2312 (unsigned long long)start_cpos, clusters);
2313 while (clusters) {
2314 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2315 cpos, clusters, &rec,
2316 &index, &ref_leaf_bh);
2317 if (ret) {
2318 mlog_errno(ret);
2319 goto out;
2320 }
2321
2322 if (ref_leaf_bh != prev_bh) {
2323 /*
2324 * Now we encounter a new leaf block, so calculate
2325 * whether we need to extend the old leaf.
2326 */
2327 if (prev_bh) {
2328 rb = (struct ocfs2_refcount_block *)
2329 prev_bh->b_data;
2330
2331 if (le16_to_cpu(rb->rf_records.rl_used) +
2332 recs_add >
2333 le16_to_cpu(rb->rf_records.rl_count))
2334 ref_blocks++;
2335 }
2336
2337 recs_add = 0;
2338 *credits += 1;
2339 brelse(prev_bh);
2340 prev_bh = ref_leaf_bh;
2341 get_bh(prev_bh);
2342 }
2343
2344 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2345
2346 mlog(0, "recs_add %d, cpos %llu, clusters %u, rec->r_cpos %llu, "
2347 "rec->r_clusters %u, rec->r_refcount %u, index %d\n",
2348 recs_add, (unsigned long long)cpos, clusters,
2349 (unsigned long long)le64_to_cpu(rec.r_cpos),
2350 le32_to_cpu(rec.r_clusters),
2351 le32_to_cpu(rec.r_refcount), index);
2352
2353 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2354 le32_to_cpu(rec.r_clusters)) - cpos;
2355 /*
2356 * If the refcount rec already exists, we just need
2357 * to check whether there is a split. Otherwise we just need
2358 * to increase the refcount.
2359 * If we will insert one, increase recs_add.
2360 *
2361 * We count all the records which will be inserted into the
2362 * same refcount block, so that we can tell exactly whether
2363 * we need a new refcount block or not.
2364 */
2365 if (rec.r_refcount) {
2366 /* Check whether we need a split at the beginning. */
2367 if (cpos == start_cpos &&
2368 cpos != le64_to_cpu(rec.r_cpos))
2369 recs_add++;
2370
2371 /* Check whether we need a split in the end. */
2372 if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
2373 le32_to_cpu(rec.r_clusters))
2374 recs_add++;
2375 } else
2376 recs_add++;
2377
2378 brelse(ref_leaf_bh);
2379 ref_leaf_bh = NULL;
2380 clusters -= len;
2381 cpos += len;
2382 }
2383
2384 if (prev_bh) {
2385 rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
2386
2387 if (le16_to_cpu(rb->rf_records.rl_used) + recs_add >
2388 le16_to_cpu(rb->rf_records.rl_count))
2389 ref_blocks++;
2390
2391 *credits += 1;
2392 }
2393
2394 if (!ref_blocks)
2395 goto out;
2396
2397 mlog(0, "we need ref_blocks %d\n", ref_blocks);
2398 *meta_add += ref_blocks;
2399 *credits += ref_blocks;
2400
2401 /*
2402 * So we may need to insert ref_blocks new blocks into the tree.
2403 * That also means we need to change the b-tree and add that number
2404 * of records, since we never merge them.
2405 * We need one more block for expansion in case the newly created
2406 * leaf block is also full and needs a split.
2407 */
2408 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
2409 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
2410 struct ocfs2_extent_tree et;
2411
2412 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2413 *meta_add += ocfs2_extend_meta_needed(et.et_root_el);
2414 *credits += ocfs2_calc_extend_credits(sb,
2415 et.et_root_el,
2416 ref_blocks);
2417 } else {
2418 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
2419 *meta_add += 1;
2420 }
2421
2422out:
2423 brelse(ref_leaf_bh);
2424 brelse(prev_bh);
2425 return ret;
2426}
2427
2428/*
2429 * For a refcount tree, we will decrease the refcount of some
2430 * contiguous clusters, so just walk through them to see how many
2431 * blocks we are going to touch and whether we need to create new blocks.
2432 *
2433 * Normally the refcount blocks storing these refcounts should be
2434 * contiguous as well, so we can get the number easily.
2435 * As for meta_ac, we will at most split 2 refcount records and add
2436 * 2 more refcount blocks, so just check it in a rough way.
2437 *
2438 * Caller must hold the refcount tree lock.
2439 */
2440int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2441 struct buffer_head *di_bh,
2442 u64 phys_blkno,
2443 u32 clusters,
2444 int *credits,
2445 struct ocfs2_alloc_context **meta_ac)
2446{
2447 int ret, ref_blocks = 0;
2448 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2449 struct ocfs2_inode_info *oi = OCFS2_I(inode);
2450 struct buffer_head *ref_root_bh = NULL;
2451 struct ocfs2_refcount_tree *tree;
2452 u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
2453
2454 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2455 ocfs2_error(inode->i_sb, "Inode %lu wants to use refcount "
2456 "tree, but the feature bit is not set in the "
2457 "super block.", inode->i_ino);
2458 ret = -EROFS;
2459 goto out;
2460 }
2461
2462 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
2463
2464 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
2465 le64_to_cpu(di->i_refcount_loc), &tree);
2466 if (ret) {
2467 mlog_errno(ret);
2468 goto out;
2469 }
2470
2471 ret = ocfs2_read_refcount_block(&tree->rf_ci,
2472 le64_to_cpu(di->i_refcount_loc),
2473 &ref_root_bh);
2474 if (ret) {
2475 mlog_errno(ret);
2476 goto out;
2477 }
2478
2479 ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
2480 &tree->rf_ci,
2481 ref_root_bh,
2482 start_cpos, clusters,
2483 &ref_blocks, credits);
2484 if (ret) {
2485 mlog_errno(ret);
2486 goto out;
2487 }
2488
2489 mlog(0, "reserve new metadata %d, credits = %d\n",
2490 ref_blocks, *credits);
2491
2492 if (ref_blocks) {
2493 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
2494 ref_blocks, meta_ac);
2495 if (ret)
2496 mlog_errno(ret);
2497 }
2498
2499out:
2500 brelse(ref_root_bh);
2501 return ret;
2502}
2503
2504#define MAX_CONTIG_BYTES 1048576
2505
2506static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
2507{
2508 return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
2509}
2510
2511static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
2512{
2513 return ~(ocfs2_cow_contig_clusters(sb) - 1);
2514}
2515
2516/*
2517 * Given an extent that starts at 'start' and an I/O that starts at 'cpos',
2518 * find an offset (start + (n * contig_clusters)) that is closest to cpos
2519 * while still being less than or equal to it.
2520 *
2521 * The goal is to break the extent at a multiple of contig_clusters.
2522 */
2523static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
2524 unsigned int start,
2525 unsigned int cpos)
2526{
2527 BUG_ON(start > cpos);
2528
2529 return start + ((cpos - start) & ocfs2_cow_contig_mask(sb));
2530}
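/*
 * Worked example, assuming a hypothetical 4KB cluster size:
 * contig_clusters = 1048576 / 4096 = 256, so the mask is ~255. For
 * start = 100 and cpos = 700, (700 - 100) & ~255 = 512 and the aligned
 * start is 612, the largest start + n * 256 that is still <= cpos.
 */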
2531
2532/*
2533 * Given a cluster count of len, pad it out so that it is a multiple
2534 * of contig_clusters.
2535 */
2536static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
2537 unsigned int len)
2538{
2539 unsigned int padded =
2540 (len + (ocfs2_cow_contig_clusters(sb) - 1)) &
2541 ocfs2_cow_contig_mask(sb);
2542
2543 /* Did we wrap? */
2544 if (padded < len)
2545 padded = UINT_MAX;
2546
2547 return padded;
2548}
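/*
 * With the same hypothetical 256-cluster alignment, len = 300 pads to
 * (300 + 255) & ~255 = 512; a len close to UINT_MAX that would wrap
 * around is clamped to UINT_MAX instead.
 */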
2549
2550/*
2551 * Calculate the start and number of virtual clusters we need to CoW.
2552 *
2553 * cpos is the virtual start cluster position where we want to do CoW
2554 * in a file and write_len is the cluster length.
2555 * max_cpos is the place where we want to stop CoW intentionally.
2556 *
2557 * Normally we start CoW from the beginning of the extent record containing
2558 * cpos. We try to break up extents on boundaries of MAX_CONTIG_BYTES so
2559 * that we get good I/O from the resulting extent tree.
2560 */
2561static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
2562 struct ocfs2_extent_list *el,
2563 u32 cpos,
2564 u32 write_len,
2565 u32 max_cpos,
2566 u32 *cow_start,
2567 u32 *cow_len)
2568{
2569 int ret = 0;
2570 int tree_height = le16_to_cpu(el->l_tree_depth), i;
2571 struct buffer_head *eb_bh = NULL;
2572 struct ocfs2_extent_block *eb = NULL;
2573 struct ocfs2_extent_rec *rec;
2574 unsigned int want_clusters, rec_end = 0;
2575 int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
2576 int leaf_clusters;
2577
2578 BUG_ON(cpos + write_len > max_cpos);
2579
2580 if (tree_height > 0) {
2581 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
2582 if (ret) {
2583 mlog_errno(ret);
2584 goto out;
2585 }
2586
2587 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2588 el = &eb->h_list;
2589
2590 if (el->l_tree_depth) {
2591 ocfs2_error(inode->i_sb,
2592 "Inode %lu has non zero tree depth in "
2593 "leaf block %llu\n", inode->i_ino,
2594 (unsigned long long)eb_bh->b_blocknr);
2595 ret = -EROFS;
2596 goto out;
2597 }
2598 }
2599
2600 *cow_len = 0;
2601 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2602 rec = &el->l_recs[i];
2603
2604 if (ocfs2_is_empty_extent(rec)) {
2605 mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
2606 "index %d\n", inode->i_ino, i);
2607 continue;
2608 }
2609
2610 if (le32_to_cpu(rec->e_cpos) +
2611 le16_to_cpu(rec->e_leaf_clusters) <= cpos)
2612 continue;
2613
2614 if (*cow_len == 0) {
2615 /*
2616 * We should find a refcounted record in the
2617 * first pass.
2618 */
2619 BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
2620 *cow_start = le32_to_cpu(rec->e_cpos);
2621 }
2622
2623 /*
2624 * If we encounter a hole, a non-refcounted record or
2625 * pass the max_cpos, stop the search.
2626 */
2627 if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
2628 (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
2629 (max_cpos <= le32_to_cpu(rec->e_cpos)))
2630 break;
2631
2632 leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
2633 rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
2634 if (rec_end > max_cpos) {
2635 rec_end = max_cpos;
2636 leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
2637 }
2638
2639 /*
2640 * How many clusters do we actually need from
2641 * this extent? First we see how many we actually
2642 * need to complete the write. If that's smaller
2643 * than contig_clusters, we try for contig_clusters.
2644 */
2645 if (!*cow_len)
2646 want_clusters = write_len;
2647 else
2648 want_clusters = (cpos + write_len) -
2649 (*cow_start + *cow_len);
2650 if (want_clusters < contig_clusters)
2651 want_clusters = contig_clusters;
2652
2653 /*
2654 * If the write does not cover the whole extent, we
2655 * need to calculate how we're going to split the extent.
2656 * We try to do it on contig_clusters boundaries.
2657 *
2658 * Any extent smaller than contig_clusters will be
2659 * CoWed in its entirety.
2660 */
2661 if (leaf_clusters <= contig_clusters)
2662 *cow_len += leaf_clusters;
2663 else if (*cow_len || (*cow_start == cpos)) {
2664 /*
2665 * This extent needs to be CoW'd from its
2666 * beginning, so all we have to do is compute
2667 * how many clusters to grab. We align
2668 * want_clusters to the edge of contig_clusters
2669 * to get better I/O.
2670 */
2671 want_clusters = ocfs2_cow_align_length(inode->i_sb,
2672 want_clusters);
2673
2674 if (leaf_clusters < want_clusters)
2675 *cow_len += leaf_clusters;
2676 else
2677 *cow_len += want_clusters;
2678 } else if ((*cow_start + contig_clusters) >=
2679 (cpos + write_len)) {
2680 /*
2681 * Breaking off contig_clusters at the front
2682 * of the extent will cover our write. That's
2683 * easy.
2684 */
2685 *cow_len = contig_clusters;
2686 } else if ((rec_end - cpos) <= contig_clusters) {
2687 /*
2688 * Breaking off contig_clusters at the tail of
2689 * this extent will cover cpos.
2690 */
2691 *cow_start = rec_end - contig_clusters;
2692 *cow_len = contig_clusters;
2693 } else if ((rec_end - cpos) <= want_clusters) {
2694 /*
2695 * While we can't fit the entire write in this
2696 * extent, we know that the write goes from cpos
2697 * to the end of the extent. Break that off.
2698 * We try to break it at some multiple of
2699 * contig_clusters from the front of the extent.
2700 * Failing that (ie, cpos is within
2701 * contig_clusters of the front), we'll CoW the
2702 * entire extent.
2703 */
2704 *cow_start = ocfs2_cow_align_start(inode->i_sb,
2705 *cow_start, cpos);
2706 *cow_len = rec_end - *cow_start;
2707 } else {
2708 /*
2709 * Ok, the entire write lives in the middle of
2710 * this extent. Let's try to slice the extent up
2711 * nicely. Optimally, our CoW region starts at
2712 * m*contig_clusters from the beginning of the
2713 * extent and goes for n*contig_clusters,
2714 * covering the entire write.
2715 */
2716 *cow_start = ocfs2_cow_align_start(inode->i_sb,
2717 *cow_start, cpos);
2718
2719 want_clusters = (cpos + write_len) - *cow_start;
2720 want_clusters = ocfs2_cow_align_length(inode->i_sb,
2721 want_clusters);
2722 if (*cow_start + want_clusters <= rec_end)
2723 *cow_len = want_clusters;
2724 else
2725 *cow_len = rec_end - *cow_start;
2726 }
2727
2728 /* Have we covered our entire write yet? */
2729 if ((*cow_start + *cow_len) >= (cpos + write_len))
2730 break;
2731
2732 /*
2733 * If we reach the end of the extent block and don't get enough
2734 * clusters, continue with the next extent block if possible.
2735 */
2736 if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
2737 eb && eb->h_next_leaf_blk) {
2738 /* Read h_next_leaf_blk before brelse() drops our reference. */
2739 u64 next_blk = le64_to_cpu(eb->h_next_leaf_blk);
2740 brelse(eb_bh);
2741 eb_bh = NULL;
2742 ret = ocfs2_read_extent_block(INODE_CACHE(inode),
2743 next_blk, &eb_bh);
2744 if (ret) {
2745 mlog_errno(ret);
2746 goto out;
2747 }
2748
2749 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2750 el = &eb->h_list;
2751 i = -1;
2752 }
2753 }
2754
2755out:
2756 brelse(eb_bh);
2757 return ret;
2758}
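/*
 * Editorial example of the case analysis above (made-up numbers,
 * contig_clusters = 256): given one refcounted extent [0, 1024) and a
 * write of clusters [300, 310), the extent is bigger than
 * contig_clusters and the write sits entirely inside it, so the final
 * "entire write lives in the middle" branch fires:
 *
 *	cow_start = ocfs2_cow_align_start(sb, 0, 300)     = 256
 *	want      = ocfs2_cow_align_length(sb, 310 - 256) = 256
 *	cow_len   = 256 (since 256 + 256 <= 1024)
 *
 * so exactly the aligned chunk [256, 512) gets CoWed.
 */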
2759
2760/*
2761 * Prepare meta_ac, data_ac and calculate credits when we want to add some
2762 * num_clusters in data_tree "et" and change the refcount for the old
2763 * clusters (starting from p_cluster) in the refcount tree.
2764 *
2765 * Note:
2766 * 1. Since we may split the old tree, we will need at most num_clusters + 2
2767 * more new leaf records.
2768 * 2. In some cases we may not need to reserve new clusters (e.g. reflink),
2769 * so just pass data_ac = NULL.
2770 */
2771static int ocfs2_lock_refcount_allocators(struct super_block *sb,
2772 u32 p_cluster, u32 num_clusters,
2773 struct ocfs2_extent_tree *et,
2774 struct ocfs2_caching_info *ref_ci,
2775 struct buffer_head *ref_root_bh,
2776 struct ocfs2_alloc_context **meta_ac,
2777 struct ocfs2_alloc_context **data_ac,
2778 int *credits)
2779{
2780 int ret = 0, meta_add = 0;
2781 int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
2782
2783 if (num_free_extents < 0) {
2784 ret = num_free_extents;
2785 mlog_errno(ret);
2786 goto out;
2787 }
2788
2789 if (num_free_extents < num_clusters + 2)
2790 meta_add =
2791 ocfs2_extend_meta_needed(et->et_root_el);
2792
2793 *credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
2794 num_clusters + 2);
2795
2796 ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
2797 p_cluster, num_clusters,
2798 &meta_add, credits);
2799 if (ret) {
2800 mlog_errno(ret);
2801 goto out;
2802 }
2803
2804 mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n",
2805 meta_add, num_clusters, *credits);
2806 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
2807 meta_ac);
2808 if (ret) {
2809 mlog_errno(ret);
2810 goto out;
2811 }
2812
2813 if (data_ac) {
2814 ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
2815 data_ac);
2816 if (ret)
2817 mlog_errno(ret);
2818 }
2819
2820out:
2821 if (ret) {
2822 if (*meta_ac) {
2823 ocfs2_free_alloc_context(*meta_ac);
2824 *meta_ac = NULL;
2825 }
2826 }
2827
2828 return ret;
2829}
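/*
 * A note on the "num_clusters + 2" bound used above (editorial): in the
 * worst case each new cluster becomes its own extent record, and
 * splitting an existing record to carve out the replaced range can add
 * two more records (the head and tail pieces), hence num_clusters + 2.
 */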
2830
2831static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2832{
2833 BUG_ON(buffer_dirty(bh));
2834
2835 clear_buffer_mapped(bh);
2836
2837 return 0;
2838}
2839
2840static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2841 struct ocfs2_cow_context *context,
2842 u32 cpos, u32 old_cluster,
2843 u32 new_cluster, u32 new_len)
2844{
2845 int ret = 0, partial;
2846 struct ocfs2_caching_info *ci = context->data_et.et_ci;
2847 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2848 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2849 struct page *page;
2850 pgoff_t page_index;
2851 unsigned int from, to;
2852 loff_t offset, end, map_end;
2853 struct address_space *mapping = context->inode->i_mapping;
2854
2855 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
2856 new_cluster, new_len, cpos);
2857
2858 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2859 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2860
2861 while (offset < end) {
2862 page_index = offset >> PAGE_CACHE_SHIFT;
2863 map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
2864 if (map_end > end)
2865 map_end = end;
2866
2867 /* from, to is the offset within the page. */
2868 from = offset & (PAGE_CACHE_SIZE - 1);
2869 to = PAGE_CACHE_SIZE;
2870 if (map_end & (PAGE_CACHE_SIZE - 1))
2871 to = map_end & (PAGE_CACHE_SIZE - 1);
2872
2873 page = grab_cache_page(mapping, page_index);
/* grab_cache_page() returns a locked page, or NULL on allocation failure. */
if (!page) {
ret = -ENOMEM;
mlog_errno(ret);
break;
}
2874
2875 /* This page can't be dirtied before we CoW it out. */
2876 BUG_ON(PageDirty(page));
2877
2878 if (!PageUptodate(page)) {
2879 ret = block_read_full_page(page, ocfs2_get_block);
2880 if (ret) {
2881 mlog_errno(ret);
2882 goto unlock;
2883 }
2884 lock_page(page);
2885 }
2886
2887 if (page_has_buffers(page)) {
2888 ret = walk_page_buffers(handle, page_buffers(page),
2889 from, to, &partial,
2890 ocfs2_clear_cow_buffer);
2891 if (ret) {
2892 mlog_errno(ret);
2893 goto unlock;
2894 }
2895 }
2896
2897 ocfs2_map_and_dirty_page(context->inode,
2898 handle, from, to,
2899 page, 0, &new_block);
2900 mark_page_accessed(page);
2901unlock:
2902 unlock_page(page);
2903 page_cache_release(page);
2904 page = NULL;
2905 offset = map_end;
2906 if (ret)
2907 break;
2908 }
2909
2910 return ret;
2911}
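/*
 * Page-math example for the loop above (editorial, assuming
 * PAGE_CACHE_SIZE = 4096): for offset = 0x1200 and end = 0x2200,
 *
 *	pass 1: page_index = 1, map_end = 0x2000,
 *	        from = 0x200, to = 0x1000 (tail of page 1)
 *	pass 2: page_index = 2, map_end clamped to 0x2200,
 *	        from = 0, to = 0x200 (head of page 2)
 *
 * Each pass reads the page in if necessary, unmaps its buffers, and
 * remaps and dirties only the bytes that belong to the CoW range.
 */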
2912
2913static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
2914 struct ocfs2_cow_context *context,
2915 u32 cpos, u32 old_cluster,
2916 u32 new_cluster, u32 new_len)
2917{
2918 int ret = 0;
2919 struct super_block *sb = context->inode->i_sb;
2920 struct ocfs2_caching_info *ci = context->data_et.et_ci;
2921 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
2922 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
2923 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2924 struct ocfs2_super *osb = OCFS2_SB(sb);
2925 struct buffer_head *old_bh = NULL;
2926 struct buffer_head *new_bh = NULL;
2927
2928 mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster,
2929 new_cluster, new_len);
2930
2931 for (i = 0; i < blocks; i++, old_block++, new_block++) {
2932 new_bh = sb_getblk(osb->sb, new_block);
2933 if (new_bh == NULL) {
2934 ret = -EIO;
2935 mlog_errno(ret);
2936 break;
2937 }
2938
2939 ocfs2_set_new_buffer_uptodate(ci, new_bh);
2940
2941 ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
2942 if (ret) {
2943 mlog_errno(ret);
2944 break;
2945 }
2946
2947 ret = ocfs2_journal_access(handle, ci, new_bh,
2948 OCFS2_JOURNAL_ACCESS_CREATE);
2949 if (ret) {
2950 mlog_errno(ret);
2951 break;
2952 }
2953
2954 memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
2955 ret = ocfs2_journal_dirty(handle, new_bh);
2956 if (ret) {
2957 mlog_errno(ret);
2958 break;
2959 }
2960
2961 brelse(new_bh);
2962 brelse(old_bh);
2963 new_bh = NULL;
2964 old_bh = NULL;
2965 }
2966
2967 brelse(new_bh);
2968 brelse(old_bh);
2969 return ret;
2970}
2971
2972static int ocfs2_clear_ext_refcount(handle_t *handle,
2973 struct ocfs2_extent_tree *et,
2974 u32 cpos, u32 p_cluster, u32 len,
2975 unsigned int ext_flags,
2976 struct ocfs2_alloc_context *meta_ac,
2977 struct ocfs2_cached_dealloc_ctxt *dealloc)
2978{
2979 int ret, index;
2980 struct ocfs2_extent_rec replace_rec;
2981 struct ocfs2_path *path = NULL;
2982 struct ocfs2_extent_list *el;
2983 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2984 u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
2985
2986 mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n",
2987 (unsigned long long)ino, cpos, len, p_cluster, ext_flags);
2988
2989 memset(&replace_rec, 0, sizeof(replace_rec));
2990 replace_rec.e_cpos = cpu_to_le32(cpos);
2991 replace_rec.e_leaf_clusters = cpu_to_le16(len);
2992 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
2993 p_cluster));
2994 replace_rec.e_flags = ext_flags;
2995 replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
2996
2997 path = ocfs2_new_path_from_et(et);
2998 if (!path) {
2999 ret = -ENOMEM;
3000 mlog_errno(ret);
3001 goto out;
3002 }
3003
3004 ret = ocfs2_find_path(et->et_ci, path, cpos);
3005 if (ret) {
3006 mlog_errno(ret);
3007 goto out;
3008 }
3009
3010 el = path_leaf_el(path);
3011
3012 index = ocfs2_search_extent_list(el, cpos);
3013 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
3014 ocfs2_error(sb,
3015 "Inode %llu has an extent at cpos %u which can no "
3016 "longer be found.\n",
3017 (unsigned long long)ino, cpos);
3018 ret = -EROFS;
3019 goto out;
3020 }
3021
3022 ret = ocfs2_split_extent(handle, et, path, index,
3023 &replace_rec, meta_ac, dealloc);
3024 if (ret)
3025 mlog_errno(ret);
3026
3027out:
3028 ocfs2_free_path(path);
3029 return ret;
3030}
3031
3032static int ocfs2_replace_clusters(handle_t *handle,
3033 struct ocfs2_cow_context *context,
3034 u32 cpos, u32 old,
3035 u32 new, u32 len,
3036 unsigned int ext_flags)
3037{
3038 int ret;
3039 struct ocfs2_caching_info *ci = context->data_et.et_ci;
3040 u64 ino = ocfs2_metadata_cache_owner(ci);
3041
3042 mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n",
3043 (unsigned long long)ino, cpos, old, new, len, ext_flags);
3044
3045 /* If the old clusters are unwritten, there is no need to duplicate. */
3046 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
3047 ret = context->cow_duplicate_clusters(handle, context, cpos,
3048 old, new, len);
3049 if (ret) {
3050 mlog_errno(ret);
3051 goto out;
3052 }
3053 }
3054
3055 ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
3056 cpos, new, len, ext_flags,
3057 context->meta_ac, &context->dealloc);
3058 if (ret)
3059 mlog_errno(ret);
3060out:
3061 return ret;
3062}
3063
3064static int ocfs2_cow_sync_writeback(struct super_block *sb,
3065 struct ocfs2_cow_context *context,
3066 u32 cpos, u32 num_clusters)
3067{
3068 int ret = 0;
3069 loff_t offset, end, map_end;
3070 pgoff_t page_index;
3071 struct page *page;
3072
3073 if (ocfs2_should_order_data(context->inode))
3074 return 0;
3075
3076 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
3077 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
3078
3079 ret = filemap_fdatawrite_range(context->inode->i_mapping,
3080 offset, end - 1);
3081 if (ret < 0) {
3082 mlog_errno(ret);
3083 return ret;
3084 }
3085
3086 while (offset < end) {
3087 page_index = offset >> PAGE_CACHE_SHIFT;
3088 map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
3089 if (map_end > end)
3090 map_end = end;
3091
3092 page = grab_cache_page(context->inode->i_mapping, page_index);
3093 BUG_ON(!page);
3094
3095 wait_on_page_writeback(page);
3096 if (PageError(page)) {
3097 ret = -EIO;
3098 mlog_errno(ret);
3099 } else
3100 mark_page_accessed(page);
3101
3102 unlock_page(page);
3103 page_cache_release(page);
3104 page = NULL;
3105 offset = map_end;
3106 if (ret)
3107 break;
3108 }
3109
3110 return ret;
3111}
3112
3113static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
3114 u32 v_cluster, u32 *p_cluster,
3115 u32 *num_clusters,
3116 unsigned int *extent_flags)
3117{
3118 return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
3119 num_clusters, extent_flags);
3120}
3121
3122static int ocfs2_make_clusters_writable(struct super_block *sb,
3123 struct ocfs2_cow_context *context,
3124 u32 cpos, u32 p_cluster,
3125 u32 num_clusters, unsigned int e_flags)
3126{
3127 int ret, delete, index, credits = 0;
3128 u32 new_bit, new_len, orig_cpos = cpos, orig_num_clusters = num_clusters;
3129 unsigned int set_len;
3130 struct ocfs2_super *osb = OCFS2_SB(sb);
3131 handle_t *handle;
3132 struct buffer_head *ref_leaf_bh = NULL;
3133 struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
3134 struct ocfs2_refcount_rec rec;
3135
3136 mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n",
3137 cpos, p_cluster, num_clusters, e_flags);
3138
3139 ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
3140 &context->data_et,
3141 ref_ci,
3142 context->ref_root_bh,
3143 &context->meta_ac,
3144 &context->data_ac, &credits);
3145 if (ret) {
3146 mlog_errno(ret);
3147 return ret;
3148 }
3149
3150 if (context->post_refcount)
3151 credits += context->post_refcount->credits;
3152
3153 credits += context->extra_credits;
3154 handle = ocfs2_start_trans(osb, credits);
3155 if (IS_ERR(handle)) {
3156 ret = PTR_ERR(handle);
3157 mlog_errno(ret);
3158 goto out;
3159 }
3160
3161 while (num_clusters) {
3162 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
3163 p_cluster, num_clusters,
3164 &rec, &index, &ref_leaf_bh);
3165 if (ret) {
3166 mlog_errno(ret);
3167 goto out_commit;
3168 }
3169
3170 BUG_ON(!rec.r_refcount);
3171 set_len = min((u64)p_cluster + num_clusters,
3172 le64_to_cpu(rec.r_cpos) +
3173 le32_to_cpu(rec.r_clusters)) - p_cluster;
3174
3175 /*
3176 * There are many different situations here.
3177 * 1. If refcount == 1, remove the flag and don't CoW.
3178 * 2. If refcount > 1, allocate clusters.
3179 * We may not be able to allocate the whole r_len at once, so
3180 * continue until we reach num_clusters.
3181 */
3182 if (le32_to_cpu(rec.r_refcount) == 1) {
3183 delete = 0;
3184 ret = ocfs2_clear_ext_refcount(handle,
3185 &context->data_et,
3186 cpos, p_cluster,
3187 set_len, e_flags,
3188 context->meta_ac,
3189 &context->dealloc);
3190 if (ret) {
3191 mlog_errno(ret);
3192 goto out_commit;
3193 }
3194 } else {
3195 delete = 1;
3196
3197 ret = __ocfs2_claim_clusters(osb, handle,
3198 context->data_ac,
3199 1, set_len,
3200 &new_bit, &new_len);
3201 if (ret) {
3202 mlog_errno(ret);
3203 goto out_commit;
3204 }
3205
3206 ret = ocfs2_replace_clusters(handle, context,
3207 cpos, p_cluster, new_bit,
3208 new_len, e_flags);
3209 if (ret) {
3210 mlog_errno(ret);
3211 goto out_commit;
3212 }
3213 set_len = new_len;
3214 }
3215
3216 ret = __ocfs2_decrease_refcount(handle, ref_ci,
3217 context->ref_root_bh,
3218 p_cluster, set_len,
3219 context->meta_ac,
3220 &context->dealloc, delete);
3221 if (ret) {
3222 mlog_errno(ret);
3223 goto out_commit;
3224 }
3225
3226 cpos += set_len;
3227 p_cluster += set_len;
3228 num_clusters -= set_len;
3229 brelse(ref_leaf_bh);
3230 ref_leaf_bh = NULL;
3231 }
3232
3233 /* handle any post_cow action. */
3234 if (context->post_refcount && context->post_refcount->func) {
3235 ret = context->post_refcount->func(context->inode, handle,
3236 context->post_refcount->para);
3237 if (ret) {
3238 mlog_errno(ret);
3239 goto out_commit;
3240 }
3241 }
3242
3243 /*
3244 * Here we should write the new page out first if we are
3245 * in write-back mode. The loop above consumed cpos and
* num_clusters, so use the saved original values.
3246 */
3247 if (context->get_clusters == ocfs2_di_get_clusters) {
3248 ret = ocfs2_cow_sync_writeback(sb, context, orig_cpos, orig_num_clusters);
3249 if (ret)
3250 mlog_errno(ret);
3251 }
3252
3253out_commit:
3254 ocfs2_commit_trans(osb, handle);
3255
3256out:
3257 if (context->data_ac) {
3258 ocfs2_free_alloc_context(context->data_ac);
3259 context->data_ac = NULL;
3260 }
3261 if (context->meta_ac) {
3262 ocfs2_free_alloc_context(context->meta_ac);
3263 context->meta_ac = NULL;
3264 }
3265 brelse(ref_leaf_bh);
3266
3267 return ret;
3268}
3269
3270static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3271{
3272 int ret = 0;
3273 struct inode *inode = context->inode;
3274 u32 cow_start = context->cow_start, cow_len = context->cow_len;
3275 u32 p_cluster, num_clusters;
3276 unsigned int ext_flags;
3277 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3278
3279 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
3280 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
3281 "tree, but the feature bit is not set in the "
3282 "super block.", inode->i_ino);
3283 return -EROFS;
3284 }
3285
3286 ocfs2_init_dealloc_ctxt(&context->dealloc);
3287
3288 while (cow_len) {
3289 ret = context->get_clusters(context, cow_start, &p_cluster,
3290 &num_clusters, &ext_flags);
3291 if (ret) {
3292 mlog_errno(ret);
3293 break;
3294 }
3295
3296 BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
3297
3298 if (cow_len < num_clusters)
3299 num_clusters = cow_len;
3300
3301 ret = ocfs2_make_clusters_writable(inode->i_sb, context,
3302 cow_start, p_cluster,
3303 num_clusters, ext_flags);
3304 if (ret) {
3305 mlog_errno(ret);
3306 break;
3307 }
3308
3309 cow_len -= num_clusters;
3310 cow_start += num_clusters;
3311 }
3312
3313 if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
3314 ocfs2_schedule_truncate_log_flush(osb, 1);
3315 ocfs2_run_deallocs(osb, &context->dealloc);
3316 }
3317
3318 return ret;
3319}
3320
3321/*
3322 * Starting at cpos, try to CoW write_len clusters. Don't CoW
3323 * past max_cpos. This will stop when it runs into a hole or an
3324 * unrefcounted extent.
3325 */
3326static int ocfs2_refcount_cow_hunk(struct inode *inode,
3327 struct buffer_head *di_bh,
3328 u32 cpos, u32 write_len, u32 max_cpos)
3329{
3330 int ret;
3331 u32 cow_start = 0, cow_len = 0;
3332 struct ocfs2_inode_info *oi = OCFS2_I(inode);
3333 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3334 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3335 struct buffer_head *ref_root_bh = NULL;
3336 struct ocfs2_refcount_tree *ref_tree;
3337 struct ocfs2_cow_context *context = NULL;
3338
3339 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3340
3341 ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
3342 cpos, write_len, max_cpos,
3343 &cow_start, &cow_len);
3344 if (ret) {
3345 mlog_errno(ret);
3346 goto out;
3347 }
3348
3349 mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
3350 "cow_len %u\n", inode->i_ino,
3351 cpos, write_len, cow_start, cow_len);
3352
3353 BUG_ON(cow_len == 0);
3354
3355 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3356 if (!context) {
3357 ret = -ENOMEM;
3358 mlog_errno(ret);
3359 goto out;
3360 }
3361
3362 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
3363 1, &ref_tree, &ref_root_bh);
3364 if (ret) {
3365 mlog_errno(ret);
3366 goto out;
3367 }
3368
3369 context->inode = inode;
3370 context->cow_start = cow_start;
3371 context->cow_len = cow_len;
3372 context->ref_tree = ref_tree;
3373 context->ref_root_bh = ref_root_bh;
3374 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
3375 context->get_clusters = ocfs2_di_get_clusters;
3376
3377 ocfs2_init_dinode_extent_tree(&context->data_et,
3378 INODE_CACHE(inode), di_bh);
3379
3380 ret = ocfs2_replace_cow(context);
3381 if (ret)
3382 mlog_errno(ret);
3383
3384 /*
3385 * Truncate the extent map here: whether or not we hit an error
3386 * during the operation, we can no longer trust the cached extent
3387 * map.
3388 */
3389 ocfs2_extent_map_trunc(inode, cow_start);
3390
3391 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3392 brelse(ref_root_bh);
3393out:
3394 kfree(context);
3395 return ret;
3396}
3397
3398/*
3399 * CoW any and all clusters between cpos and cpos+write_len.
3400 * Don't CoW past max_cpos. If this returns successfully, all
3401 * clusters between cpos and cpos+write_len are safe to modify.
3402 */
3403int ocfs2_refcount_cow(struct inode *inode,
3404 struct buffer_head *di_bh,
3405 u32 cpos, u32 write_len, u32 max_cpos)
3406{
3407 int ret = 0;
3408 u32 p_cluster, num_clusters;
3409 unsigned int ext_flags;
3410
3411 while (write_len) {
3412 ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3413 &num_clusters, &ext_flags);
3414 if (ret) {
3415 mlog_errno(ret);
3416 break;
3417 }
3418
3419 if (write_len < num_clusters)
3420 num_clusters = write_len;
3421
3422 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3423 ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
3424 num_clusters, max_cpos);
3425 if (ret) {
3426 mlog_errno(ret);
3427 break;
3428 }
3429 }
3430
3431 write_len -= num_clusters;
3432 cpos += num_clusters;
3433 }
3434
3435 return ret;
3436}
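/*
 * Sketch of the expected calling convention (editorial; the real call
 * sites live in the file write paths, not in this file):
 *
 *	ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
 *	if (ret)
 *		mlog_errno(ret);
 *
 * A caller with no natural CoW boundary passes UINT_MAX as max_cpos and
 * relies on the hole/non-refcounted checks to stop the CoW early.
 */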
3437
3438static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
3439 u32 v_cluster, u32 *p_cluster,
3440 u32 *num_clusters,
3441 unsigned int *extent_flags)
3442{
3443 struct inode *inode = context->inode;
3444 struct ocfs2_xattr_value_root *xv = context->cow_object;
3445
3446 return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
3447 num_clusters, &xv->xr_list,
3448 extent_flags);
3449}
3450
3451/*
3452 * Given a xattr value root, calculate the most meta/credits we need for
3453 * refcount tree change if we truncate it to 0.
3454 */
3455int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
3456 struct ocfs2_caching_info *ref_ci,
3457 struct buffer_head *ref_root_bh,
3458 struct ocfs2_xattr_value_root *xv,
3459 int *meta_add, int *credits)
3460{
3461 int ret = 0, index, ref_blocks = 0;
3462 u32 p_cluster, num_clusters;
3463 u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
3464 struct ocfs2_refcount_block *rb;
3465 struct ocfs2_refcount_rec rec;
3466 struct buffer_head *ref_leaf_bh = NULL;
3467
3468 while (cpos < clusters) {
3469 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
3470 &num_clusters, &xv->xr_list,
3471 NULL);
3472 if (ret) {
3473 mlog_errno(ret);
3474 goto out;
3475 }
3476
3477 cpos += num_clusters;
3478
3479 while (num_clusters) {
3480 ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
3481 p_cluster, num_clusters,
3482 &rec, &index,
3483 &ref_leaf_bh);
3484 if (ret) {
3485 mlog_errno(ret);
3486 goto out;
3487 }
3488
3489 BUG_ON(!rec.r_refcount);
3490
3491 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
3492
3493 /*
3494 * We don't really know whether the other clusters are
3495 * in this refcount block or not, so just assume the
3496 * worst case: all the clusters are in this block and
3497 * each one will split a refcount rec, so in total we
3498 * need clusters * 2 new refcount recs.
3499 */
3500 if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
3501 le16_to_cpu(rb->rf_records.rl_count))
3502 ref_blocks++;
3503
3504 *credits += 1;
3505 brelse(ref_leaf_bh);
3506 ref_leaf_bh = NULL;
3507
3508 if (num_clusters <= le32_to_cpu(rec.r_clusters))
3509 break;
3510 else
3511 num_clusters -= le32_to_cpu(rec.r_clusters);
3512 p_cluster += num_clusters;
3513 }
3514 }
3515
3516 *meta_add += ref_blocks;
3517 if (!ref_blocks)
3518 goto out;
3519
3520 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
3521 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
3522 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
3523 else {
3524 struct ocfs2_extent_tree et;
3525
3526 ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
3527 *credits += ocfs2_calc_extend_credits(inode->i_sb,
3528 et.et_root_el,
3529 ref_blocks);
3530 }
3531
3532out:
3533 brelse(ref_leaf_bh);
3534 return ret;
3535}
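/*
 * Worst-case sizing example (editorial, made-up numbers): for an xattr
 * value of 8 clusters whose refcount recs all sit in one leaf with
 * rl_used = 120 and rl_count = 126, the check above sees
 * 120 + 8 * 2 = 136 > 126, so one extra refcount block is reserved and
 * the credits grow by one per leaf visited plus the cost of extending
 * the refcount tree by that block.
 */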
3536
3537/*
3538 * Do CoW for xattr.
3539 */
3540int ocfs2_refcount_cow_xattr(struct inode *inode,
3541 struct ocfs2_dinode *di,
3542 struct ocfs2_xattr_value_buf *vb,
3543 struct ocfs2_refcount_tree *ref_tree,
3544 struct buffer_head *ref_root_bh,
3545 u32 cpos, u32 write_len,
3546 struct ocfs2_post_refcount *post)
3547{
3548 int ret;
3549 struct ocfs2_xattr_value_root *xv = vb->vb_xv;
3550 struct ocfs2_inode_info *oi = OCFS2_I(inode);
3551 struct ocfs2_cow_context *context = NULL;
3552 u32 cow_start, cow_len;
3553
3554 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3555
3556 ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
3557 cpos, write_len, UINT_MAX,
3558 &cow_start, &cow_len);
3559 if (ret) {
3560 mlog_errno(ret);
3561 goto out;
3562 }
3563
3564 BUG_ON(cow_len == 0);
3565
3566 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3567 if (!context) {
3568 ret = -ENOMEM;
3569 mlog_errno(ret);
3570 goto out;
3571 }
3572
3573 context->inode = inode;
3574 context->cow_start = cow_start;
3575 context->cow_len = cow_len;
3576 context->ref_tree = ref_tree;
3577 context->ref_root_bh = ref_root_bh;
3578 context->cow_object = xv;
3579
3580 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
3581 /* We need the extra credits for duplicate_clusters by jbd. */
3582 context->extra_credits =
3583 ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
3584 context->get_clusters = ocfs2_xattr_value_get_clusters;
3585 context->post_refcount = post;
3586
3587 ocfs2_init_xattr_value_extent_tree(&context->data_et,
3588 INODE_CACHE(inode), vb);
3589
3590 ret = ocfs2_replace_cow(context);
3591 if (ret)
3592 mlog_errno(ret);
3593
3594out:
3595 kfree(context);
3596 return ret;
3597}
3598
3599/*
3600 * Insert a new extent into the refcount tree and mark an extent rec
3601 * as refcounted in the dinode tree.
3602 */
3603int ocfs2_add_refcount_flag(struct inode *inode,
3604 struct ocfs2_extent_tree *data_et,
3605 struct ocfs2_caching_info *ref_ci,
3606 struct buffer_head *ref_root_bh,
3607 u32 cpos, u32 p_cluster, u32 num_clusters,
3608 struct ocfs2_cached_dealloc_ctxt *dealloc,
3609 struct ocfs2_post_refcount *post)
3610{
3611 int ret;
3612 handle_t *handle;
3613 int credits = 1, ref_blocks = 0;
3614 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3615 struct ocfs2_alloc_context *meta_ac = NULL;
3616
3617 ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
3618 ref_ci, ref_root_bh,
3619 p_cluster, num_clusters,
3620 &ref_blocks, &credits);
3621 if (ret) {
3622 mlog_errno(ret);
3623 goto out;
3624 }
3625
3626 mlog(0, "reserve new metadata %d, credits = %d\n",
3627 ref_blocks, credits);
3628
3629 if (ref_blocks) {
3630 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
3631 ref_blocks, &meta_ac);
3632 if (ret) {
3633 mlog_errno(ret);
3634 goto out;
3635 }
3636 }
3637
3638 if (post)
3639 credits += post->credits;
3640
3641 handle = ocfs2_start_trans(osb, credits);
3642 if (IS_ERR(handle)) {
3643 ret = PTR_ERR(handle);
3644 mlog_errno(ret);
3645 goto out;
3646 }
3647
3648 ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
3649 cpos, num_clusters, p_cluster,
3650 meta_ac, dealloc);
3651 if (ret) {
3652 mlog_errno(ret);
3653 goto out_commit;
3654 }
3655
3656 ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3657 p_cluster, num_clusters, 0,
3658 meta_ac, dealloc);
3659 if (ret) {
3660 mlog_errno(ret);
3661 goto out_commit;
3662 }
3663
3664 if (post && post->func) {
3665 ret = post->func(inode, handle, post->para);
3666 if (ret)
3667 mlog_errno(ret);
3668 }
3669
3670out_commit:
3671 ocfs2_commit_trans(osb, handle);
3672out:
3673 if (meta_ac)
3674 ocfs2_free_alloc_context(meta_ac);
3675 return ret;
3676}
3677
3678static int ocfs2_change_ctime(struct inode *inode,
3679 struct buffer_head *di_bh)
3680{
3681 int ret;
3682 handle_t *handle;
3683 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3684
3685 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
3686 OCFS2_INODE_UPDATE_CREDITS);
3687 if (IS_ERR(handle)) {
3688 ret = PTR_ERR(handle);
3689 mlog_errno(ret);
3690 goto out;
3691 }
3692
3693 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
3694 OCFS2_JOURNAL_ACCESS_WRITE);
3695 if (ret) {
3696 mlog_errno(ret);
3697 goto out_commit;
3698 }
3699
3700 inode->i_ctime = CURRENT_TIME;
3701 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
3702 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
3703
3704 ocfs2_journal_dirty(handle, di_bh);
3705
3706out_commit:
3707 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
3708out:
3709 return ret;
3710}
3711
3712static int ocfs2_attach_refcount_tree(struct inode *inode,
3713 struct buffer_head *di_bh)
3714{
3715 int ret, data_changed = 0;
3716 struct buffer_head *ref_root_bh = NULL;
3717 struct ocfs2_inode_info *oi = OCFS2_I(inode);
3718 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3719 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3720 struct ocfs2_refcount_tree *ref_tree;
3721 unsigned int ext_flags;
3722 loff_t size;
3723 u32 cpos, num_clusters, clusters, p_cluster;
3724 struct ocfs2_cached_dealloc_ctxt dealloc;
3725 struct ocfs2_extent_tree di_et;
3726
3727 ocfs2_init_dealloc_ctxt(&dealloc);
3728
3729 if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) {
3730 ret = ocfs2_create_refcount_tree(inode, di_bh);
3731 if (ret) {
3732 mlog_errno(ret);
3733 goto out;
3734 }
3735 }
3736
3737 BUG_ON(!di->i_refcount_loc);
3738 ret = ocfs2_lock_refcount_tree(osb,
3739 le64_to_cpu(di->i_refcount_loc), 1,
3740 &ref_tree, &ref_root_bh);
3741 if (ret) {
3742 mlog_errno(ret);
3743 goto out;
3744 }
3745
3746 ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
3747
3748 size = i_size_read(inode);
3749 clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
3750
3751 cpos = 0;
3752 while (cpos < clusters) {
3753 ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3754 &num_clusters, &ext_flags);
3755
3756 if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
3757 ret = ocfs2_add_refcount_flag(inode, &di_et,
3758 &ref_tree->rf_ci,
3759 ref_root_bh, cpos,
3760 p_cluster, num_clusters,
3761 &dealloc, NULL);
3762 if (ret) {
3763 mlog_errno(ret);
3764 goto unlock;
3765 }
3766
3767 data_changed = 1;
3768 }
3769 cpos += num_clusters;
3770 }
3771
3772 if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
3773 ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
3774 &ref_tree->rf_ci,
3775 ref_root_bh,
3776 &dealloc);
3777 if (ret) {
3778 mlog_errno(ret);
3779 goto unlock;
3780 }
3781 }
3782
3783 if (data_changed) {
3784 ret = ocfs2_change_ctime(inode, di_bh);
3785 if (ret)
3786 mlog_errno(ret);
3787 }
3788
3789unlock:
3790 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3791 brelse(ref_root_bh);
3792
3793 if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
3794 ocfs2_schedule_truncate_log_flush(osb, 1);
3795 ocfs2_run_deallocs(osb, &dealloc);
3796 }
3797out:
3798 /*
3799 * Empty the extent map so that we may get the right extent
3800 * record from the disk.
3801 */
3802 ocfs2_extent_map_trunc(inode, 0);
3803
3804 return ret;
3805}
3806
3807static int ocfs2_add_refcounted_extent(struct inode *inode,
3808 struct ocfs2_extent_tree *et,
3809 struct ocfs2_caching_info *ref_ci,
3810 struct buffer_head *ref_root_bh,
3811 u32 cpos, u32 p_cluster, u32 num_clusters,
3812 unsigned int ext_flags,
3813 struct ocfs2_cached_dealloc_ctxt *dealloc)
3814{
3815 int ret;
3816 handle_t *handle;
3817 int credits = 0;
3818 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3819 struct ocfs2_alloc_context *meta_ac = NULL;
3820
3821 ret = ocfs2_lock_refcount_allocators(inode->i_sb,
3822 p_cluster, num_clusters,
3823 et, ref_ci,
3824 ref_root_bh, &meta_ac,
3825 NULL, &credits);
3826 if (ret) {
3827 mlog_errno(ret);
3828 goto out;
3829 }
3830
3831 handle = ocfs2_start_trans(osb, credits);
3832 if (IS_ERR(handle)) {
3833 ret = PTR_ERR(handle);
3834 mlog_errno(ret);
3835 goto out;
3836 }
3837
3838 ret = ocfs2_insert_extent(handle, et, cpos,
3839 cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
3840 p_cluster)),
3841 num_clusters, ext_flags, meta_ac);
3842 if (ret) {
3843 mlog_errno(ret);
3844 goto out_commit;
3845 }
3846
3847 ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
3848 p_cluster, num_clusters,
3849 meta_ac, dealloc);
3850 if (ret)
3851 mlog_errno(ret);
3852
3853out_commit:
3854 ocfs2_commit_trans(osb, handle);
3855out:
3856 if (meta_ac)
3857 ocfs2_free_alloc_context(meta_ac);
3858 return ret;
3859}
3860
3861static int ocfs2_duplicate_extent_list(struct inode *s_inode,
3862 struct inode *t_inode,
3863 struct buffer_head *t_bh,
3864 struct ocfs2_caching_info *ref_ci,
3865 struct buffer_head *ref_root_bh,
3866 struct ocfs2_cached_dealloc_ctxt *dealloc)
3867{
3868 int ret = 0;
3869 u32 p_cluster, num_clusters, clusters, cpos;
3870 loff_t size;
3871 unsigned int ext_flags;
3872 struct ocfs2_extent_tree et;
3873
3874 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
3875
3876 size = i_size_read(s_inode);
3877 clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
3878
3879 cpos = 0;
3880 while (cpos < clusters) {
3881 ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
3882 &num_clusters, &ext_flags);
3883
3884 if (p_cluster) {
3885 ret = ocfs2_add_refcounted_extent(t_inode, &et,
3886 ref_ci, ref_root_bh,
3887 cpos, p_cluster,
3888 num_clusters,
3889 ext_flags,
3890 dealloc);
3891 if (ret) {
3892 mlog_errno(ret);
3893 goto out;
3894 }
3895 }
3896
3897 cpos += num_clusters;
3898 }
3899
3900out:
3901 return ret;
3902}
3903
3904/*
3905 * Change the new file's attributes to match the source.
3906 *
3907 * Reflink creates a snapshot of a file; that means the attributes
3908 * must be identical except for three things: nlink, ino, and ctime.
3909 */
3910static int ocfs2_complete_reflink(struct inode *s_inode,
3911 struct buffer_head *s_bh,
3912 struct inode *t_inode,
3913 struct buffer_head *t_bh,
3914 bool preserve)
3915{
3916 int ret;
3917 handle_t *handle;
3918 struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
3919 struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
3920 loff_t size = i_size_read(s_inode);
3921
3922 handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
3923 OCFS2_INODE_UPDATE_CREDITS);
3924 if (IS_ERR(handle)) {
3925 ret = PTR_ERR(handle);
3926 mlog_errno(ret);
3927 return ret;
3928 }
3929
3930 ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
3931 OCFS2_JOURNAL_ACCESS_WRITE);
3932 if (ret) {
3933 mlog_errno(ret);
3934 goto out_commit;
3935 }
3936
3937 spin_lock(&OCFS2_I(t_inode)->ip_lock);
3938 OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
3939 OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
3940 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
3941 spin_unlock(&OCFS2_I(t_inode)->ip_lock);
3942 i_size_write(t_inode, size);
3943
3944 di->i_xattr_inline_size = s_di->i_xattr_inline_size;
3945 di->i_clusters = s_di->i_clusters;
3946 di->i_size = s_di->i_size;
3947 di->i_dyn_features = s_di->i_dyn_features;
3948 di->i_attr = s_di->i_attr;
3949
3950 if (preserve) {
3951 di->i_uid = s_di->i_uid;
3952 di->i_gid = s_di->i_gid;
3953 di->i_mode = s_di->i_mode;
3954
3955 /*
3956 * Update the times. We want mtime to appear identical
3957 * to the source, while ctime records the moment of the
3958 * reflink.
3959 */
3960 t_inode->i_ctime = CURRENT_TIME;
3961
3962 di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
3963 di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
3964
3965 t_inode->i_mtime = s_inode->i_mtime;
3966 di->i_mtime = s_di->i_mtime;
3967 di->i_mtime_nsec = s_di->i_mtime_nsec;
3968 }
3969
3970 ocfs2_journal_dirty(handle, t_bh);
3971
3972out_commit:
3973 ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
3974 return ret;
3975}
3976
3977static int ocfs2_create_reflink_node(struct inode *s_inode,
3978 struct buffer_head *s_bh,
3979 struct inode *t_inode,
3980 struct buffer_head *t_bh,
3981 bool preserve)
3982{
3983 int ret;
3984 struct buffer_head *ref_root_bh = NULL;
3985 struct ocfs2_cached_dealloc_ctxt dealloc;
3986 struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
3987 struct ocfs2_refcount_block *rb;
3988 struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
3989 struct ocfs2_refcount_tree *ref_tree;
3990
3991 ocfs2_init_dealloc_ctxt(&dealloc);
3992
3993 ret = ocfs2_set_refcount_tree(t_inode, t_bh,
3994 le64_to_cpu(di->i_refcount_loc));
3995 if (ret) {
3996 mlog_errno(ret);
3997 goto out;
3998 }
3999
4000 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
4001 1, &ref_tree, &ref_root_bh);
4002 if (ret) {
4003 mlog_errno(ret);
4004 goto out;
4005 }
4006 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
4007
4008 ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
4009 &ref_tree->rf_ci, ref_root_bh,
4010 &dealloc);
4011 if (ret) {
4012 mlog_errno(ret);
4013 goto out_unlock_refcount;
4014 }
4015
4016 ret = ocfs2_complete_reflink(s_inode, s_bh, t_inode, t_bh, preserve);
4017 if (ret)
4018 mlog_errno(ret);
4019
4020out_unlock_refcount:
4021 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
4022 brelse(ref_root_bh);
4023out:
4024 if (ocfs2_dealloc_has_cluster(&dealloc)) {
4025 ocfs2_schedule_truncate_log_flush(osb, 1);
4026 ocfs2_run_deallocs(osb, &dealloc);
4027 }
4028
4029 return ret;
4030}
4031
4032static int __ocfs2_reflink(struct dentry *old_dentry,
4033 struct buffer_head *old_bh,
4034 struct inode *new_inode,
4035 bool preserve)
4036{
4037 int ret;
4038 struct inode *inode = old_dentry->d_inode;
4039 struct buffer_head *new_bh = NULL;
4040
4041 ret = filemap_fdatawrite(inode->i_mapping);
4042 if (ret) {
4043 mlog_errno(ret);
4044 goto out;
4045 }
4046
4047 ret = ocfs2_attach_refcount_tree(inode, old_bh);
4048 if (ret) {
4049 mlog_errno(ret);
4050 goto out;
4051 }
4052
4053 mutex_lock(&new_inode->i_mutex);
4054 ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
4055 if (ret) {
4056 mlog_errno(ret);
4057 goto out_unlock;
4058 }
4059
4060 ret = ocfs2_create_reflink_node(inode, old_bh,
4061 new_inode, new_bh, preserve);
4062 if (ret) {
4063 mlog_errno(ret);
4064 goto inode_unlock;
4065 }
4066
4067 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
4068 ret = ocfs2_reflink_xattrs(inode, old_bh,
4069 new_inode, new_bh,
4070 preserve);
4071 if (ret)
4072 mlog_errno(ret);
4073 }
4074inode_unlock:
4075 ocfs2_inode_unlock(new_inode, 1);
4076 brelse(new_bh);
4077out_unlock:
4078 mutex_unlock(&new_inode->i_mutex);
4079out:
4080 if (!ret) {
4081 ret = filemap_fdatawait(inode->i_mapping);
4082 if (ret)
4083 mlog_errno(ret);
4084 }
4085 return ret;
4086}
4087
4088static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4089 struct dentry *new_dentry, bool preserve)
4090{
4091 int error;
4092 struct inode *inode = old_dentry->d_inode;
4093 struct buffer_head *old_bh = NULL;
4094 struct inode *new_orphan_inode = NULL;
4095
4096 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4097 return -EOPNOTSUPP;
4098
4099 error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
4100 &new_orphan_inode);
4101 if (error) {
4102 mlog_errno(error);
4103 goto out;
4104 }
4105
4106 error = ocfs2_inode_lock(inode, &old_bh, 1);
4107 if (error) {
4108 mlog_errno(error);
4109 goto out;
4110 }
4111
4112 down_write(&OCFS2_I(inode)->ip_xattr_sem);
4113 down_write(&OCFS2_I(inode)->ip_alloc_sem);
4114 error = __ocfs2_reflink(old_dentry, old_bh,
4115 new_orphan_inode, preserve);
4116 up_write(&OCFS2_I(inode)->ip_alloc_sem);
4117 up_write(&OCFS2_I(inode)->ip_xattr_sem);
4118
4119 ocfs2_inode_unlock(inode, 1);
4120 brelse(old_bh);
4121
4122 if (error) {
4123 mlog_errno(error);
4124 goto out;
4125 }
4126
4127 /* If attributes aren't preserved, re-initialize the security and ACLs. */
4128 if (!preserve) {
4129 error = ocfs2_init_security_and_acl(dir, new_orphan_inode);
4130 if (error)
4131 mlog_errno(error);
4132 }
4133out:
4134 if (!error) {
4135 error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
4136 new_dentry);
4137 if (error)
4138 mlog_errno(error);
4139 }
4140
4141 if (new_orphan_inode) {
4142 /*
4143 * We need to open_unlock the inode no matter whether we
4144 * succeed or not, so that other nodes can delete it later.
4145 */
4146 ocfs2_open_unlock(new_orphan_inode);
4147 if (error)
4148 iput(new_orphan_inode);
4149 }
4150
4151 return error;
4152}
4153
4154/*
4155 * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
4156 * sys_reflink(). This will go away when vfs_reflink() exists in
4157 * fs/namei.c.
4158 */
4159
4160/* copied from may_create in VFS. */
4161static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
4162{
4163 if (child->d_inode)
4164 return -EEXIST;
4165 if (IS_DEADDIR(dir))
4166 return -ENOENT;
4167 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
4168}
4169
4170/* copied from user_path_parent. */
4171static int ocfs2_user_path_parent(const char __user *path,
4172 struct nameidata *nd, char **name)
4173{
4174 char *s = getname(path);
4175 int error;
4176
4177 if (IS_ERR(s))
4178 return PTR_ERR(s);
4179
4180 error = path_lookup(s, LOOKUP_PARENT, nd);
4181 if (error)
4182 putname(s);
4183 else
4184 *name = s;
4185
4186 return error;
4187}
4188
4189/**
4190 * ocfs2_vfs_reflink - Create a reference-counted link
4191 *
4192 * @old_dentry: source dentry + inode
4193 * @dir: directory to create the target
4194 * @new_dentry: target dentry
4195 * @preserve: if true, preserve all file attributes
4196 */
4197int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
4198 struct dentry *new_dentry, bool preserve)
4199{
4200 struct inode *inode = old_dentry->d_inode;
4201 int error;
4202
4203 if (!inode)
4204 return -ENOENT;
4205
4206 error = ocfs2_may_create(dir, new_dentry);
4207 if (error)
4208 return error;
4209
4210 if (dir->i_sb != inode->i_sb)
4211 return -EXDEV;
4212
4213 /*
4214 * A reflink to an append-only or immutable file cannot be created.
4215 */
4216 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4217 return -EPERM;
4218
4219 /* Only regular files can be reflinked. */
4220 if (!S_ISREG(inode->i_mode))
4221 return -EPERM;
4222
4223 /*
4224 * If the caller wants to preserve ownership, they must have the
4225 * rights to do so.
4226 */
4227 if (preserve) {
4228 if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN))
4229 return -EPERM;
4230 if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
4231 return -EPERM;
4232 }
4233
4234 /*
4235 * If the caller is modifying any aspect of the attributes, they
4236 * are not creating a snapshot. They need read permission on the
4237 * file.
4238 */
4239 if (!preserve) {
4240 error = inode_permission(inode, MAY_READ);
4241 if (error)
4242 return error;
4243 }
4244
4245 mutex_lock(&inode->i_mutex);
4246 vfs_dq_init(dir);
4247 error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
4248 mutex_unlock(&inode->i_mutex);
4249 if (!error)
4250 fsnotify_create(dir, new_dentry);
4251 return error;
4252}
4253/*
4254 * Most of this code is copied from sys_linkat.
4255 */
4256int ocfs2_reflink_ioctl(struct inode *inode,
4257 const char __user *oldname,
4258 const char __user *newname,
4259 bool preserve)
4260{
4261 struct dentry *new_dentry;
4262 struct nameidata nd;
4263 struct path old_path;
4264 int error;
4265 char *to = NULL;
4266
4267 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4268 return -EOPNOTSUPP;
4269
4270 error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
4271 if (error) {
4272 mlog_errno(error);
4273 return error;
4274 }
4275
4276 error = ocfs2_user_path_parent(newname, &nd, &to);
4277 if (error) {
4278 mlog_errno(error);
4279 goto out;
4280 }
4281
4282 error = -EXDEV;
4283 if (old_path.mnt != nd.path.mnt)
4284 goto out_release;
4285 new_dentry = lookup_create(&nd, 0);
4286 error = PTR_ERR(new_dentry);
4287 if (IS_ERR(new_dentry)) {
4288 mlog_errno(error);
4289 goto out_unlock;
4290 }
4291
4292 error = mnt_want_write(nd.path.mnt);
4293 if (error) {
4294 mlog_errno(error);
4295 goto out_dput;
4296 }
4297
4298 error = ocfs2_vfs_reflink(old_path.dentry,
4299 nd.path.dentry->d_inode,
4300 new_dentry, preserve);
4301 mnt_drop_write(nd.path.mnt);
4302out_dput:
4303 dput(new_dentry);
4304out_unlock:
4305 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
4306out_release:
4307 path_put(&nd.path);
4308 putname(to);
4309out:
4310 path_put(&old_path);
4311
4312 return error;
4313}
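/*
 * For reference, a hedged sketch of how userspace might drive this
 * ioctl (the struct layout and request number are defined in
 * ocfs2_ioctl.h, not here, so treat the names below as illustrative):
 *
 *	struct reflink_arguments args = {
 *		.old_path = (__u64)(unsigned long)"/mnt/ocfs2/src",
 *		.new_path = (__u64)(unsigned long)"/mnt/ocfs2/snap",
 *		.preserve = 1,
 *	};
 *	err = ioctl(fd, OCFS2_IOC_REFLINK, &args);
 *
 * With preserve set the target is a snapshot; with it clear, the caller
 * only needs read permission and the new inode gets fresh owner/mode.
 */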
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
new file mode 100644
index 000000000000..c1d19b1d3ecc
--- /dev/null
+++ b/fs/ocfs2/refcounttree.h
@@ -0,0 +1,106 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * refcounttree.h
5 *
6 * Copyright (C) 2009 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License version 2 as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 */
17#ifndef OCFS2_REFCOUNTTREE_H
18#define OCFS2_REFCOUNTTREE_H
19
20struct ocfs2_refcount_tree {
21 struct rb_node rf_node;
22 u64 rf_blkno;
23 u32 rf_generation;
24 struct rw_semaphore rf_sem;
25 struct ocfs2_lock_res rf_lockres;
26 struct kref rf_getcnt;
27 int rf_removed;
28
29 /* the following 4 fields are used by caching_info. */
30 struct ocfs2_caching_info rf_ci;
31 spinlock_t rf_lock;
32 struct mutex rf_io_mutex;
33 struct super_block *rf_sb;
34};
35
36void ocfs2_purge_refcount_trees(struct ocfs2_super *osb);
37int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, u64 ref_blkno, int rw,
38 struct ocfs2_refcount_tree **tree,
39 struct buffer_head **ref_bh);
40void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
41 struct ocfs2_refcount_tree *tree,
42 int rw);
43
44int ocfs2_decrease_refcount(struct inode *inode,
45 handle_t *handle, u32 cpos, u32 len,
46 struct ocfs2_alloc_context *meta_ac,
47 struct ocfs2_cached_dealloc_ctxt *dealloc,
48 int delete);
49int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
50 struct buffer_head *di_bh,
51 u64 phys_blkno,
52 u32 clusters,
53 int *credits,
54 struct ocfs2_alloc_context **meta_ac);
55int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
56 u32 cpos, u32 write_len, u32 max_cpos);
57
58typedef int (ocfs2_post_refcount_func)(struct inode *inode,
59 handle_t *handle,
60 void *para);
61/*
62 * Some refcount callers need to do more work after we modify the data b-tree
63 * during a refcount operation (including CoW and adding the refcount flag),
64 * and before the transaction completes. They must give us this structure so
65 * that we can do the work within our transaction.
66 *
67 */
68struct ocfs2_post_refcount {
69 int credits; /* credits needed for the journal. */
70 ocfs2_post_refcount_func *func; /* the function to call. */
71 void *para;
72};
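/*
 * Usage sketch (editorial): a caller that must update its own metadata
 * inside our transaction fills this in before calling, for example,
 * ocfs2_refcount_cow_xattr():
 *
 *	struct ocfs2_post_refcount post = {
 *		.credits = 1,		(journal credits the func needs)
 *		.func = my_post_func,	(hypothetical callback)
 *		.para = my_state,
 *	};
 *
 * func is invoked with the still-running handle once the refcount
 * changes have been journaled.
 */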
73
74int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
75 struct ocfs2_caching_info *ref_ci,
76 struct buffer_head *ref_root_bh,
77 struct ocfs2_xattr_value_root *xv,
78 int *meta_add, int *credits);
79int ocfs2_refcount_cow_xattr(struct inode *inode,
80 struct ocfs2_dinode *di,
81 struct ocfs2_xattr_value_buf *vb,
82 struct ocfs2_refcount_tree *ref_tree,
83 struct buffer_head *ref_root_bh,
84 u32 cpos, u32 write_len,
85 struct ocfs2_post_refcount *post);
86int ocfs2_add_refcount_flag(struct inode *inode,
87 struct ocfs2_extent_tree *data_et,
88 struct ocfs2_caching_info *ref_ci,
89 struct buffer_head *ref_root_bh,
90 u32 cpos, u32 p_cluster, u32 num_clusters,
91 struct ocfs2_cached_dealloc_ctxt *dealloc,
92 struct ocfs2_post_refcount *post);
93int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh);
94int ocfs2_try_remove_refcount_tree(struct inode *inode,
95 struct buffer_head *di_bh);
96int ocfs2_increase_refcount(handle_t *handle,
97 struct ocfs2_caching_info *ci,
98 struct buffer_head *ref_root_bh,
99 u64 cpos, u32 len,
100 struct ocfs2_alloc_context *meta_ac,
101 struct ocfs2_cached_dealloc_ctxt *dealloc);
102int ocfs2_reflink_ioctl(struct inode *inode,
103 const char __user *oldname,
104 const char __user *newname,
105 bool preserve);
106#endif /* OCFS2_REFCOUNTTREE_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 424adaa5f900..3c3d673a4d20 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
 		   new_clusters, first_new_cluster);
 
-	ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode),
+				      group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -141,7 +141,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	}
 
 	/* update the inode accordingly. */
-	ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(bm_inode), bm_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -514,7 +514,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 		goto out_unlock;
 	}
 
-	ocfs2_set_new_buffer_uptodate(inode, group_bh);
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), group_bh);
 
 	ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
 	if (ret) {
@@ -536,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	cl = &fe->id2.i_chain;
 	cr = &cl->cl_recs[input->chain];
 
-	ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_gd(handle, INODE_CACHE(main_bm_inode),
+				      group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -552,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 		goto out_commit;
 	}
 
-	ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
+				      main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_commit;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 40661e7824e9..bfbd7e9e949f 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 	 * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
 	 * this is not true, the read of -1 (UINT64_MAX) will fail.
 	 */
-	ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
-				OCFS2_BH_IGNORE_CACHE, NULL);
+	ret = ocfs2_read_blocks(INODE_CACHE(si->si_inode), -1, si->si_blocks,
+				si->si_bh, OCFS2_BH_IGNORE_CACHE, NULL);
 	if (ret == 0) {
 		spin_lock(&osb->osb_lock);
 		ocfs2_update_slot_info(si);
@@ -213,7 +213,7 @@ static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
 		ocfs2_update_disk_slot_old(si, slot_num, &bh);
 	spin_unlock(&osb->osb_lock);
 
-	status = ocfs2_write_block(osb, bh, si->si_inode);
+	status = ocfs2_write_block(osb, bh, INODE_CACHE(si->si_inode));
 	if (status < 0)
 		mlog_errno(status);
 
@@ -404,8 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 		     (unsigned long long)blkno);
 
 		bh = NULL; /* Acquire a fresh bh */
-		status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
-					   OCFS2_BH_IGNORE_CACHE, NULL);
+		status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
+					   1, &bh, OCFS2_BH_IGNORE_CACHE, NULL);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 73a16d4666dc..c30b644d9572 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -310,7 +310,7 @@ int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
 	int rc;
 	struct buffer_head *tmp = *bh;
 
-	rc = ocfs2_read_block(inode, gd_blkno, &tmp,
+	rc = ocfs2_read_block(INODE_CACHE(inode), gd_blkno, &tmp,
 			      ocfs2_validate_group_descriptor);
 	if (rc)
 		goto out;
@@ -352,7 +352,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
 	}
 
 	status = ocfs2_journal_access_gd(handle,
-					 alloc_inode,
+					 INODE_CACHE(alloc_inode),
 					 bg_bh,
 					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
@@ -476,7 +476,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 		mlog_errno(status);
 		goto bail;
 	}
-	ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
+	ocfs2_set_new_buffer_uptodate(INODE_CACHE(alloc_inode), bg_bh);
 
 	status = ocfs2_block_group_fill(handle,
 					alloc_inode,
@@ -491,7 +491,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 
 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 
-	status = ocfs2_journal_access_di(handle, alloc_inode,
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
 					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1033,7 +1033,7 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 
 	status = ocfs2_journal_access_gd(handle,
-					 alloc_inode,
+					 INODE_CACHE(alloc_inode),
 					 group_bh,
 					 journal_type);
 	if (status < 0) {
@@ -1106,7 +1106,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
 
-	status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
+	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+					 prev_bg_bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1121,8 +1122,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		goto out_rollback;
 	}
 
-	status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
-					 OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+					 bg_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_rollback;
@@ -1136,8 +1137,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		goto out_rollback;
 	}
 
-	status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
-					 OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
+					 fe_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_rollback;
@@ -1288,7 +1289,7 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
 
-	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -1461,7 +1462,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	/* Ok, claim our bits now: set the info on dinode, chainlist
 	 * and then the group */
 	status = ocfs2_journal_access_di(handle,
-					 alloc_inode,
+					 INODE_CACHE(alloc_inode),
 					 ac->ac_bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
@@ -1907,8 +1908,8 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 
-	status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
-					 journal_type);
+	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
+					 group_bh, journal_type);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1993,8 +1994,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
-					 OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, INODE_CACHE(alloc_inode),
+					 alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -2151,7 +2152,7 @@ int ocfs2_lock_allocators(struct inode *inode,
 
 	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
 
-	num_free_extents = ocfs2_num_free_extents(osb, inode, et);
+	num_free_extents = ocfs2_num_free_extents(osb, et);
 	if (num_free_extents < 0) {
 		ret = num_free_extents;
 		mlog_errno(ret);
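
[Annotation: every ocfs2_journal_access_*() variant changes the same way in this file: the second parameter becomes the caching context. A sketch of the new shape, consistent with the call sites above (the authoritative prototypes live in fs/ocfs2/journal.h, which is not part of this excerpt):

	/* Sketch: journal-access helpers keyed by caching context rather
	 * than inode.  The trailing type is OCFS2_JOURNAL_ACCESS_CREATE,
	 * _WRITE, or _UNDO, exactly as before. */
	int ocfs2_journal_access_di(handle_t *handle,
				    struct ocfs2_caching_info *ci,
				    struct buffer_head *bh, int type);
	int ocfs2_journal_access_gd(handle_t *handle,
				    struct ocfs2_caching_info *ci,
				    struct buffer_head *bh, int type);
]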
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a3f8871d21fd..4cc3c890a2cd 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -28,7 +28,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
-#include <linux/utsname.h>
 #include <linux/init.h>
 #include <linux/random.h>
 #include <linux/statfs.h>
@@ -69,6 +68,7 @@
 #include "ver.h"
 #include "xattr.h"
 #include "quota.h"
+#include "refcounttree.h"
 
 #include "buffer_head_io.h"
 
@@ -965,7 +965,7 @@ static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
 	return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
 }
 
-static struct quotactl_ops ocfs2_quotactl_ops = {
+static const struct quotactl_ops ocfs2_quotactl_ops = {
 	.quota_on	= ocfs2_quota_on,
 	.quota_off	= ocfs2_quota_off,
 	.quota_sync	= vfs_quota_sync,
@@ -1668,8 +1668,6 @@ static void ocfs2_inode_init_once(void *data)
 	spin_lock_init(&oi->ip_lock);
 	ocfs2_extent_map_init(&oi->vfs_inode);
 	INIT_LIST_HEAD(&oi->ip_io_markers);
-	oi->ip_created_trans = 0;
-	oi->ip_last_trans = 0;
 	oi->ip_dir_start_lookup = 0;
 
 	init_rwsem(&oi->ip_alloc_sem);
@@ -1683,7 +1681,8 @@ static void ocfs2_inode_init_once(void *data)
 	ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
 	ocfs2_lock_res_init_once(&oi->ip_open_lockres);
 
-	ocfs2_metadata_cache_init(&oi->vfs_inode);
+	ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
+				  &ocfs2_inode_caching_ops);
 
 	inode_init_once(&oi->vfs_inode);
 }
@@ -1859,6 +1858,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	ocfs2_sync_blockdev(sb);
 
+	ocfs2_purge_refcount_trees(osb);
+
 	/* No cluster connection means we've failed during mount, so skip
 	 * all the steps which depended on that to complete. */
 	if (osb->cconn) {
@@ -2065,6 +2066,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		goto bail;
 	}
 
+	osb->osb_rf_lock_tree = RB_ROOT;
+
 	osb->s_feature_compat =
 		le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
 	osb->s_feature_ro_compat =
@@ -2490,7 +2493,8 @@ void __ocfs2_abort(struct super_block* sb,
 	/* Force a panic(). This stinks, but it's better than letting
 	 * things continue without having a proper hard readonly
 	 * here. */
-	OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
+	if (!ocfs2_mount_local(OCFS2_SB(sb)))
+		OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC;
 	ocfs2_handle_error(sb);
 }
 
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 579dd1b1110f..e3421030a69f 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -38,7 +38,6 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
-#include <linux/utsname.h>
 #include <linux/namei.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 187b99ff0368..b6284f235d2f 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -75,15 +75,77 @@ struct ocfs2_meta_cache_item {
 
 static struct kmem_cache *ocfs2_uptodate_cachep = NULL;
 
-void ocfs2_metadata_cache_init(struct inode *inode)
+u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
 {
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
+	BUG_ON(!ci || !ci->ci_ops);
 
-	oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
+	return ci->ci_ops->co_owner(ci);
+}
+
+struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci)
+{
+	BUG_ON(!ci || !ci->ci_ops);
+
+	return ci->ci_ops->co_get_super(ci);
+}
+
+static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci)
+{
+	BUG_ON(!ci || !ci->ci_ops);
+
+	ci->ci_ops->co_cache_lock(ci);
+}
+
+static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci)
+{
+	BUG_ON(!ci || !ci->ci_ops);
+
+	ci->ci_ops->co_cache_unlock(ci);
+}
+
+void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci)
+{
+	BUG_ON(!ci || !ci->ci_ops);
+
+	ci->ci_ops->co_io_lock(ci);
+}
+
+void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci)
+{
+	BUG_ON(!ci || !ci->ci_ops);
+
+	ci->ci_ops->co_io_unlock(ci);
+}
+
+
+static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci,
+				       int clear)
+{
+	ci->ci_flags |= OCFS2_CACHE_FL_INLINE;
 	ci->ci_num_cached = 0;
+
+	if (clear) {
+		ci->ci_created_trans = 0;
+		ci->ci_last_trans = 0;
+	}
+}
+
+void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
+			       const struct ocfs2_caching_operations *ops)
+{
+	BUG_ON(!ops);
+
+	ci->ci_ops = ops;
+	ocfs2_metadata_cache_reset(ci, 1);
 }
 
+void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci)
+{
+	ocfs2_metadata_cache_purge(ci);
+	ocfs2_metadata_cache_reset(ci, 1);
+}
+
+
 /* No lock taken here as 'root' is not expected to be visible to other
  * processes. */
 static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
@@ -112,19 +174,20 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
  * This function is a few more lines longer than necessary due to some
  * accounting done here, but I think it's worth tracking down those
  * bugs sooner -- Mark */
-void ocfs2_metadata_cache_purge(struct inode *inode)
+void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
 {
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	unsigned int tree, to_purge, purged;
-	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
 	struct rb_root root = RB_ROOT;
 
-	spin_lock(&oi->ip_lock);
-	tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+	BUG_ON(!ci || !ci->ci_ops);
+
+	ocfs2_metadata_cache_lock(ci);
+	tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
 	to_purge = ci->ci_num_cached;
 
-	mlog(0, "Purge %u %s items from Inode %llu\n", to_purge,
-	     tree ? "array" : "tree", (unsigned long long)oi->ip_blkno);
+	mlog(0, "Purge %u %s items from Owner %llu\n", to_purge,
+	     tree ? "array" : "tree",
+	     (unsigned long long)ocfs2_metadata_cache_owner(ci));
 
 	/* If we're a tree, save off the root so that we can safely
 	 * initialize the cache. We do the work to free tree members
@@ -132,16 +195,17 @@ void ocfs2_metadata_cache_purge(struct inode *inode)
 	if (tree)
 		root = ci->ci_cache.ci_tree;
 
-	ocfs2_metadata_cache_init(inode);
-	spin_unlock(&oi->ip_lock);
+	ocfs2_metadata_cache_reset(ci, 0);
+	ocfs2_metadata_cache_unlock(ci);
 
 	purged = ocfs2_purge_copied_metadata_tree(&root);
 	/* If possible, track the number wiped so that we can more
 	 * easily detect counting errors. Unfortunately, this is only
 	 * meaningful for trees. */
 	if (tree && purged != to_purge)
-		mlog(ML_ERROR, "Inode %llu, count = %u, purged = %u\n",
-		     (unsigned long long)oi->ip_blkno, to_purge, purged);
+		mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n",
+		     (unsigned long long)ocfs2_metadata_cache_owner(ci),
+		     to_purge, purged);
 }
 
 /* Returns the index in the cache array, -1 if not found.
@@ -182,27 +246,25 @@ ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
 	return NULL;
 }
 
-static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
+static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
 			       struct buffer_head *bh)
 {
 	int index = -1;
 	struct ocfs2_meta_cache_item *item = NULL;
 
-	spin_lock(&oi->ip_lock);
+	ocfs2_metadata_cache_lock(ci);
 
-	mlog(0, "Inode %llu, query block %llu (inline = %u)\n",
-	     (unsigned long long)oi->ip_blkno,
+	mlog(0, "Owner %llu, query block %llu (inline = %u)\n",
+	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
 	     (unsigned long long) bh->b_blocknr,
-	     !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
+	     !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
 
-	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
-		index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
-						 bh->b_blocknr);
+	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
+		index = ocfs2_search_cache_array(ci, bh->b_blocknr);
 	else
-		item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
-					       bh->b_blocknr);
+		item = ocfs2_search_cache_tree(ci, bh->b_blocknr);
 
-	spin_unlock(&oi->ip_lock);
+	ocfs2_metadata_cache_unlock(ci);
 
 	mlog(0, "index = %d, item = %p\n", index, item);
 
@@ -214,7 +276,7 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
  *
  * This can be called under lock_buffer()
  */
-int ocfs2_buffer_uptodate(struct inode *inode,
+int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
 			  struct buffer_head *bh)
 {
 	/* Doesn't matter if the bh is in our cache or not -- if it's
@@ -230,24 +292,24 @@ int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
 
 	/* Ok, locally the buffer is marked as up to date, now search
 	 * our cache to see if we can trust that. */
-	return ocfs2_buffer_cached(OCFS2_I(inode), bh);
+	return ocfs2_buffer_cached(ci, bh);
 }
 
 /*
  * Determine whether a buffer is currently out on a read-ahead request.
- * ip_io_sem should be held to serialize submitters with the logic here.
+ * ci_io_sem should be held to serialize submitters with the logic here.
  */
-int ocfs2_buffer_read_ahead(struct inode *inode,
+int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh)
 {
-	return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh);
+	return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh);
 }
 
 /* Requires ip_lock */
 static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
 				     sector_t block)
 {
-	BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);
+	BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);
 
 	mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
 	     ci->ci_num_cached);
@@ -292,66 +354,64 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
 	ci->ci_num_cached++;
 }
 
-static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
-					     struct ocfs2_caching_info *ci)
+/* co_cache_lock() must be held */
+static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci)
 {
-	assert_spin_locked(&oi->ip_lock);
-
-	return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
-		(ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
+	return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) &&
+		(ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY);
 }
 
-/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
+/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the
  * pointers in tree after we use them - this allows caller to detect
- * when to free in case of error. */
-static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
+ * when to free in case of error.
+ *
+ * The co_cache_lock() must be held. */
+static void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
 			       struct ocfs2_meta_cache_item **tree)
 {
 	int i;
-	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
 
-	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
-			"Inode %llu, num cached = %u, should be %u\n",
-			(unsigned long long)oi->ip_blkno, ci->ci_num_cached,
-			OCFS2_INODE_MAX_CACHE_ARRAY);
-	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
-			"Inode %llu not marked as inline anymore!\n",
-			(unsigned long long)oi->ip_blkno);
-	assert_spin_locked(&oi->ip_lock);
+	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY,
+			"Owner %llu, num cached = %u, should be %u\n",
+			(unsigned long long)ocfs2_metadata_cache_owner(ci),
+			ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY);
+	mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE),
+			"Owner %llu not marked as inline anymore!\n",
+			(unsigned long long)ocfs2_metadata_cache_owner(ci));
 
 	/* Be careful to initialize the tree members *first* because
 	 * once the ci_tree is used, the array is junk... */
-	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+	for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
 		tree[i]->c_block = ci->ci_cache.ci_array[i];
 
-	oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
+	ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE;
 	ci->ci_cache.ci_tree = RB_ROOT;
 	/* this will be set again by __ocfs2_insert_cache_tree */
 	ci->ci_num_cached = 0;
 
-	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+	for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
 		__ocfs2_insert_cache_tree(ci, tree[i]);
 		tree[i] = NULL;
 	}
 
 	mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n",
-	     (unsigned long long)oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
+	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
+	     ci->ci_flags, ci->ci_num_cached);
 }
 
 /* Slow path function - memory allocation is necessary. See the
  * comment above ocfs2_set_buffer_uptodate for more information. */
-static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
+static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
 					sector_t block,
 					int expand_tree)
 {
 	int i;
-	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
 	struct ocfs2_meta_cache_item *new = NULL;
-	struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
+	struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
 		{ NULL, };
 
-	mlog(0, "Inode %llu, block %llu, expand = %d\n",
-	     (unsigned long long)oi->ip_blkno,
+	mlog(0, "Owner %llu, block %llu, expand = %d\n",
+	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
 	     (unsigned long long)block, expand_tree);
 
 	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
@@ -364,7 +424,7 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
 	if (expand_tree) {
 		/* Do *not* allocate an array here - the removal code
 		 * has no way of tracking that. */
-		for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
+		for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
 			tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
 						   GFP_NOFS);
 			if (!tree[i]) {
@@ -376,21 +436,21 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
 		}
 	}
 
-	spin_lock(&oi->ip_lock);
-	if (ocfs2_insert_can_use_array(oi, ci)) {
+	ocfs2_metadata_cache_lock(ci);
+	if (ocfs2_insert_can_use_array(ci)) {
 		mlog(0, "Someone cleared the tree underneath us\n");
 		/* Ok, items were removed from the cache in between
 		 * locks. Detect this and revert back to the fast path */
 		ocfs2_append_cache_array(ci, block);
-		spin_unlock(&oi->ip_lock);
+		ocfs2_metadata_cache_unlock(ci);
 		goto out_free;
 	}
 
 	if (expand_tree)
-		ocfs2_expand_cache(oi, tree);
+		ocfs2_expand_cache(ci, tree);
 
 	__ocfs2_insert_cache_tree(ci, new);
-	spin_unlock(&oi->ip_lock);
+	ocfs2_metadata_cache_unlock(ci);
 
 	new = NULL;
 out_free:
@@ -400,14 +460,14 @@ out_free:
 	/* If these were used, then ocfs2_expand_cache re-set them to
 	 * NULL for us. */
 	if (tree[0]) {
-		for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
+		for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
 			if (tree[i])
 				kmem_cache_free(ocfs2_uptodate_cachep,
 						tree[i]);
 	}
 }
 
-/* Item insertion is guarded by ip_io_mutex, so the insertion path takes
+/* Item insertion is guarded by co_io_lock(), so the insertion path takes
  * advantage of this by not rechecking for a duplicate insert during
  * the slow case. Additionally, if the cache needs to be bumped up to
  * a tree, the code will not recheck after acquiring the lock --
@@ -425,59 +485,55 @@ out_free:
  * Readahead buffers can be passed in here before the I/O request is
  * completed.
  */
-void ocfs2_set_buffer_uptodate(struct inode *inode,
+void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
 			       struct buffer_head *bh)
 {
 	int expand;
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
 
 	/* The block may very well exist in our cache already, so avoid
 	 * doing any more work in that case. */
-	if (ocfs2_buffer_cached(oi, bh))
+	if (ocfs2_buffer_cached(ci, bh))
 		return;
 
-	mlog(0, "Inode %llu, inserting block %llu\n",
-	     (unsigned long long)oi->ip_blkno,
+	mlog(0, "Owner %llu, inserting block %llu\n",
+	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
 	     (unsigned long long)bh->b_blocknr);
 
 	/* No need to recheck under spinlock - insertion is guarded by
-	 * ip_io_mutex */
-	spin_lock(&oi->ip_lock);
-	if (ocfs2_insert_can_use_array(oi, ci)) {
+	 * co_io_lock() */
+	ocfs2_metadata_cache_lock(ci);
+	if (ocfs2_insert_can_use_array(ci)) {
 		/* Fast case - it's an array and there's a free
 		 * spot. */
 		ocfs2_append_cache_array(ci, bh->b_blocknr);
-		spin_unlock(&oi->ip_lock);
+		ocfs2_metadata_cache_unlock(ci);
 		return;
 	}
 
 	expand = 0;
-	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
 		/* We need to bump things up to a tree. */
 		expand = 1;
 	}
-	spin_unlock(&oi->ip_lock);
+	ocfs2_metadata_cache_unlock(ci);
 
-	__ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
+	__ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand);
 }
 
 /* Called against a newly allocated buffer. Most likely nobody should
  * be able to read this sort of metadata while it's still being
- * allocated, but this is careful to take ip_io_mutex anyway. */
-void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+ * allocated, but this is careful to take co_io_lock() anyway. */
+void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
 				   struct buffer_head *bh)
 {
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
 	/* This should definitely *not* exist in our cache */
-	BUG_ON(ocfs2_buffer_cached(oi, bh));
+	BUG_ON(ocfs2_buffer_cached(ci, bh));
 
 	set_buffer_uptodate(bh);
 
-	mutex_lock(&oi->ip_io_mutex);
-	ocfs2_set_buffer_uptodate(inode, bh);
-	mutex_unlock(&oi->ip_io_mutex);
+	ocfs2_metadata_cache_io_lock(ci);
+	ocfs2_set_buffer_uptodate(ci, bh);
+	ocfs2_metadata_cache_io_unlock(ci);
 }
 
 /* Requires ip_lock. */
@@ -487,7 +543,7 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
 	sector_t *array = ci->ci_cache.ci_array;
 	int bytes;
 
-	BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
+	BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY);
 	BUG_ON(index >= ci->ci_num_cached);
 	BUG_ON(!ci->ci_num_cached);
 
@@ -515,21 +571,19 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
 	ci->ci_num_cached--;
 }
 
-static void ocfs2_remove_block_from_cache(struct inode *inode,
+static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
 					  sector_t block)
 {
 	int index;
 	struct ocfs2_meta_cache_item *item = NULL;
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
 
-	spin_lock(&oi->ip_lock);
-	mlog(0, "Inode %llu, remove %llu, items = %u, array = %u\n",
-	     (unsigned long long)oi->ip_blkno,
+	ocfs2_metadata_cache_lock(ci);
+	mlog(0, "Owner %llu, remove %llu, items = %u, array = %u\n",
+	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
 	     (unsigned long long) block, ci->ci_num_cached,
-	     oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
+	     ci->ci_flags & OCFS2_CACHE_FL_INLINE);
 
-	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
+	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
 		index = ocfs2_search_cache_array(ci, block);
 		if (index != -1)
 			ocfs2_remove_metadata_array(ci, index);
@@ -538,7 +592,7 @@ static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
 		if (item)
 			ocfs2_remove_metadata_tree(ci, item);
 	}
-	spin_unlock(&oi->ip_lock);
+	ocfs2_metadata_cache_unlock(ci);
 
 	if (item)
 		kmem_cache_free(ocfs2_uptodate_cachep, item);
@@ -549,23 +603,24 @@ static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
 * bother reverting things to an inlined array in the case of a remove
 * which moves us back under the limit.
 */
-void ocfs2_remove_from_cache(struct inode *inode,
+void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
 			     struct buffer_head *bh)
 {
 	sector_t block = bh->b_blocknr;
 
-	ocfs2_remove_block_from_cache(inode, block);
+	ocfs2_remove_block_from_cache(ci, block);
 }
 
 /* Called when we remove xattr clusters from an inode. */
-void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
 					    sector_t block,
 					    u32 c_len)
 {
-	unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
+	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+	unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len;
 
 	for (i = 0; i < b_len; i++, block++)
-		ocfs2_remove_block_from_cache(inode, block);
+		ocfs2_remove_block_from_cache(ci, block);
 }
 
 int __init init_ocfs2_uptodate_cache(void)
@@ -577,7 +632,7 @@ int __init init_ocfs2_uptodate_cache(void)
 		return -ENOMEM;
 
 	mlog(0, "%u inlined cache items per inode.\n",
-	     OCFS2_INODE_MAX_CACHE_ARRAY);
+	     OCFS2_CACHE_INFO_MAX_ARRAY);
 
 	return 0;
 }
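
[Annotation: the subtle part of the slow path above is that the rb-tree nodes are allocated before retaking the cache lock (kmem_cache_alloc() with GFP_NOFS may sleep), and the fast-path condition is then rechecked under the lock because a concurrent removal may have made room in the array again. Condensed to its shape, with hypothetical names and the NULL-allocation check omitted:

	/* Illustrative sketch only: allocate outside the lock,
	 * recheck inside, mirroring __ocfs2_set_buffer_uptodate(). */
	static void insert_slow_path(struct ocfs2_caching_info *ci,
				     sector_t block)
	{
		struct ocfs2_meta_cache_item *new =
			kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);

		ocfs2_metadata_cache_lock(ci);	/* co_cache_lock() */
		if (ocfs2_insert_can_use_array(ci)) {
			/* Raced with a removal: fast path fits again. */
			ocfs2_append_cache_array(ci, block);
			ocfs2_metadata_cache_unlock(ci);
			kmem_cache_free(ocfs2_uptodate_cachep, new);
			return;
		}
		new->c_block = block;
		__ocfs2_insert_cache_tree(ci, new);
		ocfs2_metadata_cache_unlock(ci);
	}
]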
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 531b4b3a0c47..0d826fe2da0d 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -26,24 +26,59 @@
 #ifndef OCFS2_UPTODATE_H
 #define OCFS2_UPTODATE_H
 
+/*
+ * The caching code relies on locking provided by the user of
+ * struct ocfs2_caching_info.  These operations connect that up.
+ */
+struct ocfs2_caching_operations {
+	/*
+	 * A u64 representing the owning structure.  Usually this
+	 * is the block number (i_blkno or whatnot).  This is used so
+	 * that caching log messages can identify the owning structure.
+	 */
+	u64	(*co_owner)(struct ocfs2_caching_info *ci);
+
+	/* The superblock is needed during I/O. */
+	struct super_block *(*co_get_super)(struct ocfs2_caching_info *ci);
+	/*
+	 * Lock and unlock the caching data.  These will not sleep, and
+	 * should probably be spinlocks.
+	 */
+	void	(*co_cache_lock)(struct ocfs2_caching_info *ci);
+	void	(*co_cache_unlock)(struct ocfs2_caching_info *ci);
+
+	/*
+	 * Lock and unlock for disk I/O.  These will sleep, and should
+	 * be mutexes.
+	 */
+	void	(*co_io_lock)(struct ocfs2_caching_info *ci);
+	void	(*co_io_unlock)(struct ocfs2_caching_info *ci);
+};
+
 int __init init_ocfs2_uptodate_cache(void);
 void exit_ocfs2_uptodate_cache(void);
 
-void ocfs2_metadata_cache_init(struct inode *inode);
-void ocfs2_metadata_cache_purge(struct inode *inode);
+void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
+			       const struct ocfs2_caching_operations *ops);
+void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci);
+
+u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci);
+void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci);
 
-int ocfs2_buffer_uptodate(struct inode *inode,
+int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
 			  struct buffer_head *bh);
-void ocfs2_set_buffer_uptodate(struct inode *inode,
+void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
 			       struct buffer_head *bh);
-void ocfs2_set_new_buffer_uptodate(struct inode *inode,
+void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
 				   struct buffer_head *bh);
-void ocfs2_remove_from_cache(struct inode *inode,
+void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
 			     struct buffer_head *bh);
-void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
 					    sector_t block,
 					    u32 c_len);
-int ocfs2_buffer_read_ahead(struct inode *inode,
+int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
 			    struct buffer_head *bh);
 
 #endif /* OCFS2_UPTODATE_H */
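
[Annotation: an inode-backed implementation of these operations looks roughly as follows. This is a sketch: the real ocfs2_inode_caching_ops referenced from super.c above is added elsewhere in this series (fs/ocfs2/inode.c is not in this excerpt), and cache_info_to_inode() is assumed to be a container_of() helper:

	static inline struct ocfs2_inode_info *
	cache_info_to_inode(struct ocfs2_caching_info *ci)
	{
		return container_of(ci, struct ocfs2_inode_info,
				    ip_metadata_cache);
	}

	static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci)
	{
		/* The inode's block number identifies the owner in logs. */
		return cache_info_to_inode(ci)->ip_blkno;
	}

	static struct super_block *
	ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci)
	{
		return cache_info_to_inode(ci)->vfs_inode.i_sb;
	}

	static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci)
	{
		/* Non-sleeping, per the comment above: a spinlock. */
		spin_lock(&cache_info_to_inode(ci)->ip_lock);
	}

	static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci)
	{
		spin_unlock(&cache_info_to_inode(ci)->ip_lock);
	}

	static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci)
	{
		/* May sleep, so the I/O lock is a mutex. */
		mutex_lock(&cache_info_to_inode(ci)->ip_io_mutex);
	}

	static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci)
	{
		mutex_unlock(&cache_info_to_inode(ci)->ip_io_mutex);
	}

	const struct ocfs2_caching_operations ocfs2_inode_caching_ops = {
		.co_owner	 = ocfs2_inode_cache_owner,
		.co_get_super	 = ocfs2_inode_cache_get_super,
		.co_cache_lock	 = ocfs2_inode_cache_lock,
		.co_cache_unlock = ocfs2_inode_cache_unlock,
		.co_io_lock	 = ocfs2_inode_cache_io_lock,
		.co_io_unlock	 = ocfs2_inode_cache_io_unlock,
	};
]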
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d1a27cda984f..fe3419068df2 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -55,7 +55,8 @@
55#include "buffer_head_io.h" 55#include "buffer_head_io.h"
56#include "super.h" 56#include "super.h"
57#include "xattr.h" 57#include "xattr.h"
58 58#include "refcounttree.h"
59#include "acl.h"
59 60
60struct ocfs2_xattr_def_value_root { 61struct ocfs2_xattr_def_value_root {
61 struct ocfs2_xattr_value_root xv; 62 struct ocfs2_xattr_value_root xv;
@@ -140,7 +141,7 @@ struct ocfs2_xattr_search {
140 int not_found; 141 int not_found;
141}; 142};
142 143
143static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, 144static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
144 struct ocfs2_xattr_header *xh, 145 struct ocfs2_xattr_header *xh,
145 int index, 146 int index,
146 int *block_off, 147 int *block_off,
@@ -157,7 +158,7 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
157 struct ocfs2_xattr_search *xs); 158 struct ocfs2_xattr_search *xs);
158 159
159static int ocfs2_xattr_tree_list_index_block(struct inode *inode, 160static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
160 struct ocfs2_xattr_tree_root *xt, 161 struct buffer_head *blk_bh,
161 char *buffer, 162 char *buffer,
162 size_t buffer_size); 163 size_t buffer_size);
163 164
@@ -170,12 +171,42 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
170 struct ocfs2_xattr_search *xs, 171 struct ocfs2_xattr_search *xs,
171 struct ocfs2_xattr_set_ctxt *ctxt); 172 struct ocfs2_xattr_set_ctxt *ctxt);
172 173
173static int ocfs2_delete_xattr_index_block(struct inode *inode, 174typedef int (xattr_tree_rec_func)(struct inode *inode,
174 struct buffer_head *xb_bh); 175 struct buffer_head *root_bh,
176 u64 blkno, u32 cpos, u32 len, void *para);
177static int ocfs2_iterate_xattr_index_block(struct inode *inode,
178 struct buffer_head *root_bh,
179 xattr_tree_rec_func *rec_func,
180 void *para);
181static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
182 struct ocfs2_xattr_bucket *bucket,
183 void *para);
184static int ocfs2_rm_xattr_cluster(struct inode *inode,
185 struct buffer_head *root_bh,
186 u64 blkno,
187 u32 cpos,
188 u32 len,
189 void *para);
190
175static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, 191static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
176 u64 src_blk, u64 last_blk, u64 to_blk, 192 u64 src_blk, u64 last_blk, u64 to_blk,
177 unsigned int start_bucket, 193 unsigned int start_bucket,
178 u32 *first_hash); 194 u32 *first_hash);
195static int ocfs2_prepare_refcount_xattr(struct inode *inode,
196 struct ocfs2_dinode *di,
197 struct ocfs2_xattr_info *xi,
198 struct ocfs2_xattr_search *xis,
199 struct ocfs2_xattr_search *xbs,
200 struct ocfs2_refcount_tree **ref_tree,
201 int *meta_need,
202 int *credits);
203static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
204 struct ocfs2_xattr_bucket *bucket,
205 int offset,
206 struct ocfs2_xattr_value_root **xv,
207 struct buffer_head **bh);
208static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
209 const void *value, size_t size, int flags);
179 210
180static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) 211static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
181{ 212{
@@ -254,9 +285,9 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
254 break; 285 break;
255 } 286 }
256 287
257 if (!ocfs2_buffer_uptodate(bucket->bu_inode, 288 if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
258 bucket->bu_bhs[i])) 289 bucket->bu_bhs[i]))
259 ocfs2_set_new_buffer_uptodate(bucket->bu_inode, 290 ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
260 bucket->bu_bhs[i]); 291 bucket->bu_bhs[i]);
261 } 292 }
262 293
@@ -271,7 +302,7 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
271{ 302{
272 int rc; 303 int rc;
273 304
274 rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno, 305 rc = ocfs2_read_blocks(INODE_CACHE(bucket->bu_inode), xb_blkno,
275 bucket->bu_blocks, bucket->bu_bhs, 0, 306 bucket->bu_blocks, bucket->bu_bhs, 0,
276 NULL); 307 NULL);
277 if (!rc) { 308 if (!rc) {
@@ -297,7 +328,8 @@ static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
297 int i, rc = 0; 328 int i, rc = 0;
298 329
299 for (i = 0; i < bucket->bu_blocks; i++) { 330 for (i = 0; i < bucket->bu_blocks; i++) {
300 rc = ocfs2_journal_access(handle, bucket->bu_inode, 331 rc = ocfs2_journal_access(handle,
332 INODE_CACHE(bucket->bu_inode),
301 bucket->bu_bhs[i], type); 333 bucket->bu_bhs[i], type);
302 if (rc) { 334 if (rc) {
303 mlog_errno(rc); 335 mlog_errno(rc);
@@ -399,7 +431,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
399 int rc; 431 int rc;
400 struct buffer_head *tmp = *bh; 432 struct buffer_head *tmp = *bh;
401 433
402 rc = ocfs2_read_block(inode, xb_blkno, &tmp, 434 rc = ocfs2_read_block(INODE_CACHE(inode), xb_blkno, &tmp,
403 ocfs2_validate_xattr_block); 435 ocfs2_validate_xattr_block);
404 436
405 /* If ocfs2_read_block() got us a new bh, pass it up. */ 437 /* If ocfs2_read_block() got us a new bh, pass it up. */
@@ -596,15 +628,14 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
596 int status = 0; 628 int status = 0;
597 handle_t *handle = ctxt->handle; 629 handle_t *handle = ctxt->handle;
598 enum ocfs2_alloc_restarted why; 630 enum ocfs2_alloc_restarted why;
599 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
600 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); 631 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
601 struct ocfs2_extent_tree et; 632 struct ocfs2_extent_tree et;
602 633
603 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); 634 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
604 635
605 ocfs2_init_xattr_value_extent_tree(&et, inode, vb); 636 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
606 637
607 status = vb->vb_access(handle, inode, vb->vb_bh, 638 status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
608 OCFS2_JOURNAL_ACCESS_WRITE); 639 OCFS2_JOURNAL_ACCESS_WRITE);
609 if (status < 0) { 640 if (status < 0) {
610 mlog_errno(status); 641 mlog_errno(status);
@@ -612,13 +643,11 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
612 } 643 }
613 644
614 prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); 645 prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
615 status = ocfs2_add_clusters_in_btree(osb, 646 status = ocfs2_add_clusters_in_btree(handle,
616 inode, 647 &et,
617 &logical_start, 648 &logical_start,
618 clusters_to_add, 649 clusters_to_add,
619 0, 650 0,
620 &et,
621 handle,
622 ctxt->data_ac, 651 ctxt->data_ac,
623 ctxt->meta_ac, 652 ctxt->meta_ac,
624 &why); 653 &why);
@@ -649,6 +678,7 @@ leave:
649static int __ocfs2_remove_xattr_range(struct inode *inode, 678static int __ocfs2_remove_xattr_range(struct inode *inode,
650 struct ocfs2_xattr_value_buf *vb, 679 struct ocfs2_xattr_value_buf *vb,
651 u32 cpos, u32 phys_cpos, u32 len, 680 u32 cpos, u32 phys_cpos, u32 len,
681 unsigned int ext_flags,
652 struct ocfs2_xattr_set_ctxt *ctxt) 682 struct ocfs2_xattr_set_ctxt *ctxt)
653{ 683{
654 int ret; 684 int ret;
@@ -656,16 +686,16 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
656 handle_t *handle = ctxt->handle; 686 handle_t *handle = ctxt->handle;
657 struct ocfs2_extent_tree et; 687 struct ocfs2_extent_tree et;
658 688
659 ocfs2_init_xattr_value_extent_tree(&et, inode, vb); 689 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
660 690
661 ret = vb->vb_access(handle, inode, vb->vb_bh, 691 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
662 OCFS2_JOURNAL_ACCESS_WRITE); 692 OCFS2_JOURNAL_ACCESS_WRITE);
663 if (ret) { 693 if (ret) {
664 mlog_errno(ret); 694 mlog_errno(ret);
665 goto out; 695 goto out;
666 } 696 }
667 697
668 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac, 698 ret = ocfs2_remove_extent(handle, &et, cpos, len, ctxt->meta_ac,
669 &ctxt->dealloc); 699 &ctxt->dealloc);
670 if (ret) { 700 if (ret) {
671 mlog_errno(ret); 701 mlog_errno(ret);
@@ -680,7 +710,14 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
680 goto out; 710 goto out;
681 } 711 }
682 712
683 ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len); 713 if (ext_flags & OCFS2_EXT_REFCOUNTED)
714 ret = ocfs2_decrease_refcount(inode, handle,
715 ocfs2_blocks_to_clusters(inode->i_sb,
716 phys_blkno),
717 len, ctxt->meta_ac, &ctxt->dealloc, 1);
718 else
719 ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc,
720 phys_blkno, len);
684 if (ret) 721 if (ret)
685 mlog_errno(ret); 722 mlog_errno(ret);
686 723
@@ -695,6 +732,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
695 struct ocfs2_xattr_set_ctxt *ctxt) 732 struct ocfs2_xattr_set_ctxt *ctxt)
696{ 733{
697 int ret = 0; 734 int ret = 0;
735 unsigned int ext_flags;
698 u32 trunc_len, cpos, phys_cpos, alloc_size; 736 u32 trunc_len, cpos, phys_cpos, alloc_size;
699 u64 block; 737 u64 block;
700 738
@@ -706,7 +744,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
706 while (trunc_len) { 744 while (trunc_len) {
707 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, 745 ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
708 &alloc_size, 746 &alloc_size,
709 &vb->vb_xv->xr_list); 747 &vb->vb_xv->xr_list, &ext_flags);
710 if (ret) { 748 if (ret) {
711 mlog_errno(ret); 749 mlog_errno(ret);
712 goto out; 750 goto out;
@@ -717,15 +755,15 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
717 755
718 ret = __ocfs2_remove_xattr_range(inode, vb, cpos, 756 ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
719 phys_cpos, alloc_size, 757 phys_cpos, alloc_size,
720 ctxt); 758 ext_flags, ctxt);
721 if (ret) { 759 if (ret) {
722 mlog_errno(ret); 760 mlog_errno(ret);
723 goto out; 761 goto out;
724 } 762 }
725 763
726 block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 764 block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
727 ocfs2_remove_xattr_clusters_from_cache(inode, block, 765 ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode),
728 alloc_size); 766 block, alloc_size);
729 cpos += alloc_size; 767 cpos += alloc_size;
730 trunc_len -= alloc_size; 768 trunc_len -= alloc_size;
731 } 769 }
@@ -810,6 +848,23 @@ static int ocfs2_xattr_list_entries(struct inode *inode,
810 return result; 848 return result;
811} 849}
812 850
851int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
852 struct ocfs2_dinode *di)
853{
854 struct ocfs2_xattr_header *xh;
855 int i;
856
857 xh = (struct ocfs2_xattr_header *)
858 ((void *)di + inode->i_sb->s_blocksize -
859 le16_to_cpu(di->i_xattr_inline_size));
860
861 for (i = 0; i < le16_to_cpu(xh->xh_count); i++)
862 if (!ocfs2_xattr_is_local(&xh->xh_entries[i]))
863 return 1;
864
865 return 0;
866}
867
813static int ocfs2_xattr_ibody_list(struct inode *inode, 868static int ocfs2_xattr_ibody_list(struct inode *inode,
814 struct ocfs2_dinode *di, 869 struct ocfs2_dinode *di,
815 char *buffer, 870 char *buffer,
@@ -855,11 +910,9 @@ static int ocfs2_xattr_block_list(struct inode *inode,
855 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; 910 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
856 ret = ocfs2_xattr_list_entries(inode, header, 911 ret = ocfs2_xattr_list_entries(inode, header,
857 buffer, buffer_size); 912 buffer, buffer_size);
858 } else { 913 } else
859 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; 914 ret = ocfs2_xattr_tree_list_index_block(inode, blk_bh,
860 ret = ocfs2_xattr_tree_list_index_block(inode, xt,
861 buffer, buffer_size); 915 buffer, buffer_size);
862 }
863 916
864 brelse(blk_bh); 917 brelse(blk_bh);
865 918
@@ -961,7 +1014,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
961 cpos = 0; 1014 cpos = 0;
962 while (cpos < clusters) { 1015 while (cpos < clusters) {
963 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, 1016 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
964 &num_clusters, el); 1017 &num_clusters, el, NULL);
965 if (ret) { 1018 if (ret) {
966 mlog_errno(ret); 1019 mlog_errno(ret);
967 goto out; 1020 goto out;
@@ -970,7 +1023,8 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
970 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 1023 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
971 /* Copy ocfs2_xattr_value */ 1024 /* Copy ocfs2_xattr_value */
972 for (i = 0; i < num_clusters * bpc; i++, blkno++) { 1025 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
973 ret = ocfs2_read_block(inode, blkno, &bh, NULL); 1026 ret = ocfs2_read_block(INODE_CACHE(inode), blkno,
1027 &bh, NULL);
974 if (ret) { 1028 if (ret) {
975 mlog_errno(ret); 1029 mlog_errno(ret);
976 goto out; 1030 goto out;
@@ -1085,7 +1139,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
1085 i = xs->here - xs->header->xh_entries; 1139 i = xs->here - xs->header->xh_entries;
1086 1140
1087 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { 1141 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
1088 ret = ocfs2_xattr_bucket_get_name_value(inode, 1142 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
1089 bucket_xh(xs->bucket), 1143 bucket_xh(xs->bucket),
1090 i, 1144 i,
1091 &block_off, 1145 &block_off,
@@ -1183,7 +1237,7 @@ static int ocfs2_xattr_get(struct inode *inode,
1183 1237
1184static int __ocfs2_xattr_set_value_outside(struct inode *inode, 1238static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1185 handle_t *handle, 1239 handle_t *handle,
1186 struct ocfs2_xattr_value_root *xv, 1240 struct ocfs2_xattr_value_buf *vb,
1187 const void *value, 1241 const void *value,
1188 int value_len) 1242 int value_len)
1189{ 1243{
@@ -1194,28 +1248,34 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
1194 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); 1248 u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
1195 u64 blkno; 1249 u64 blkno;
1196 struct buffer_head *bh = NULL; 1250 struct buffer_head *bh = NULL;
1251 unsigned int ext_flags;
1252 struct ocfs2_xattr_value_root *xv = vb->vb_xv;
1197 1253
1198 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); 1254 BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
1199 1255
1200 while (cpos < clusters) { 1256 while (cpos < clusters) {
1201 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, 1257 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
1202 &num_clusters, &xv->xr_list); 1258 &num_clusters, &xv->xr_list,
1259 &ext_flags);
1203 if (ret) { 1260 if (ret) {
1204 mlog_errno(ret); 1261 mlog_errno(ret);
1205 goto out; 1262 goto out;
1206 } 1263 }
1207 1264
1265 BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
1266
1208 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); 1267 blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
1209 1268
1210 for (i = 0; i < num_clusters * bpc; i++, blkno++) { 1269 for (i = 0; i < num_clusters * bpc; i++, blkno++) {
1211 ret = ocfs2_read_block(inode, blkno, &bh, NULL); 1270 ret = ocfs2_read_block(INODE_CACHE(inode), blkno,
1271 &bh, NULL);
1212 if (ret) { 1272 if (ret) {
1213 mlog_errno(ret); 1273 mlog_errno(ret);
1214 goto out; 1274 goto out;
1215 } 1275 }
1216 1276
1217 ret = ocfs2_journal_access(handle, 1277 ret = ocfs2_journal_access(handle,
1218 inode, 1278 INODE_CACHE(inode),
1219 bh, 1279 bh,
1220 OCFS2_JOURNAL_ACCESS_WRITE); 1280 OCFS2_JOURNAL_ACCESS_WRITE);
1221 if (ret < 0) { 1281 if (ret < 0) {
@@ -1266,7 +1326,7 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
1266 void *val = xs->base + offs; 1326 void *val = xs->base + offs;
1267 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; 1327 size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
1268 1328
1269 ret = vb->vb_access(handle, inode, vb->vb_bh, 1329 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
1270 OCFS2_JOURNAL_ACCESS_WRITE); 1330 OCFS2_JOURNAL_ACCESS_WRITE);
1271 if (ret) { 1331 if (ret) {
1272 mlog_errno(ret); 1332 mlog_errno(ret);
@@ -1294,7 +1354,7 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
1294{ 1354{
1295 int ret; 1355 int ret;
1296 1356
1297 ret = vb->vb_access(handle, inode, vb->vb_bh, 1357 ret = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
1298 OCFS2_JOURNAL_ACCESS_WRITE); 1358 OCFS2_JOURNAL_ACCESS_WRITE);
1299 if (ret) { 1359 if (ret) {
1300 mlog_errno(ret); 1360 mlog_errno(ret);
@@ -1355,7 +1415,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
1355 mlog_errno(ret); 1415 mlog_errno(ret);
1356 return ret; 1416 return ret;
1357 } 1417 }
1358 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv, 1418 ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb,
1359 xi->value, xi->value_len); 1419 xi->value, xi->value_len);
1360 if (ret < 0) 1420 if (ret < 0)
1361 mlog_errno(ret); 1421 mlog_errno(ret);
@@ -1594,7 +1654,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1594 1654
1595 ret = __ocfs2_xattr_set_value_outside(inode, 1655 ret = __ocfs2_xattr_set_value_outside(inode,
1596 handle, 1656 handle,
1597 vb.vb_xv, 1657 &vb,
1598 xi->value, 1658 xi->value,
1599 xi->value_len); 1659 xi->value_len);
1600 if (ret < 0) 1660 if (ret < 0)
@@ -1615,7 +1675,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1615 } 1675 }
1616 } 1676 }
1617 1677
1618 ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, 1678 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), xs->inode_bh,
1619 OCFS2_JOURNAL_ACCESS_WRITE); 1679 OCFS2_JOURNAL_ACCESS_WRITE);
1620 if (ret) { 1680 if (ret) {
1621 mlog_errno(ret); 1681 mlog_errno(ret);
@@ -1623,7 +1683,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
1623 } 1683 }
1624 1684
1625 if (!(flag & OCFS2_INLINE_XATTR_FL)) { 1685 if (!(flag & OCFS2_INLINE_XATTR_FL)) {
1626 ret = vb.vb_access(handle, inode, vb.vb_bh, 1686 ret = vb.vb_access(handle, INODE_CACHE(inode), vb.vb_bh,
1627 OCFS2_JOURNAL_ACCESS_WRITE); 1687 OCFS2_JOURNAL_ACCESS_WRITE);
1628 if (ret) { 1688 if (ret) {
1629 mlog_errno(ret); 1689 mlog_errno(ret);
@@ -1700,51 +1760,112 @@ out:
1700 return ret; 1760 return ret;
1701} 1761}
1702 1762
1763/*
1764 * In xattr remove, if it is stored outside and refcounted, we may have
1765 * the chance to split the refcount tree. So need the allocators.
1766 */
1767static int ocfs2_lock_xattr_remove_allocators(struct inode *inode,
1768 struct ocfs2_xattr_value_root *xv,
1769 struct ocfs2_caching_info *ref_ci,
1770 struct buffer_head *ref_root_bh,
1771 struct ocfs2_alloc_context **meta_ac,
1772 int *ref_credits)
1773{
1774 int ret, meta_add = 0;
1775 u32 p_cluster, num_clusters;
1776 unsigned int ext_flags;
1777
1778 *ref_credits = 0;
1779 ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
1780 &num_clusters,
1781 &xv->xr_list,
1782 &ext_flags);
1783 if (ret) {
1784 mlog_errno(ret);
1785 goto out;
1786 }
1787
1788 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
1789 goto out;
1790
1791 ret = ocfs2_refcounted_xattr_delete_need(inode, ref_ci,
1792 ref_root_bh, xv,
1793 &meta_add, ref_credits);
1794 if (ret) {
1795 mlog_errno(ret);
1796 goto out;
1797 }
1798
1799 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
1800 meta_add, meta_ac);
1801 if (ret)
1802 mlog_errno(ret);
1803
1804out:
1805 return ret;
1806}
1807
1703static int ocfs2_remove_value_outside(struct inode*inode, 1808static int ocfs2_remove_value_outside(struct inode*inode,
1704 struct ocfs2_xattr_value_buf *vb, 1809 struct ocfs2_xattr_value_buf *vb,
1705 struct ocfs2_xattr_header *header) 1810 struct ocfs2_xattr_header *header,
1811 struct ocfs2_caching_info *ref_ci,
1812 struct buffer_head *ref_root_bh)
1706{ 1813{
1707 int ret = 0, i; 1814 int ret = 0, i, ref_credits;
1708 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1815 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1709 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; 1816 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
1817 void *val;
1710 1818
1711 ocfs2_init_dealloc_ctxt(&ctxt.dealloc); 1819 ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
1712 1820
1713 ctxt.handle = ocfs2_start_trans(osb,
1714 ocfs2_remove_extent_credits(osb->sb));
1715 if (IS_ERR(ctxt.handle)) {
1716 ret = PTR_ERR(ctxt.handle);
1717 mlog_errno(ret);
1718 goto out;
1719 }
1720
1721 for (i = 0; i < le16_to_cpu(header->xh_count); i++) { 1821 for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
1722 struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; 1822 struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
1723 1823
1724 if (!ocfs2_xattr_is_local(entry)) { 1824 if (ocfs2_xattr_is_local(entry))
1725 void *val; 1825 continue;
1726 1826
1727 val = (void *)header + 1827 val = (void *)header +
1728 le16_to_cpu(entry->xe_name_offset); 1828 le16_to_cpu(entry->xe_name_offset);
1729 vb->vb_xv = (struct ocfs2_xattr_value_root *) 1829 vb->vb_xv = (struct ocfs2_xattr_value_root *)
1730 (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); 1830 (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
1731 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); 1831
1732 if (ret < 0) { 1832 ret = ocfs2_lock_xattr_remove_allocators(inode, vb->vb_xv,
1733 mlog_errno(ret); 1833 ref_ci, ref_root_bh,
1734 break; 1834 &ctxt.meta_ac,
1735 } 1835 &ref_credits);
 if (ret) {
 mlog_errno(ret);
 break;
 }
1836
1837 ctxt.handle = ocfs2_start_trans(osb, ref_credits +
1838 ocfs2_remove_extent_credits(osb->sb));
1839 if (IS_ERR(ctxt.handle)) {
1840 ret = PTR_ERR(ctxt.handle);
1841 mlog_errno(ret);
1842 break;
1843 }
1844
1845 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
1846 if (ret < 0) {
1847 mlog_errno(ret);
1848 break;
1849 }
1850
1851 ocfs2_commit_trans(osb, ctxt.handle);
1852 if (ctxt.meta_ac) {
1853 ocfs2_free_alloc_context(ctxt.meta_ac);
1854 ctxt.meta_ac = NULL;
1736 } 1855 }
1737 } 1856 }
1738 1857
1739 ocfs2_commit_trans(osb, ctxt.handle); 1858 if (ctxt.meta_ac)
1859 ocfs2_free_alloc_context(ctxt.meta_ac);
1740 ocfs2_schedule_truncate_log_flush(osb, 1); 1860 ocfs2_schedule_truncate_log_flush(osb, 1);
1741 ocfs2_run_deallocs(osb, &ctxt.dealloc); 1861 ocfs2_run_deallocs(osb, &ctxt.dealloc);
1742out:
1743 return ret; 1862 return ret;
1744} 1863}
1745 1864
1746static int ocfs2_xattr_ibody_remove(struct inode *inode, 1865static int ocfs2_xattr_ibody_remove(struct inode *inode,
1747 struct buffer_head *di_bh) 1866 struct buffer_head *di_bh,
1867 struct ocfs2_caching_info *ref_ci,
1868 struct buffer_head *ref_root_bh)
1748{ 1869{
1749 1870
1750 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 1871 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1759,13 +1880,21 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
1759 ((void *)di + inode->i_sb->s_blocksize - 1880 ((void *)di + inode->i_sb->s_blocksize -
1760 le16_to_cpu(di->i_xattr_inline_size)); 1881 le16_to_cpu(di->i_xattr_inline_size));
1761 1882
1762 ret = ocfs2_remove_value_outside(inode, &vb, header); 1883 ret = ocfs2_remove_value_outside(inode, &vb, header,
1884 ref_ci, ref_root_bh);
1763 1885
1764 return ret; 1886 return ret;
1765} 1887}
1766 1888
1889struct ocfs2_rm_xattr_bucket_para {
1890 struct ocfs2_caching_info *ref_ci;
1891 struct buffer_head *ref_root_bh;
1892};
1893
1767static int ocfs2_xattr_block_remove(struct inode *inode, 1894static int ocfs2_xattr_block_remove(struct inode *inode,
1768 struct buffer_head *blk_bh) 1895 struct buffer_head *blk_bh,
1896 struct ocfs2_caching_info *ref_ci,
1897 struct buffer_head *ref_root_bh)
1769{ 1898{
1770 struct ocfs2_xattr_block *xb; 1899 struct ocfs2_xattr_block *xb;
1771 int ret = 0; 1900 int ret = 0;
@@ -1773,19 +1902,29 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
1773 .vb_bh = blk_bh, 1902 .vb_bh = blk_bh,
1774 .vb_access = ocfs2_journal_access_xb, 1903 .vb_access = ocfs2_journal_access_xb,
1775 }; 1904 };
1905 struct ocfs2_rm_xattr_bucket_para args = {
1906 .ref_ci = ref_ci,
1907 .ref_root_bh = ref_root_bh,
1908 };
1776 1909
1777 xb = (struct ocfs2_xattr_block *)blk_bh->b_data; 1910 xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
1778 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { 1911 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
1779 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); 1912 struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
1780 ret = ocfs2_remove_value_outside(inode, &vb, header); 1913 ret = ocfs2_remove_value_outside(inode, &vb, header,
1914 ref_ci, ref_root_bh);
1781 } else 1915 } else
1782 ret = ocfs2_delete_xattr_index_block(inode, blk_bh); 1916 ret = ocfs2_iterate_xattr_index_block(inode,
1917 blk_bh,
1918 ocfs2_rm_xattr_cluster,
1919 &args);
1783 1920
1784 return ret; 1921 return ret;
1785} 1922}
1786 1923
1787static int ocfs2_xattr_free_block(struct inode *inode, 1924static int ocfs2_xattr_free_block(struct inode *inode,
1788 u64 block) 1925 u64 block,
1926 struct ocfs2_caching_info *ref_ci,
1927 struct buffer_head *ref_root_bh)
1789{ 1928{
1790 struct inode *xb_alloc_inode; 1929 struct inode *xb_alloc_inode;
1791 struct buffer_head *xb_alloc_bh = NULL; 1930 struct buffer_head *xb_alloc_bh = NULL;
@@ -1803,7 +1942,7 @@ static int ocfs2_xattr_free_block(struct inode *inode,
1803 goto out; 1942 goto out;
1804 } 1943 }
1805 1944
1806 ret = ocfs2_xattr_block_remove(inode, blk_bh); 1945 ret = ocfs2_xattr_block_remove(inode, blk_bh, ref_ci, ref_root_bh);
1807 if (ret < 0) { 1946 if (ret < 0) {
1808 mlog_errno(ret); 1947 mlog_errno(ret);
1809 goto out; 1948 goto out;
@@ -1863,6 +2002,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1863{ 2002{
1864 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2003 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1865 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2004 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2005 struct ocfs2_refcount_tree *ref_tree = NULL;
2006 struct buffer_head *ref_root_bh = NULL;
2007 struct ocfs2_caching_info *ref_ci = NULL;
1866 handle_t *handle; 2008 handle_t *handle;
1867 int ret; 2009 int ret;
1868 2010
@@ -1872,8 +2014,21 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1872 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) 2014 if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
1873 return 0; 2015 return 0;
1874 2016
2017 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) {
2018 ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb),
2019 le64_to_cpu(di->i_refcount_loc),
2020 1, &ref_tree, &ref_root_bh);
2021 if (ret) {
2022 mlog_errno(ret);
2023 goto out;
2024 }
2025 ref_ci = &ref_tree->rf_ci;
2026
2027 }
2028
1875 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { 2029 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
1876 ret = ocfs2_xattr_ibody_remove(inode, di_bh); 2030 ret = ocfs2_xattr_ibody_remove(inode, di_bh,
2031 ref_ci, ref_root_bh);
1877 if (ret < 0) { 2032 if (ret < 0) {
1878 mlog_errno(ret); 2033 mlog_errno(ret);
1879 goto out; 2034 goto out;
@@ -1882,7 +2037,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1882 2037
1883 if (di->i_xattr_loc) { 2038 if (di->i_xattr_loc) {
1884 ret = ocfs2_xattr_free_block(inode, 2039 ret = ocfs2_xattr_free_block(inode,
1885 le64_to_cpu(di->i_xattr_loc)); 2040 le64_to_cpu(di->i_xattr_loc),
2041 ref_ci, ref_root_bh);
1886 if (ret < 0) { 2042 if (ret < 0) {
1887 mlog_errno(ret); 2043 mlog_errno(ret);
1888 goto out; 2044 goto out;
@@ -1896,7 +2052,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1896 mlog_errno(ret); 2052 mlog_errno(ret);
1897 goto out; 2053 goto out;
1898 } 2054 }
1899 ret = ocfs2_journal_access_di(handle, inode, di_bh, 2055 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
1900 OCFS2_JOURNAL_ACCESS_WRITE); 2056 OCFS2_JOURNAL_ACCESS_WRITE);
1901 if (ret) { 2057 if (ret) {
1902 mlog_errno(ret); 2058 mlog_errno(ret);
@@ -1916,6 +2072,9 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
1916out_commit: 2072out_commit:
1917 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 2073 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
1918out: 2074out:
2075 if (ref_tree)
2076 ocfs2_unlock_refcount_tree(OCFS2_SB(inode->i_sb), ref_tree, 1);
2077 brelse(ref_root_bh);
1919 return ret; 2078 return ret;
1920} 2079}
1921 2080
@@ -2083,6 +2242,84 @@ cleanup:
2083 return ret; 2242 return ret;
2084} 2243}
2085 2244
2245static int ocfs2_create_xattr_block(handle_t *handle,
2246 struct inode *inode,
2247 struct buffer_head *inode_bh,
2248 struct ocfs2_alloc_context *meta_ac,
2249 struct buffer_head **ret_bh,
2250 int indexed)
2251{
2252 int ret;
2253 u16 suballoc_bit_start;
2254 u32 num_got;
2255 u64 first_blkno;
2256 struct ocfs2_dinode *di = (struct ocfs2_dinode *)inode_bh->b_data;
2257 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2258 struct buffer_head *new_bh = NULL;
2259 struct ocfs2_xattr_block *xblk;
2260
2261 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), inode_bh,
2262 OCFS2_JOURNAL_ACCESS_CREATE);
2263 if (ret < 0) {
2264 mlog_errno(ret);
2265 goto end;
2266 }
2267
2268 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
2269 &suballoc_bit_start, &num_got,
2270 &first_blkno);
2271 if (ret < 0) {
2272 mlog_errno(ret);
2273 goto end;
2274 }
2275
2276 new_bh = sb_getblk(inode->i_sb, first_blkno);
 /* sb_getblk() returns NULL on allocation failure. */
 if (!new_bh) {
 ret = -ENOMEM;
 mlog_errno(ret);
 goto end;
 }
2277 ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
2278
2279 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode),
2280 new_bh,
2281 OCFS2_JOURNAL_ACCESS_CREATE);
2282 if (ret < 0) {
2283 mlog_errno(ret);
2284 goto end;
2285 }
2286
2287 /* Initialize ocfs2_xattr_block */
2288 xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
2289 memset(xblk, 0, inode->i_sb->s_blocksize);
2290 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2291 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
2292 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2293 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2294 xblk->xb_blkno = cpu_to_le64(first_blkno);
2295
2296 if (indexed) {
2297 struct ocfs2_xattr_tree_root *xr = &xblk->xb_attrs.xb_root;
2298 xr->xt_clusters = cpu_to_le32(1);
2299 xr->xt_last_eb_blk = 0;
2300 xr->xt_list.l_tree_depth = 0;
2301 xr->xt_list.l_count = cpu_to_le16(
2302 ocfs2_xattr_recs_per_xb(inode->i_sb));
2303 xr->xt_list.l_next_free_rec = cpu_to_le16(1);
2304 xblk->xb_flags = cpu_to_le16(OCFS2_XATTR_INDEXED);
2305 }
2306
2307 ret = ocfs2_journal_dirty(handle, new_bh);
2308 if (ret < 0) {
2309 mlog_errno(ret);
2310 goto end;
2311 }
2312 di->i_xattr_loc = cpu_to_le64(first_blkno);
2313 ocfs2_journal_dirty(handle, inode_bh);
2314
2315 *ret_bh = new_bh;
2316 new_bh = NULL;
2317
2318end:
2319 brelse(new_bh);
2320 return ret;
2321}
2322
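A minimal usage sketch of the new helper, condensed from ocfs2_create_empty_xattr_block() added later in this patch (error handling elided; OCFS2_XATTR_BLOCK_CREATE_CREDITS is the credit count that function reserves):

	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
	handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
	ret = ocfs2_create_xattr_block(handle, inode, fe_bh,
				       meta_ac, &new_bh, 1 /* indexed */);
	ocfs2_commit_trans(osb, handle);
	ocfs2_free_alloc_context(meta_ac);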
2086/* 2323/*
2087 * ocfs2_xattr_block_set() 2324 * ocfs2_xattr_block_set()
2088 * 2325 *
@@ -2095,63 +2332,24 @@ static int ocfs2_xattr_block_set(struct inode *inode,
2095 struct ocfs2_xattr_set_ctxt *ctxt) 2332 struct ocfs2_xattr_set_ctxt *ctxt)
2096{ 2333{
2097 struct buffer_head *new_bh = NULL; 2334 struct buffer_head *new_bh = NULL;
2098 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2099 struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
2100 handle_t *handle = ctxt->handle; 2335 handle_t *handle = ctxt->handle;
2101 struct ocfs2_xattr_block *xblk = NULL; 2336 struct ocfs2_xattr_block *xblk = NULL;
2102 u16 suballoc_bit_start;
2103 u32 num_got;
2104 u64 first_blkno;
2105 int ret; 2337 int ret;
2106 2338
2107 if (!xs->xattr_bh) { 2339 if (!xs->xattr_bh) {
2108 ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, 2340 ret = ocfs2_create_xattr_block(handle, inode, xs->inode_bh,
2109 OCFS2_JOURNAL_ACCESS_CREATE); 2341 ctxt->meta_ac, &new_bh, 0);
2110 if (ret < 0) { 2342 if (ret) {
2111 mlog_errno(ret);
2112 goto end;
2113 }
2114
2115 ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
2116 &suballoc_bit_start, &num_got,
2117 &first_blkno);
2118 if (ret < 0) {
2119 mlog_errno(ret);
2120 goto end;
2121 }
2122
2123 new_bh = sb_getblk(inode->i_sb, first_blkno);
2124 ocfs2_set_new_buffer_uptodate(inode, new_bh);
2125
2126 ret = ocfs2_journal_access_xb(handle, inode, new_bh,
2127 OCFS2_JOURNAL_ACCESS_CREATE);
2128 if (ret < 0) {
2129 mlog_errno(ret); 2343 mlog_errno(ret);
2130 goto end; 2344 goto end;
2131 } 2345 }
2132 2346
2133 /* Initialize ocfs2_xattr_block */
2134 xs->xattr_bh = new_bh; 2347 xs->xattr_bh = new_bh;
2135 xblk = (struct ocfs2_xattr_block *)new_bh->b_data; 2348 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
2136 memset(xblk, 0, inode->i_sb->s_blocksize);
2137 strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
2138 xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
2139 xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
2140 xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
2141 xblk->xb_blkno = cpu_to_le64(first_blkno);
2142
2143 xs->header = &xblk->xb_attrs.xb_header; 2349 xs->header = &xblk->xb_attrs.xb_header;
2144 xs->base = (void *)xs->header; 2350 xs->base = (void *)xs->header;
2145 xs->end = (void *)xblk + inode->i_sb->s_blocksize; 2351 xs->end = (void *)xblk + inode->i_sb->s_blocksize;
2146 xs->here = xs->header->xh_entries; 2352 xs->here = xs->header->xh_entries;
2147
2148 ret = ocfs2_journal_dirty(handle, new_bh);
2149 if (ret < 0) {
2150 mlog_errno(ret);
2151 goto end;
2152 }
2153 di->i_xattr_loc = cpu_to_le64(first_blkno);
2154 ocfs2_journal_dirty(handle, xs->inode_bh);
2155 } else 2353 } else
2156 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; 2354 xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
2157 2355
@@ -2273,7 +2471,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
2273 old_in_xb = 1; 2471 old_in_xb = 1;
2274 2472
2275 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { 2473 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
2276 ret = ocfs2_xattr_bucket_get_name_value(inode, 2474 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
2277 bucket_xh(xbs->bucket), 2475 bucket_xh(xbs->bucket),
2278 i, &block_off, 2476 i, &block_off,
2279 &name_offset); 2477 &name_offset);
@@ -2428,6 +2626,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2428 struct ocfs2_xattr_search *xis, 2626 struct ocfs2_xattr_search *xis,
2429 struct ocfs2_xattr_search *xbs, 2627 struct ocfs2_xattr_search *xbs,
2430 struct ocfs2_xattr_set_ctxt *ctxt, 2628 struct ocfs2_xattr_set_ctxt *ctxt,
2629 int extra_meta,
2431 int *credits) 2630 int *credits)
2432{ 2631{
2433 int clusters_add, meta_add, ret; 2632 int clusters_add, meta_add, ret;
@@ -2444,6 +2643,7 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
2444 return ret; 2643 return ret;
2445 } 2644 }
2446 2645
2646 meta_add += extra_meta;
2447 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " 2647 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
2448 "credits = %d\n", xi->name, meta_add, clusters_add, *credits); 2648 "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
2449 2649
@@ -2598,7 +2798,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
2598 2798
2599 if (!ret) { 2799 if (!ret) {
2600 /* Update inode ctime. */ 2800 /* Update inode ctime. */
2601 ret = ocfs2_journal_access_di(ctxt->handle, inode, 2801 ret = ocfs2_journal_access_di(ctxt->handle, INODE_CACHE(inode),
2602 xis->inode_bh, 2802 xis->inode_bh,
2603 OCFS2_JOURNAL_ACCESS_WRITE); 2803 OCFS2_JOURNAL_ACCESS_WRITE);
2604 if (ret) { 2804 if (ret) {
@@ -2711,10 +2911,11 @@ int ocfs2_xattr_set(struct inode *inode,
2711{ 2911{
2712 struct buffer_head *di_bh = NULL; 2912 struct buffer_head *di_bh = NULL;
2713 struct ocfs2_dinode *di; 2913 struct ocfs2_dinode *di;
2714 int ret, credits; 2914 int ret, credits, ref_meta = 0, ref_credits = 0;
2715 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2915 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2716 struct inode *tl_inode = osb->osb_tl_inode; 2916 struct inode *tl_inode = osb->osb_tl_inode;
2717 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; 2917 struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
2918 struct ocfs2_refcount_tree *ref_tree = NULL;
2718 2919
2719 struct ocfs2_xattr_info xi = { 2920 struct ocfs2_xattr_info xi = {
2720 .name_index = name_index, 2921 .name_index = name_index,
@@ -2779,6 +2980,17 @@ int ocfs2_xattr_set(struct inode *inode,
2779 goto cleanup; 2980 goto cleanup;
2780 } 2981 }
2781 2982
2983 /* Check whether the value is refcounted and do some preparation. */
2984 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
2985 (!xis.not_found || !xbs.not_found)) {
2986 ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
2987 &xis, &xbs, &ref_tree,
2988 &ref_meta, &ref_credits);
2989 if (ret) {
2990 mlog_errno(ret);
2991 goto cleanup;
2992 }
2993 }
2782 2994
2783 mutex_lock(&tl_inode->i_mutex); 2995 mutex_lock(&tl_inode->i_mutex);
2784 2996
@@ -2793,7 +3005,7 @@ int ocfs2_xattr_set(struct inode *inode,
2793 mutex_unlock(&tl_inode->i_mutex); 3005 mutex_unlock(&tl_inode->i_mutex);
2794 3006
2795 ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, 3007 ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
2796 &xbs, &ctxt, &credits); 3008 &xbs, &ctxt, ref_meta, &credits);
2797 if (ret) { 3009 if (ret) {
2798 mlog_errno(ret); 3010 mlog_errno(ret);
2799 goto cleanup; 3011 goto cleanup;
@@ -2801,7 +3013,7 @@ int ocfs2_xattr_set(struct inode *inode,
2801 3013
2802 /* we need to update inode's ctime field, so add credit for it. */ 3014 /* we need to update inode's ctime field, so add credit for it. */
2803 credits += OCFS2_INODE_UPDATE_CREDITS; 3015 credits += OCFS2_INODE_UPDATE_CREDITS;
2804 ctxt.handle = ocfs2_start_trans(osb, credits); 3016 ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
2805 if (IS_ERR(ctxt.handle)) { 3017 if (IS_ERR(ctxt.handle)) {
2806 ret = PTR_ERR(ctxt.handle); 3018 ret = PTR_ERR(ctxt.handle);
2807 mlog_errno(ret); 3019 mlog_errno(ret);
@@ -2819,8 +3031,16 @@ int ocfs2_xattr_set(struct inode *inode,
2819 if (ocfs2_dealloc_has_cluster(&ctxt.dealloc)) 3031 if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
2820 ocfs2_schedule_truncate_log_flush(osb, 1); 3032 ocfs2_schedule_truncate_log_flush(osb, 1);
2821 ocfs2_run_deallocs(osb, &ctxt.dealloc); 3033 ocfs2_run_deallocs(osb, &ctxt.dealloc);
3034
2822cleanup: 3035cleanup:
3036 if (ref_tree)
3037 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
2823 up_write(&OCFS2_I(inode)->ip_xattr_sem); 3038 up_write(&OCFS2_I(inode)->ip_xattr_sem);
3039 if (!value && !ret) {
3040 ret = ocfs2_try_remove_refcount_tree(inode, di_bh);
3041 if (ret)
3042 mlog_errno(ret);
3043 }
2824 ocfs2_inode_unlock(inode, 1); 3044 ocfs2_inode_unlock(inode, 1);
2825cleanup_nolock: 3045cleanup_nolock:
2826 brelse(di_bh); 3046 brelse(di_bh);
@@ -2849,7 +3069,8 @@ static int ocfs2_xattr_get_rec(struct inode *inode,
2849 u64 e_blkno = 0; 3069 u64 e_blkno = 0;
2850 3070
2851 if (el->l_tree_depth) { 3071 if (el->l_tree_depth) {
2852 ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh); 3072 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, name_hash,
3073 &eb_bh);
2853 if (ret) { 3074 if (ret) {
2854 mlog_errno(ret); 3075 mlog_errno(ret);
2855 goto out; 3076 goto out;
@@ -2931,7 +3152,7 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
2931 if (cmp) 3152 if (cmp)
2932 continue; 3153 continue;
2933 3154
2934 ret = ocfs2_xattr_bucket_get_name_value(inode, 3155 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
2935 xh, 3156 xh,
2936 i, 3157 i,
2937 &block_off, 3158 &block_off,
@@ -3175,7 +3396,7 @@ struct ocfs2_xattr_tree_list {
3175 size_t result; 3396 size_t result;
3176}; 3397};
3177 3398
3178static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, 3399static int ocfs2_xattr_bucket_get_name_value(struct super_block *sb,
3179 struct ocfs2_xattr_header *xh, 3400 struct ocfs2_xattr_header *xh,
3180 int index, 3401 int index,
3181 int *block_off, 3402 int *block_off,
@@ -3188,8 +3409,8 @@ static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
3188 3409
3189 name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset); 3410 name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
3190 3411
3191 *block_off = name_offset >> inode->i_sb->s_blocksize_bits; 3412 *block_off = name_offset >> sb->s_blocksize_bits;
3192 *new_offset = name_offset % inode->i_sb->s_blocksize; 3413 *new_offset = name_offset % sb->s_blocksize;
3193 3414
3194 return 0; 3415 return 0;
3195} 3416}
@@ -3209,7 +3430,7 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
3209 prefix = ocfs2_xattr_prefix(type); 3430 prefix = ocfs2_xattr_prefix(type);
3210 3431
3211 if (prefix) { 3432 if (prefix) {
3212 ret = ocfs2_xattr_bucket_get_name_value(inode, 3433 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
3213 bucket_xh(bucket), 3434 bucket_xh(bucket),
3214 i, 3435 i,
3215 &block_off, 3436 &block_off,
@@ -3232,22 +3453,19 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
3232 return ret; 3453 return ret;
3233} 3454}
3234 3455
3235static int ocfs2_xattr_tree_list_index_block(struct inode *inode, 3456static int ocfs2_iterate_xattr_index_block(struct inode *inode,
3236 struct ocfs2_xattr_tree_root *xt, 3457 struct buffer_head *blk_bh,
3237 char *buffer, 3458 xattr_tree_rec_func *rec_func,
3238 size_t buffer_size) 3459 void *para)
3239{ 3460{
3240 struct ocfs2_extent_list *el = &xt->xt_list; 3461 struct ocfs2_xattr_block *xb =
3462 (struct ocfs2_xattr_block *)blk_bh->b_data;
3463 struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
3241 int ret = 0; 3464 int ret = 0;
3242 u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0; 3465 u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
3243 u64 p_blkno = 0; 3466 u64 p_blkno = 0;
3244 struct ocfs2_xattr_tree_list xl = {
3245 .buffer = buffer,
3246 .buffer_size = buffer_size,
3247 .result = 0,
3248 };
3249 3467
3250 if (le16_to_cpu(el->l_next_free_rec) == 0) 3468 if (!el->l_next_free_rec || !rec_func)
3251 return 0; 3469 return 0;
3252 3470
3253 while (name_hash > 0) { 3471 while (name_hash > 0) {
@@ -3255,16 +3473,15 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
3255 &e_cpos, &num_clusters, el); 3473 &e_cpos, &num_clusters, el);
3256 if (ret) { 3474 if (ret) {
3257 mlog_errno(ret); 3475 mlog_errno(ret);
3258 goto out; 3476 break;
3259 } 3477 }
3260 3478
3261 ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, 3479 ret = rec_func(inode, blk_bh, p_blkno, e_cpos,
3262 ocfs2_list_xattr_bucket, 3480 num_clusters, para);
3263 &xl);
3264 if (ret) { 3481 if (ret) {
3265 if (ret != -ERANGE) 3482 if (ret != -ERANGE)
3266 mlog_errno(ret); 3483 mlog_errno(ret);
3267 goto out; 3484 break;
3268 } 3485 }
3269 3486
3270 if (e_cpos == 0) 3487 if (e_cpos == 0)
@@ -3273,6 +3490,37 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
3273 name_hash = e_cpos - 1; 3490 name_hash = e_cpos - 1;
3274 } 3491 }
3275 3492
3493 return ret;
3494
3495}
3496
3497static int ocfs2_list_xattr_tree_rec(struct inode *inode,
3498 struct buffer_head *root_bh,
3499 u64 blkno, u32 cpos, u32 len, void *para)
3500{
3501 return ocfs2_iterate_xattr_buckets(inode, blkno, len,
3502 ocfs2_list_xattr_bucket, para);
3503}
3504
3505static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
3506 struct buffer_head *blk_bh,
3507 char *buffer,
3508 size_t buffer_size)
3509{
3510 int ret;
3511 struct ocfs2_xattr_tree_list xl = {
3512 .buffer = buffer,
3513 .buffer_size = buffer_size,
3514 .result = 0,
3515 };
3516
3517 ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
3518 ocfs2_list_xattr_tree_rec, &xl);
3519 if (ret) {
3520 mlog_errno(ret);
3521 goto out;
3522 }
3523
3276 ret = xl.result; 3524 ret = xl.result;
3277out: 3525out:
3278 return ret; 3526 return ret;
@@ -3426,7 +3674,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
3426 */ 3674 */
3427 down_write(&oi->ip_alloc_sem); 3675 down_write(&oi->ip_alloc_sem);
3428 3676
3429 ret = ocfs2_journal_access_xb(handle, inode, xb_bh, 3677 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), xb_bh,
3430 OCFS2_JOURNAL_ACCESS_WRITE); 3678 OCFS2_JOURNAL_ACCESS_WRITE);
3431 if (ret) { 3679 if (ret) {
3432 mlog_errno(ret); 3680 mlog_errno(ret);
@@ -4263,9 +4511,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
4263 (unsigned long long)OCFS2_I(inode)->ip_blkno, 4511 (unsigned long long)OCFS2_I(inode)->ip_blkno,
4264 prev_cpos, (unsigned long long)bucket_blkno(first)); 4512 prev_cpos, (unsigned long long)bucket_blkno(first));
4265 4513
4266 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); 4514 ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
4267 4515
4268 ret = ocfs2_journal_access_xb(handle, inode, root_bh, 4516 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh,
4269 OCFS2_JOURNAL_ACCESS_WRITE); 4517 OCFS2_JOURNAL_ACCESS_WRITE);
4270 if (ret < 0) { 4518 if (ret < 0) {
4271 mlog_errno(ret); 4519 mlog_errno(ret);
@@ -4319,7 +4567,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
4319 4567
4320 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", 4568 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
4321 num_bits, (unsigned long long)block, v_start); 4569 num_bits, (unsigned long long)block, v_start);
4322 ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, 4570 ret = ocfs2_insert_extent(handle, &et, v_start, block,
4323 num_bits, 0, ctxt->meta_ac); 4571 num_bits, 0, ctxt->meta_ac);
4324 if (ret < 0) { 4572 if (ret < 0) {
4325 mlog_errno(ret); 4573 mlog_errno(ret);
@@ -4798,10 +5046,13 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4798 struct ocfs2_xattr_entry *xe = xs->here; 5046 struct ocfs2_xattr_entry *xe = xs->here;
4799 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); 5047 struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
4800 void *base; 5048 void *base;
5049 struct ocfs2_xattr_value_buf vb = {
5050 .vb_access = ocfs2_journal_access,
5051 };
4801 5052
4802 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); 5053 BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
4803 5054
4804 ret = ocfs2_xattr_bucket_get_name_value(inode, xh, 5055 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, xh,
4805 xe - xh->xh_entries, 5056 xe - xh->xh_entries,
4806 &block_off, 5057 &block_off,
4807 &offset); 5058 &offset);
@@ -4814,8 +5065,10 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
4814 xv = (struct ocfs2_xattr_value_root *)(base + offset + 5065 xv = (struct ocfs2_xattr_value_root *)(base + offset +
4815 OCFS2_XATTR_SIZE(xe->xe_name_len)); 5066 OCFS2_XATTR_SIZE(xe->xe_name_len));
4816 5067
5068 vb.vb_xv = xv;
5069 vb.vb_bh = xs->bucket->bu_bhs[block_off];
4817 ret = __ocfs2_xattr_set_value_outside(inode, handle, 5070 ret = __ocfs2_xattr_set_value_outside(inode, handle,
4818 xv, val, value_len); 5071 &vb, val, value_len);
4819 if (ret) 5072 if (ret)
4820 mlog_errno(ret); 5073 mlog_errno(ret);
4821out: 5074out:
@@ -4826,7 +5079,8 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
4826 struct buffer_head *root_bh, 5079 struct buffer_head *root_bh,
4827 u64 blkno, 5080 u64 blkno,
4828 u32 cpos, 5081 u32 cpos,
4829 u32 len) 5082 u32 len,
5083 void *para)
4830{ 5084{
4831 int ret; 5085 int ret;
4832 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5086 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -4838,14 +5092,22 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
4838 struct ocfs2_cached_dealloc_ctxt dealloc; 5092 struct ocfs2_cached_dealloc_ctxt dealloc;
4839 struct ocfs2_extent_tree et; 5093 struct ocfs2_extent_tree et;
4840 5094
4841 ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); 5095 ret = ocfs2_iterate_xattr_buckets(inode, blkno, len,
5096 ocfs2_delete_xattr_in_bucket, para);
5097 if (ret) {
5098 mlog_errno(ret);
5099 return ret;
5100 }
5101
5102 ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
4842 5103
4843 ocfs2_init_dealloc_ctxt(&dealloc); 5104 ocfs2_init_dealloc_ctxt(&dealloc);
4844 5105
4845 mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n", 5106 mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
4846 cpos, len, (unsigned long long)blkno); 5107 cpos, len, (unsigned long long)blkno);
4847 5108
4848 ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len); 5109 ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno,
5110 len);
4849 5111
4850 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); 5112 ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
4851 if (ret) { 5113 if (ret) {
@@ -4870,14 +5132,14 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
4870 goto out; 5132 goto out;
4871 } 5133 }
4872 5134
4873 ret = ocfs2_journal_access_xb(handle, inode, root_bh, 5135 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(inode), root_bh,
4874 OCFS2_JOURNAL_ACCESS_WRITE); 5136 OCFS2_JOURNAL_ACCESS_WRITE);
4875 if (ret) { 5137 if (ret) {
4876 mlog_errno(ret); 5138 mlog_errno(ret);
4877 goto out_commit; 5139 goto out_commit;
4878 } 5140 }
4879 5141
4880 ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, 5142 ret = ocfs2_remove_extent(handle, &et, cpos, len, meta_ac,
4881 &dealloc); 5143 &dealloc);
4882 if (ret) { 5144 if (ret) {
4883 mlog_errno(ret); 5145 mlog_errno(ret);
@@ -5220,7 +5482,7 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
5220 struct ocfs2_xattr_bucket *bucket, 5482 struct ocfs2_xattr_bucket *bucket,
5221 void *para) 5483 void *para)
5222{ 5484{
5223 int ret = 0; 5485 int ret = 0, ref_credits;
5224 struct ocfs2_xattr_header *xh = bucket_xh(bucket); 5486 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
5225 u16 i; 5487 u16 i;
5226 struct ocfs2_xattr_entry *xe; 5488 struct ocfs2_xattr_entry *xe;
@@ -5228,7 +5490,9 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
5228 struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,}; 5490 struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
5229 int credits = ocfs2_remove_extent_credits(osb->sb) + 5491 int credits = ocfs2_remove_extent_credits(osb->sb) +
5230 ocfs2_blocks_per_xattr_bucket(inode->i_sb); 5492 ocfs2_blocks_per_xattr_bucket(inode->i_sb);
5231 5493 struct ocfs2_xattr_value_root *xv;
5494 struct ocfs2_rm_xattr_bucket_para *args =
5495 (struct ocfs2_rm_xattr_bucket_para *)para;
5232 5496
5233 ocfs2_init_dealloc_ctxt(&ctxt.dealloc); 5497 ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
5234 5498
@@ -5237,7 +5501,16 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
5237 if (ocfs2_xattr_is_local(xe)) 5501 if (ocfs2_xattr_is_local(xe))
5238 continue; 5502 continue;
5239 5503
5240 ctxt.handle = ocfs2_start_trans(osb, credits); 5504 ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket,
5505 i, &xv, NULL);
 if (ret) {
 mlog_errno(ret);
 break;
 }
5506
5507 ret = ocfs2_lock_xattr_remove_allocators(inode, xv,
5508 args->ref_ci,
5509 args->ref_root_bh,
5510 &ctxt.meta_ac,
5511 &ref_credits);
 if (ret) {
 mlog_errno(ret);
 break;
 }
5512
5513 ctxt.handle = ocfs2_start_trans(osb, credits + ref_credits);
5241 if (IS_ERR(ctxt.handle)) { 5514 if (IS_ERR(ctxt.handle)) {
5242 ret = PTR_ERR(ctxt.handle); 5515 ret = PTR_ERR(ctxt.handle);
5243 mlog_errno(ret); 5516 mlog_errno(ret);
@@ -5248,57 +5521,1439 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
5248 i, 0, &ctxt); 5521 i, 0, &ctxt);
5249 5522
5250 ocfs2_commit_trans(osb, ctxt.handle); 5523 ocfs2_commit_trans(osb, ctxt.handle);
5524 if (ctxt.meta_ac) {
5525 ocfs2_free_alloc_context(ctxt.meta_ac);
5526 ctxt.meta_ac = NULL;
5527 }
5251 if (ret) { 5528 if (ret) {
5252 mlog_errno(ret); 5529 mlog_errno(ret);
5253 break; 5530 break;
5254 } 5531 }
5255 } 5532 }
5256 5533
5534 if (ctxt.meta_ac)
5535 ocfs2_free_alloc_context(ctxt.meta_ac);
5257 ocfs2_schedule_truncate_log_flush(osb, 1); 5536 ocfs2_schedule_truncate_log_flush(osb, 1);
5258 ocfs2_run_deallocs(osb, &ctxt.dealloc); 5537 ocfs2_run_deallocs(osb, &ctxt.dealloc);
5259 return ret; 5538 return ret;
5260} 5539}
5261 5540
5262static int ocfs2_delete_xattr_index_block(struct inode *inode, 5541/*
5263 struct buffer_head *xb_bh) 5542 * Whenever we modify a xattr value root in the bucket (e.g., CoW
5543 * or change the extent record flag), we need to recalculate
5544 * the metaecc for the whole bucket. So it is done here.
5545 *
5546 * Note:
5547 * The caller has to supply the extra credits for this journal access.
5548 */
5549static int ocfs2_xattr_bucket_post_refcount(struct inode *inode,
5550 handle_t *handle,
5551 void *para)
5552{
5553 int ret;
5554 struct ocfs2_xattr_bucket *bucket =
5555 (struct ocfs2_xattr_bucket *)para;
5556
5557 ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
5558 OCFS2_JOURNAL_ACCESS_WRITE);
5559 if (ret) {
5560 mlog_errno(ret);
5561 return ret;
5562 }
5563
5564 ocfs2_xattr_bucket_journal_dirty(handle, bucket);
5565
5566 return 0;
5567}
5568
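The hook above is registered through struct ocfs2_post_refcount, as done twice later in this hunk; a condensed sketch of that wiring (the hook is only needed when metaecc is enabled for the volume):

	struct ocfs2_post_refcount refcount = {
		.credits = bucket->bu_blocks,
		.para    = bucket,
		.func    = ocfs2_xattr_bucket_post_refcount,
	};

	if (ocfs2_meta_ecc(osb))
		p = &refcount;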
5569/*
5570 * Special action we need if the xattr value is refcounted.
5571 *
5572 * 1. If the xattr is refcounted, lock the tree.
5573 * 2. CoW the xattr if we are setting the new value and the value
5574 * will be stored outside.
5575 * 3. Otherwise, decrease_refcount will work for us, so just
5576 * lock the refcount tree and calculate the meta and credits.
5577 *
5578 * We have to do CoW before ocfs2_init_xattr_set_ctxt since CoW
5579 * currently runs as a self-contained transaction, while that function
5580 * also locks the allocators, which could deadlock us. So we CoW
5581 * the whole xattr value up front.
5582 */
5583static int ocfs2_prepare_refcount_xattr(struct inode *inode,
5584 struct ocfs2_dinode *di,
5585 struct ocfs2_xattr_info *xi,
5586 struct ocfs2_xattr_search *xis,
5587 struct ocfs2_xattr_search *xbs,
5588 struct ocfs2_refcount_tree **ref_tree,
5589 int *meta_add,
5590 int *credits)
5264{ 5591{
5265 struct ocfs2_xattr_block *xb =
5266 (struct ocfs2_xattr_block *)xb_bh->b_data;
5267 struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
5268 int ret = 0; 5592 int ret = 0;
5269 u32 name_hash = UINT_MAX, e_cpos, num_clusters; 5593 struct ocfs2_xattr_block *xb;
5270 u64 p_blkno; 5594 struct ocfs2_xattr_entry *xe;
5595 char *base;
5596 u32 p_cluster, num_clusters;
5597 unsigned int ext_flags;
5598 int name_offset, name_len;
5599 struct ocfs2_xattr_value_buf vb;
5600 struct ocfs2_xattr_bucket *bucket = NULL;
5601 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5602 struct ocfs2_post_refcount refcount;
5603 struct ocfs2_post_refcount *p = NULL;
5604 struct buffer_head *ref_root_bh = NULL;
5271 5605
5272 if (le16_to_cpu(el->l_next_free_rec) == 0) 5606 if (!xis->not_found) {
5273 return 0; 5607 xe = xis->here;
5608 name_offset = le16_to_cpu(xe->xe_name_offset);
5609 name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
5610 base = xis->base;
5611 vb.vb_bh = xis->inode_bh;
5612 vb.vb_access = ocfs2_journal_access_di;
5613 } else {
5614 int i, block_off = 0;
5615 xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
5616 xe = xbs->here;
5617 name_offset = le16_to_cpu(xe->xe_name_offset);
5618 name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
5619 i = xbs->here - xbs->header->xh_entries;
5274 5620
5275 while (name_hash > 0) { 5621 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
5276 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, 5622 ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
5277 &e_cpos, &num_clusters, el); 5623 bucket_xh(xbs->bucket),
5624 i, &block_off,
5625 &name_offset);
5626 if (ret) {
5627 mlog_errno(ret);
5628 goto out;
5629 }
5630 base = bucket_block(xbs->bucket, block_off);
5631 vb.vb_bh = xbs->bucket->bu_bhs[block_off];
5632 vb.vb_access = ocfs2_journal_access;
5633
5634 if (ocfs2_meta_ecc(osb)) {
5635 /* Create parameters for ocfs2_post_refcount. */
5636 bucket = xbs->bucket;
5637 refcount.credits = bucket->bu_blocks;
5638 refcount.para = bucket;
5639 refcount.func =
5640 ocfs2_xattr_bucket_post_refcount;
5641 p = &refcount;
5642 }
5643 } else {
5644 base = xbs->base;
5645 vb.vb_bh = xbs->xattr_bh;
5646 vb.vb_access = ocfs2_journal_access_xb;
5647 }
5648 }
5649
5650 if (ocfs2_xattr_is_local(xe))
5651 goto out;
5652
5653 vb.vb_xv = (struct ocfs2_xattr_value_root *)
5654 (base + name_offset + name_len);
5655
5656 ret = ocfs2_xattr_get_clusters(inode, 0, &p_cluster,
5657 &num_clusters, &vb.vb_xv->xr_list,
5658 &ext_flags);
5659 if (ret) {
5660 mlog_errno(ret);
5661 goto out;
5662 }
5663
5664 /*
5665 * We just need to check the 1st extent record, since we always
5666 * CoW the whole xattr. So there shouldn't be a xattr with
5667 * some REFCOUNT extent recs after the 1st one.
5668 */
5669 if (!(ext_flags & OCFS2_EXT_REFCOUNTED))
5670 goto out;
5671
5672 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
5673 1, ref_tree, &ref_root_bh);
5674 if (ret) {
5675 mlog_errno(ret);
5676 goto out;
5677 }
5678
5679 /*
5680 * If we are deleting the xattr or the new size will be stored inside,
5681 * cool, leave it there, the xattr truncate process will remove them
5682 * for us (it still needs the refcount tree lock and the meta/credits).
5683 * The worst case is that every cluster truncate splits the refcount
5684 * tree and turns the original extent rec into 3, so we will need at
5685 * most 2 * clusters more extent recs.
5686 */
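	/*
	 * Worked example (hypothetical numbers): truncating a 4-cluster
	 * refcounted value may split the refcount tree at every cluster
	 * boundary; one extent rec can become three, i.e. two extra recs
	 * per cluster, so at most 2 * 4 = 8 extra extent recs are needed.
	 */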
5687 if (!xi->value || xi->value_len <= OCFS2_XATTR_INLINE_SIZE) {
5688
5689 ret = ocfs2_refcounted_xattr_delete_need(inode,
5690 &(*ref_tree)->rf_ci,
5691 ref_root_bh, vb.vb_xv,
5692 meta_add, credits);
5693 if (ret)
5694 mlog_errno(ret);
5695 goto out;
5696 }
5697
5698 ret = ocfs2_refcount_cow_xattr(inode, di, &vb,
5699 *ref_tree, ref_root_bh, 0,
5700 le32_to_cpu(vb.vb_xv->xr_clusters), p);
5701 if (ret)
5702 mlog_errno(ret);
5703
5704out:
5705 brelse(ref_root_bh);
5706 return ret;
5707}
5708
5709/*
5710 * Add the REFCOUNTED flags for all the extent rec in ocfs2_xattr_value_root.
5711 * The physical clusters will be added to refcount tree.
5712 */
5713static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
5714 struct ocfs2_xattr_value_root *xv,
5715 struct ocfs2_extent_tree *value_et,
5716 struct ocfs2_caching_info *ref_ci,
5717 struct buffer_head *ref_root_bh,
5718 struct ocfs2_cached_dealloc_ctxt *dealloc,
5719 struct ocfs2_post_refcount *refcount)
5720{
5721 int ret = 0;
5722 u32 clusters = le32_to_cpu(xv->xr_clusters);
5723 u32 cpos, p_cluster, num_clusters;
5724 struct ocfs2_extent_list *el = &xv->xr_list;
5725 unsigned int ext_flags;
5726
5727 cpos = 0;
5728 while (cpos < clusters) {
5729 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
5730 &num_clusters, el, &ext_flags);
5731
5732 cpos += num_clusters;
5733 if ((ext_flags & OCFS2_EXT_REFCOUNTED))
5734 continue;
5735
5736 BUG_ON(!p_cluster);
5737
5738 ret = ocfs2_add_refcount_flag(inode, value_et,
5739 ref_ci, ref_root_bh,
5740 cpos - num_clusters,
5741 p_cluster, num_clusters,
5742 dealloc, refcount);
5743 if (ret) {
5744 mlog_errno(ret);
5745 break;
5746 }
5747 }
5748
5749 return ret;
5750}
5751
5752/*
5753 * Given a normal ocfs2_xattr_header, refcount all the entries whose
5754 * value is stored outside.
5755 * Used for xattrs stored in inode and ocfs2_xattr_block.
5756 */
5757static int ocfs2_xattr_attach_refcount_normal(struct inode *inode,
5758 struct ocfs2_xattr_value_buf *vb,
5759 struct ocfs2_xattr_header *header,
5760 struct ocfs2_caching_info *ref_ci,
5761 struct buffer_head *ref_root_bh,
5762 struct ocfs2_cached_dealloc_ctxt *dealloc)
5763{
5764
5765 struct ocfs2_xattr_entry *xe;
5766 struct ocfs2_xattr_value_root *xv;
5767 struct ocfs2_extent_tree et;
5768 int i, ret = 0;
5769
5770 for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
5771 xe = &header->xh_entries[i];
5772
5773 if (ocfs2_xattr_is_local(xe))
5774 continue;
5775
5776 xv = (struct ocfs2_xattr_value_root *)((void *)header +
5777 le16_to_cpu(xe->xe_name_offset) +
5778 OCFS2_XATTR_SIZE(xe->xe_name_len));
5779
5780 vb->vb_xv = xv;
5781 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
5782
5783 ret = ocfs2_xattr_value_attach_refcount(inode, xv, &et,
5784 ref_ci, ref_root_bh,
5785 dealloc, NULL);
5786 if (ret) {
5787 mlog_errno(ret);
5788 break;
5789 }
5790 }
5791
5792 return ret;
5793}
5794
5795static int ocfs2_xattr_inline_attach_refcount(struct inode *inode,
5796 struct buffer_head *fe_bh,
5797 struct ocfs2_caching_info *ref_ci,
5798 struct buffer_head *ref_root_bh,
5799 struct ocfs2_cached_dealloc_ctxt *dealloc)
5800{
5801 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
5802 struct ocfs2_xattr_header *header = (struct ocfs2_xattr_header *)
5803 (fe_bh->b_data + inode->i_sb->s_blocksize -
5804 le16_to_cpu(di->i_xattr_inline_size));
5805 struct ocfs2_xattr_value_buf vb = {
5806 .vb_bh = fe_bh,
5807 .vb_access = ocfs2_journal_access_di,
5808 };
5809
5810 return ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
5811 ref_ci, ref_root_bh, dealloc);
5812}
5813
5814struct ocfs2_xattr_tree_value_refcount_para {
5815 struct ocfs2_caching_info *ref_ci;
5816 struct buffer_head *ref_root_bh;
5817 struct ocfs2_cached_dealloc_ctxt *dealloc;
5818};
5819
5820static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
5821 struct ocfs2_xattr_bucket *bucket,
5822 int offset,
5823 struct ocfs2_xattr_value_root **xv,
5824 struct buffer_head **bh)
5825{
5826 int ret, block_off, name_offset;
5827 struct ocfs2_xattr_header *xh = bucket_xh(bucket);
5828 struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
5829 void *base;
5830
5831 ret = ocfs2_xattr_bucket_get_name_value(sb,
5832 bucket_xh(bucket),
5833 offset,
5834 &block_off,
5835 &name_offset);
5836 if (ret) {
5837 mlog_errno(ret);
5838 goto out;
5839 }
5840
5841 base = bucket_block(bucket, block_off);
5842
5843 *xv = (struct ocfs2_xattr_value_root *)(base + name_offset +
5844 OCFS2_XATTR_SIZE(xe->xe_name_len));
5845
5846 if (bh)
5847 *bh = bucket->bu_bhs[block_off];
5848out:
5849 return ret;
5850}
5851
5852/*
5853 * For a given xattr bucket, refcount all the entries whose value
5854 * is stored outside.
5855 */
5856static int ocfs2_xattr_bucket_value_refcount(struct inode *inode,
5857 struct ocfs2_xattr_bucket *bucket,
5858 void *para)
5859{
5860 int i, ret = 0;
5861 struct ocfs2_extent_tree et;
5862 struct ocfs2_xattr_tree_value_refcount_para *ref =
5863 (struct ocfs2_xattr_tree_value_refcount_para *)para;
5864 struct ocfs2_xattr_header *xh =
5865 (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
5866 struct ocfs2_xattr_entry *xe;
5867 struct ocfs2_xattr_value_buf vb = {
5868 .vb_access = ocfs2_journal_access,
5869 };
5870 struct ocfs2_post_refcount refcount = {
5871 .credits = bucket->bu_blocks,
5872 .para = bucket,
5873 .func = ocfs2_xattr_bucket_post_refcount,
5874 };
5875 struct ocfs2_post_refcount *p = NULL;
5876
5877 /* We only need post_refcount if we support metaecc. */
5878 if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
5879 p = &refcount;
5880
5881 mlog(0, "refcount bucket %llu, count = %u\n",
5882 (unsigned long long)bucket_blkno(bucket),
5883 le16_to_cpu(xh->xh_count));
5884 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
5885 xe = &xh->xh_entries[i];
5886
5887 if (ocfs2_xattr_is_local(xe))
5888 continue;
5889
5890 ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i,
5891 &vb.vb_xv, &vb.vb_bh);
5892 if (ret) {
5893 mlog_errno(ret);
5894 break;
5895 }
5896
5897 ocfs2_init_xattr_value_extent_tree(&et,
5898 INODE_CACHE(inode), &vb);
5899
5900 ret = ocfs2_xattr_value_attach_refcount(inode, vb.vb_xv,
5901 &et, ref->ref_ci,
5902 ref->ref_root_bh,
5903 ref->dealloc, p);
5904 if (ret) {
5905 mlog_errno(ret);
5906 break;
5907 }
5908 }
5909
5910 return ret;
5911
5912}
5913
5914static int ocfs2_refcount_xattr_tree_rec(struct inode *inode,
5915 struct buffer_head *root_bh,
5916 u64 blkno, u32 cpos, u32 len, void *para)
5917{
5918 return ocfs2_iterate_xattr_buckets(inode, blkno, len,
5919 ocfs2_xattr_bucket_value_refcount,
5920 para);
5921}
5922
5923static int ocfs2_xattr_block_attach_refcount(struct inode *inode,
5924 struct buffer_head *blk_bh,
5925 struct ocfs2_caching_info *ref_ci,
5926 struct buffer_head *ref_root_bh,
5927 struct ocfs2_cached_dealloc_ctxt *dealloc)
5928{
5929 int ret = 0;
5930 struct ocfs2_xattr_block *xb =
5931 (struct ocfs2_xattr_block *)blk_bh->b_data;
5932
5933 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
5934 struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
5935 struct ocfs2_xattr_value_buf vb = {
5936 .vb_bh = blk_bh,
5937 .vb_access = ocfs2_journal_access_xb,
5938 };
5939
5940 ret = ocfs2_xattr_attach_refcount_normal(inode, &vb, header,
5941 ref_ci, ref_root_bh,
5942 dealloc);
5943 } else {
5944 struct ocfs2_xattr_tree_value_refcount_para para = {
5945 .ref_ci = ref_ci,
5946 .ref_root_bh = ref_root_bh,
5947 .dealloc = dealloc,
5948 };
5949
5950 ret = ocfs2_iterate_xattr_index_block(inode, blk_bh,
5951 ocfs2_refcount_xattr_tree_rec,
5952 &para);
5953 }
5954
5955 return ret;
5956}
5957
5958int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
5959 struct buffer_head *fe_bh,
5960 struct ocfs2_caching_info *ref_ci,
5961 struct buffer_head *ref_root_bh,
5962 struct ocfs2_cached_dealloc_ctxt *dealloc)
5963{
5964 int ret = 0;
5965 struct ocfs2_inode_info *oi = OCFS2_I(inode);
5966 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
5967 struct buffer_head *blk_bh = NULL;
5968
5969 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
5970 ret = ocfs2_xattr_inline_attach_refcount(inode, fe_bh,
5971 ref_ci, ref_root_bh,
5972 dealloc);
5278 if (ret) { 5973 if (ret) {
5279 mlog_errno(ret); 5974 mlog_errno(ret);
5280 goto out; 5975 goto out;
5281 } 5976 }
5977 }
5978
5979 if (!di->i_xattr_loc)
5980 goto out;
5981
5982 ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
5983 &blk_bh);
5984 if (ret < 0) {
5985 mlog_errno(ret);
5986 goto out;
5987 }
5988
5989 ret = ocfs2_xattr_block_attach_refcount(inode, blk_bh, ref_ci,
5990 ref_root_bh, dealloc);
5991 if (ret)
5992 mlog_errno(ret);
5993
5994 brelse(blk_bh);
5995out:
5996
5997 return ret;
5998}
5999
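This entry point is meant to be driven from the reflink path, which is outside this hunk. A hypothetical caller, with refcount tree locking and error handling elided, might look like:

	struct ocfs2_cached_dealloc_ctxt dealloc;

	ocfs2_init_dealloc_ctxt(&dealloc);
	ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
					       &ref_tree->rf_ci,
					       ref_root_bh, &dealloc);
	ocfs2_run_deallocs(osb, &dealloc);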
6000typedef int (should_xattr_reflinked)(struct ocfs2_xattr_entry *xe);
6001/*
6002 * Store the information we need in xattr reflink.
6003 * old_bh and new_bh are inode bh for the old and new inode.
6004 */
6005struct ocfs2_xattr_reflink {
6006 struct inode *old_inode;
6007 struct inode *new_inode;
6008 struct buffer_head *old_bh;
6009 struct buffer_head *new_bh;
6010 struct ocfs2_caching_info *ref_ci;
6011 struct buffer_head *ref_root_bh;
6012 struct ocfs2_cached_dealloc_ctxt *dealloc;
6013 should_xattr_reflinked *xattr_reflinked;
6014};
6015
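No instance of the should_xattr_reflinked filter appears in this hunk. As an illustration, a hypothetical filter that reflinks everything except "security." entries (ocfs2_xattr_get_type() and OCFS2_XATTR_INDEX_SECURITY come from the existing ocfs2 headers) could be:

static int example_xattr_reflinked(struct ocfs2_xattr_entry *xe)
{
	/* Skip security xattrs so the new inode can set up its own. */
	return ocfs2_xattr_get_type(xe) != OCFS2_XATTR_INDEX_SECURITY;
}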
6016/*
6017 * Given a xattr header and xe offset,
6018 * return the proper xv and the corresponding bh.
6019 * Xattrs in the inode, block and xattr tree have different implementations.
6020 */
6021typedef int (get_xattr_value_root)(struct super_block *sb,
6022 struct buffer_head *bh,
6023 struct ocfs2_xattr_header *xh,
6024 int offset,
6025 struct ocfs2_xattr_value_root **xv,
6026 struct buffer_head **ret_bh,
6027 void *para);
6028
6029/*
6030 * Calculate all the xattr value root metadata stored in this xattr header
6031 * and the credits we need if we create them from scratch.
6032 * We use get_xattr_value_root so that all types of xattr container can use it.
6033 */
6034static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
6035 struct buffer_head *bh,
6036 struct ocfs2_xattr_header *xh,
6037 int *metas, int *credits,
6038 int *num_recs,
6039 get_xattr_value_root *func,
6040 void *para)
6041{
6042 int i, ret = 0;
6043 struct ocfs2_xattr_value_root *xv;
6044 struct ocfs2_xattr_entry *xe;
6045
6046 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
6047 xe = &xh->xh_entries[i];
6048 if (ocfs2_xattr_is_local(xe))
6049 continue;
6050
6051 ret = func(sb, bh, xh, i, &xv, NULL, para);
6052 if (ret) {
6053 mlog_errno(ret);
6054 break;
6055 }
6056
6057 *metas += le16_to_cpu(xv->xr_list.l_tree_depth) *
6058 le16_to_cpu(xv->xr_list.l_next_free_rec);
6059
6060 *credits += ocfs2_calc_extend_credits(sb,
6061 &def_xv.xv.xr_list,
6062 le32_to_cpu(xv->xr_clusters));
6063
6064 /*
6065 * If the value is a tree with a non-zero depth, we don't descend
6066 * into the extent blocks, so just calculate a maximum record count.
6067 */
6068 if (!xv->xr_list.l_tree_depth)
6069 *num_recs += xv->xr_list.l_next_free_rec;
6070 else
6071 *num_recs += ocfs2_clusters_for_bytes(sb,
6072 XATTR_SIZE_MAX);
6073 }
6074
6075 return ret;
6076}
6077
6078/* Used by xattr inode and block to return the right xv and buffer_head. */
6079static int ocfs2_get_xattr_value_root(struct super_block *sb,
6080 struct buffer_head *bh,
6081 struct ocfs2_xattr_header *xh,
6082 int offset,
6083 struct ocfs2_xattr_value_root **xv,
6084 struct buffer_head **ret_bh,
6085 void *para)
6086{
6087 struct ocfs2_xattr_entry *xe = &xh->xh_entries[offset];
6088
6089 *xv = (struct ocfs2_xattr_value_root *)((void *)xh +
6090 le16_to_cpu(xe->xe_name_offset) +
6091 OCFS2_XATTR_SIZE(xe->xe_name_len));
6092
6093 if (ret_bh)
6094 *ret_bh = bh;
6095
6096 return 0;
6097}
6098
6099/*
6100 * Lock the meta_ac and calculate how many credits we need for reflink xattrs.
6101 * It is only used for inline xattr and xattr block.
6102 */
6103static int ocfs2_reflink_lock_xattr_allocators(struct ocfs2_super *osb,
6104 struct ocfs2_xattr_header *xh,
6105 struct buffer_head *ref_root_bh,
6106 int *credits,
6107 struct ocfs2_alloc_context **meta_ac)
6108{
6109 int ret, meta_add = 0, num_recs = 0;
6110 struct ocfs2_refcount_block *rb =
6111 (struct ocfs2_refcount_block *)ref_root_bh->b_data;
6112
6113 *credits = 0;
6114
6115 ret = ocfs2_value_metas_in_xattr_header(osb->sb, NULL, xh,
6116 &meta_add, credits, &num_recs,
6117 ocfs2_get_xattr_value_root,
6118 NULL);
6119 if (ret) {
6120 mlog_errno(ret);
6121 goto out;
6122 }
6123
6124 /*
6125 * We need to add/modify num_recs in refcount tree, so just calculate
6126 * an approximate number we need for refcount tree change.
6127 * Sometimes we need to split the tree; after a split, half the recs
6128 * move to the new block, so a new block can only provide half its
6129 * number of recs. Hence we multiply the number of new blocks by 2.
6130 */
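	/*
	 * Worked example (hypothetical figures): with num_recs = 100 and
	 * 50 refcount recs per block, we reserve 100 / 50 * 2 = 4 new
	 * refcount blocks, since a freshly split block is only half full.
	 */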
6131 num_recs = num_recs / ocfs2_refcount_recs_per_rb(osb->sb) * 2;
6132 meta_add += num_recs;
6133 *credits += num_recs + num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
6134 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
6135 *credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
6136 le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
6137 else
6138 *credits += 1;
6139
6140 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, meta_ac);
6141 if (ret)
6142 mlog_errno(ret);
6143
6144out:
6145 return ret;
6146}
5282 6147
5283 ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, 6148/*
5284 ocfs2_delete_xattr_in_bucket, 6149 * Given a xattr header, reflink all the xattrs in this container.
5285 NULL); 6150 * It can be used for inode, block and bucket.
6151 *
6152 * NOTE:
6153 * Before we call this function, the caller must have memcpy'd the
6154 * xattrs in old_xh to the new_xh.
6155 *
6156 * If args.xattr_reflinked is set, call it to decide whether the xe should
6157 * be reflinked or not. If not, remove it from the new xattr header.
6158 */
6159static int ocfs2_reflink_xattr_header(handle_t *handle,
6160 struct ocfs2_xattr_reflink *args,
6161 struct buffer_head *old_bh,
6162 struct ocfs2_xattr_header *xh,
6163 struct buffer_head *new_bh,
6164 struct ocfs2_xattr_header *new_xh,
6165 struct ocfs2_xattr_value_buf *vb,
6166 struct ocfs2_alloc_context *meta_ac,
6167 get_xattr_value_root *func,
6168 void *para)
6169{
6170 int ret = 0, i, j;
6171 struct super_block *sb = args->old_inode->i_sb;
6172 struct buffer_head *value_bh;
6173 struct ocfs2_xattr_entry *xe, *last;
6174 struct ocfs2_xattr_value_root *xv, *new_xv;
6175 struct ocfs2_extent_tree data_et;
6176 u32 clusters, cpos, p_cluster, num_clusters;
6177 unsigned int ext_flags = 0;
6178
6179 mlog(0, "reflink xattr in container %llu, count = %u\n",
6180 (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count));
6181
6182 last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)];
6183 for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) {
6184 xe = &xh->xh_entries[i];
6185
6186 if (args->xattr_reflinked && !args->xattr_reflinked(xe)) {
6187 xe = &new_xh->xh_entries[j];
6188
6189 le16_add_cpu(&new_xh->xh_count, -1);
6190 if (new_xh->xh_count) {
6191 memmove(xe, xe + 1,
6192 (void *)last - (void *)xe);
6193 memset(last, 0,
6194 sizeof(struct ocfs2_xattr_entry));
6195 }
6196
6197 /*
6198 * We don't want j to increase in the next round, since the next
6199 * entry has already been moved into this slot.
6200 */
6201 j--;
6202 continue;
6203 }
6204
6205 if (ocfs2_xattr_is_local(xe))
6206 continue;
6207
6208 ret = func(sb, old_bh, xh, i, &xv, NULL, para);
6209 if (ret) {
6210 mlog_errno(ret);
6211 break;
6212 }
6213
6214 ret = func(sb, new_bh, new_xh, j, &new_xv, &value_bh, para);
6215 if (ret) {
6216 mlog_errno(ret);
6217 break;
6218 }
6219
6220 /*
6221 * For the xattr which has l_tree_depth = 0, all the extent
6222 * recs have already been copied to the new xh with the
6223 * appropriate OCFS2_EXT_REFCOUNTED flag; we just need to
6224 * increase the refcount in the refcount tree.
6225 *
6226 * For the xattr which has l_tree_depth > 0, we need
6227 * to initialize it to the empty default value root,
6228 * and then insert the extents one by one.
6229 */
6230 if (xv->xr_list.l_tree_depth) {
6231 memcpy(new_xv, &def_xv, sizeof(def_xv));
6232 vb->vb_xv = new_xv;
6233 vb->vb_bh = value_bh;
6234 ocfs2_init_xattr_value_extent_tree(&data_et,
6235 INODE_CACHE(args->new_inode), vb);
6236 }
6237
6238 clusters = le32_to_cpu(xv->xr_clusters);
6239 cpos = 0;
6240 while (cpos < clusters) {
6241 ret = ocfs2_xattr_get_clusters(args->old_inode,
6242 cpos,
6243 &p_cluster,
6244 &num_clusters,
6245 &xv->xr_list,
6246 &ext_flags);
6247 if (ret) {
6248 mlog_errno(ret);
6249 goto out;
6250 }
6251
6252 BUG_ON(!p_cluster);
6253
6254 if (xv->xr_list.l_tree_depth) {
6255 ret = ocfs2_insert_extent(handle,
6256 &data_et, cpos,
6257 ocfs2_clusters_to_blocks(
6258 args->old_inode->i_sb,
6259 p_cluster),
6260 num_clusters, ext_flags,
6261 meta_ac);
6262 if (ret) {
6263 mlog_errno(ret);
6264 goto out;
6265 }
6266 }
6267
6268 ret = ocfs2_increase_refcount(handle, args->ref_ci,
6269 args->ref_root_bh,
6270 p_cluster, num_clusters,
6271 meta_ac, args->dealloc);
6272 if (ret) {
6273 mlog_errno(ret);
6274 goto out;
6275 }
6276
6277 cpos += num_clusters;
6278 }
6279 }
6280
6281out:
6282 return ret;
6283}
6284
6285static int ocfs2_reflink_xattr_inline(struct ocfs2_xattr_reflink *args)
6286{
6287 int ret = 0, credits = 0;
6288 handle_t *handle;
6289 struct ocfs2_super *osb = OCFS2_SB(args->old_inode->i_sb);
6290 struct ocfs2_dinode *di = (struct ocfs2_dinode *)args->old_bh->b_data;
6291 int inline_size = le16_to_cpu(di->i_xattr_inline_size);
6292 int header_off = osb->sb->s_blocksize - inline_size;
6293 struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)
6294 (args->old_bh->b_data + header_off);
6295 struct ocfs2_xattr_header *new_xh = (struct ocfs2_xattr_header *)
6296 (args->new_bh->b_data + header_off);
6297 struct ocfs2_alloc_context *meta_ac = NULL;
6298 struct ocfs2_inode_info *new_oi;
6299 struct ocfs2_dinode *new_di;
6300 struct ocfs2_xattr_value_buf vb = {
6301 .vb_bh = args->new_bh,
6302 .vb_access = ocfs2_journal_access_di,
6303 };
6304
6305 ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
6306 &credits, &meta_ac);
6307 if (ret) {
6308 mlog_errno(ret);
6309 goto out;
6310 }
6311
6312 handle = ocfs2_start_trans(osb, credits);
6313 if (IS_ERR(handle)) {
6314 ret = PTR_ERR(handle);
6315 mlog_errno(ret);
6316 goto out;
6317 }
6318
6319 ret = ocfs2_journal_access_di(handle, INODE_CACHE(args->new_inode),
6320 args->new_bh, OCFS2_JOURNAL_ACCESS_WRITE);
6321 if (ret) {
6322 mlog_errno(ret);
6323 goto out_commit;
6324 }
6325
6326 memcpy(args->new_bh->b_data + header_off,
6327 args->old_bh->b_data + header_off, inline_size);
6328
6329 new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
6330 new_di->i_xattr_inline_size = cpu_to_le16(inline_size);
6331
6332 ret = ocfs2_reflink_xattr_header(handle, args, args->old_bh, xh,
6333 args->new_bh, new_xh, &vb, meta_ac,
6334 ocfs2_get_xattr_value_root, NULL);
6335 if (ret) {
6336 mlog_errno(ret);
6337 goto out_commit;
6338 }
6339
6340 new_oi = OCFS2_I(args->new_inode);
6341 spin_lock(&new_oi->ip_lock);
6342 new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL | OCFS2_INLINE_XATTR_FL;
6343 new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
6344 spin_unlock(&new_oi->ip_lock);
6345
6346 ocfs2_journal_dirty(handle, args->new_bh);
6347
6348out_commit:
6349 ocfs2_commit_trans(osb, handle);
6350
6351out:
6352 if (meta_ac)
6353 ocfs2_free_alloc_context(meta_ac);
6354 return ret;
6355}
6356
6357static int ocfs2_create_empty_xattr_block(struct inode *inode,
6358 struct buffer_head *fe_bh,
6359 struct buffer_head **ret_bh,
6360 int indexed)
6361{
6362 int ret;
6363 handle_t *handle;
6364 struct ocfs2_alloc_context *meta_ac;
6365 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6366
6367 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
6368 if (ret < 0) {
6369 mlog_errno(ret);
6370 return ret;
6371 }
6372
6373 handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_CREATE_CREDITS);
6374 if (IS_ERR(handle)) {
6375 ret = PTR_ERR(handle);
6376 mlog_errno(ret);
6377 goto out;
6378 }
6379
6380 mlog(0, "create new xattr block for inode %llu, index = %d\n",
6381 (unsigned long long)fe_bh->b_blocknr, indexed);
6382 ret = ocfs2_create_xattr_block(handle, inode, fe_bh,
6383 meta_ac, ret_bh, indexed);
6384 if (ret)
6385 mlog_errno(ret);
6386
6387 ocfs2_commit_trans(osb, handle);
6388out:
6389 ocfs2_free_alloc_context(meta_ac);
6390 return ret;
6391}
6392
6393static int ocfs2_reflink_xattr_block(struct ocfs2_xattr_reflink *args,
6394 struct buffer_head *blk_bh,
6395 struct buffer_head *new_blk_bh)
6396{
6397 int ret = 0, credits = 0;
6398 handle_t *handle;
6399 struct ocfs2_inode_info *new_oi = OCFS2_I(args->new_inode);
6400 struct ocfs2_dinode *new_di;
6401 struct ocfs2_super *osb = OCFS2_SB(args->new_inode->i_sb);
6402 int header_off = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
6403 struct ocfs2_xattr_block *xb =
6404 (struct ocfs2_xattr_block *)blk_bh->b_data;
6405 struct ocfs2_xattr_header *xh = &xb->xb_attrs.xb_header;
6406 struct ocfs2_xattr_block *new_xb =
6407 (struct ocfs2_xattr_block *)new_blk_bh->b_data;
6408 struct ocfs2_xattr_header *new_xh = &new_xb->xb_attrs.xb_header;
6409 struct ocfs2_alloc_context *meta_ac;
6410 struct ocfs2_xattr_value_buf vb = {
6411 .vb_bh = new_blk_bh,
6412 .vb_access = ocfs2_journal_access_xb,
6413 };
6414
6415 ret = ocfs2_reflink_lock_xattr_allocators(osb, xh, args->ref_root_bh,
6416 &credits, &meta_ac);
6417 if (ret) {
6418 mlog_errno(ret);
6419 return ret;
6420 }
6421
6422 /* One more credit in case we need to set the xattr flag on the new inode. */
6423 handle = ocfs2_start_trans(osb, credits + 1);
6424 if (IS_ERR(handle)) {
6425 ret = PTR_ERR(handle);
6426 mlog_errno(ret);
6427 goto out;
6428 }
6429
6430 if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
6431 ret = ocfs2_journal_access_di(handle,
6432 INODE_CACHE(args->new_inode),
6433 args->new_bh,
6434 OCFS2_JOURNAL_ACCESS_WRITE);
6435 if (ret) {
6436 mlog_errno(ret);
6437 goto out_commit;
6438 }
6439 }
6440
6441 ret = ocfs2_journal_access_xb(handle, INODE_CACHE(args->new_inode),
6442 new_blk_bh, OCFS2_JOURNAL_ACCESS_WRITE);
6443 if (ret) {
6444 mlog_errno(ret);
6445 goto out_commit;
6446 }
6447
6448 memcpy(new_blk_bh->b_data + header_off, blk_bh->b_data + header_off,
6449 osb->sb->s_blocksize - header_off);
6450
6451 ret = ocfs2_reflink_xattr_header(handle, args, blk_bh, xh,
6452 new_blk_bh, new_xh, &vb, meta_ac,
6453 ocfs2_get_xattr_value_root, NULL);
6454 if (ret) {
6455 mlog_errno(ret);
6456 goto out_commit;
6457 }
6458
6459 ocfs2_journal_dirty(handle, new_blk_bh);
6460
6461 if (!(new_oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) {
6462 new_di = (struct ocfs2_dinode *)args->new_bh->b_data;
6463 spin_lock(&new_oi->ip_lock);
6464 new_oi->ip_dyn_features |= OCFS2_HAS_XATTR_FL;
6465 new_di->i_dyn_features = cpu_to_le16(new_oi->ip_dyn_features);
6466 spin_unlock(&new_oi->ip_lock);
6467
6468 ocfs2_journal_dirty(handle, args->new_bh);
6469 }
6470
6471out_commit:
6472 ocfs2_commit_trans(osb, handle);
6473
6474out:
6475 ocfs2_free_alloc_context(meta_ac);
6476 return ret;
6477}
6478
6479struct ocfs2_reflink_xattr_tree_args {
6480 struct ocfs2_xattr_reflink *reflink;
6481 struct buffer_head *old_blk_bh;
6482 struct buffer_head *new_blk_bh;
6483 struct ocfs2_xattr_bucket *old_bucket;
6484 struct ocfs2_xattr_bucket *new_bucket;
6485};
6486
6487/*
6488 * NOTE:
6489 * This function is called for both the old bucket and the new
6490 * bucket, so the caller must pass in the bh that matches the
6491 * bucket it wants the ret_bh for.
6492 */
6493static int ocfs2_get_reflink_xattr_value_root(struct super_block *sb,
6494 struct buffer_head *bh,
6495 struct ocfs2_xattr_header *xh,
6496 int offset,
6497 struct ocfs2_xattr_value_root **xv,
6498 struct buffer_head **ret_bh,
6499 void *para)
6500{
6501 struct ocfs2_reflink_xattr_tree_args *args =
6502 (struct ocfs2_reflink_xattr_tree_args *)para;
6503 struct ocfs2_xattr_bucket *bucket;
6504
6505 if (bh == args->old_bucket->bu_bhs[0])
6506 bucket = args->old_bucket;
6507 else
6508 bucket = args->new_bucket;
6509
6510 return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
6511 xv, ret_bh);
6512}
6513
6514struct ocfs2_value_tree_metas {
6515 int num_metas;
6516 int credits;
6517 int num_recs;
6518};
6519
6520static int ocfs2_value_tree_metas_in_bucket(struct super_block *sb,
6521 struct buffer_head *bh,
6522 struct ocfs2_xattr_header *xh,
6523 int offset,
6524 struct ocfs2_xattr_value_root **xv,
6525 struct buffer_head **ret_bh,
6526 void *para)
6527{
6528 struct ocfs2_xattr_bucket *bucket =
6529 (struct ocfs2_xattr_bucket *)para;
6530
6531 return ocfs2_get_xattr_tree_value_root(sb, bucket, offset,
6532 xv, ret_bh);
6533}
6534
6535static int ocfs2_calc_value_tree_metas(struct inode *inode,
6536 struct ocfs2_xattr_bucket *bucket,
6537 void *para)
6538{
6539 struct ocfs2_value_tree_metas *metas =
6540 (struct ocfs2_value_tree_metas *)para;
6541 struct ocfs2_xattr_header *xh =
6542 (struct ocfs2_xattr_header *)bucket->bu_bhs[0]->b_data;
6543
6544 /* Add the credits for this bucket first. */
6545 metas->credits += bucket->bu_blocks;
6546 return ocfs2_value_metas_in_xattr_header(inode->i_sb, bucket->bu_bhs[0],
6547 xh, &metas->num_metas,
6548 &metas->credits, &metas->num_recs,
6549 ocfs2_value_tree_metas_in_bucket,
6550 bucket);
6551}
6552
6553/*
6554 * Given an xattr extent rec starting at blkno and covering len clusters,
6555 * iterate over all the buckets, calculate how much metadata we need for
6556 * reflinking all the ocfs2_xattr_value_roots, and lock the allocators accordingly.
6557 */
6558static int ocfs2_lock_reflink_xattr_rec_allocators(
6559 struct ocfs2_reflink_xattr_tree_args *args,
6560 struct ocfs2_extent_tree *xt_et,
6561 u64 blkno, u32 len, int *credits,
6562 struct ocfs2_alloc_context **meta_ac,
6563 struct ocfs2_alloc_context **data_ac)
6564{
6565 int ret, num_free_extents;
6566 struct ocfs2_value_tree_metas metas;
6567 struct ocfs2_super *osb = OCFS2_SB(args->reflink->old_inode->i_sb);
6568 struct ocfs2_refcount_block *rb;
6569
6570 memset(&metas, 0, sizeof(metas));
6571
6572 ret = ocfs2_iterate_xattr_buckets(args->reflink->old_inode, blkno, len,
6573 ocfs2_calc_value_tree_metas, &metas);
6574 if (ret) {
6575 mlog_errno(ret);
6576 goto out;
6577 }
6578
6579 *credits = metas.credits;
6580
6581 /*
6582 * Calculate what we need for the refcount tree change.
6583 *
6584 * We need to add/modify num_recs in the refcount tree, so just calculate
6585 * an approximate number we need for the change.
6586 * Sometimes we need to split the tree; after a split, half of the recs
6587 * are moved to the new block, so a new block can only provide
6588 * half the number of recs. Hence we multiply the new blocks by 2.
6589 * In the end, we have to add credits for modifying the already
6590 * existing refcount block.
6591 */
6592 rb = (struct ocfs2_refcount_block *)args->reflink->ref_root_bh->b_data;
6593 metas.num_recs =
6594 (metas.num_recs + ocfs2_refcount_recs_per_rb(osb->sb) - 1) /
6595 ocfs2_refcount_recs_per_rb(osb->sb) * 2;
6596 metas.num_metas += metas.num_recs;
6597 *credits += metas.num_recs +
6598 metas.num_recs * OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
6599 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
6600 *credits += le16_to_cpu(rb->rf_list.l_tree_depth) *
6601 le16_to_cpu(rb->rf_list.l_next_free_rec) + 1;
6602 else
6603 *credits += 1;
6604
6605 /* Count in the credits for the xattr tree change. */
6606 num_free_extents = ocfs2_num_free_extents(osb, xt_et);
6607 if (num_free_extents < 0) {
6608 ret = num_free_extents;
6609 mlog_errno(ret);
6610 goto out;
6611 }
6612
6613 if (num_free_extents < len)
6614 metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);
6615
6616 *credits += ocfs2_calc_extend_credits(osb->sb,
6617 xt_et->et_root_el, len);
6618
6619 if (metas.num_metas) {
6620 ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas,
6621 meta_ac);
6622 if (ret) {
6623 mlog_errno(ret);
6624 goto out;
6625 }
6626 }
6627
6628 if (len) {
6629 ret = ocfs2_reserve_clusters(osb, len, data_ac);
6630 if (ret)
6631 mlog_errno(ret);
6632 }
6633out:
6634 if (ret) {
6635 if (*meta_ac) {
6636 ocfs2_free_alloc_context(*meta_ac);
6637 *meta_ac = NULL;
6638 }
6639 }
6640
6641 return ret;
6642}
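For illustration, a minimal user-space sketch of the rounding-and-doubling estimate described in the comment above. The figures (100 recs to add, 42 recs per refcount block) are made up; only the arithmetic mirrors the kernel code.

#include <stdio.h>

int main(void)
{
	int num_recs = 100;   /* hypothetical recs to add/modify */
	int recs_per_rb = 42; /* hypothetical recs per refcount block */

	/* round up to whole blocks, then double to allow for splits */
	int new_rbs = (num_recs + recs_per_rb - 1) / recs_per_rb * 2;

	printf("reserve %d new refcount blocks\n", new_rbs); /* prints 6 */
	return 0;
}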
6643
6644static int ocfs2_reflink_xattr_buckets(handle_t *handle,
6645 u64 blkno, u64 new_blkno, u32 clusters,
6646 struct ocfs2_alloc_context *meta_ac,
6647 struct ocfs2_alloc_context *data_ac,
6648 struct ocfs2_reflink_xattr_tree_args *args)
6649{
6650 int i, j, ret = 0;
6651 struct super_block *sb = args->reflink->old_inode->i_sb;
6652 u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
6653 u32 num_buckets = clusters * bpc;
6654 int bpb = args->old_bucket->bu_blocks;
6655 struct ocfs2_xattr_value_buf vb = {
6656 .vb_access = ocfs2_journal_access,
6657 };
6658
6659 for (i = 0; i < num_buckets; i++, blkno += bpb, new_blkno += bpb) {
6660 ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
6661 if (ret) {
6662 mlog_errno(ret);
6663 break;
6664 }
6665
6666 ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno);
6667 if (ret) {
6668 mlog_errno(ret);
6669 break;
6670 }
6671
6672 /*
6673 * The real bucket num in this series of blocks is stored
6674 * in the 1st bucket.
6675 */
6676 if (i == 0)
6677 num_buckets = le16_to_cpu(
6678 bucket_xh(args->old_bucket)->xh_num_buckets);
6679
6680 ret = ocfs2_xattr_bucket_journal_access(handle,
6681 args->new_bucket,
6682 OCFS2_JOURNAL_ACCESS_CREATE);
6683 if (ret) {
6684 mlog_errno(ret);
6685 break;
6686 }
6687
6688 for (j = 0; j < bpb; j++)
6689 memcpy(bucket_block(args->new_bucket, j),
6690 bucket_block(args->old_bucket, j),
6691 sb->s_blocksize);
6692
6693 ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
6694
6695 ret = ocfs2_reflink_xattr_header(handle, args->reflink,
6696 args->old_bucket->bu_bhs[0],
6697 bucket_xh(args->old_bucket),
6698 args->new_bucket->bu_bhs[0],
6699 bucket_xh(args->new_bucket),
6700 &vb, meta_ac,
6701 ocfs2_get_reflink_xattr_value_root,
6702 args);
6703 if (ret) {
6704 mlog_errno(ret);
6705 break;
6706 }
6707
6708 /*
6709 * Re-access and dirty the bucket to recalculate the metaecc,
6710 * because we may have extended the transaction in reflink_xattr_header,
6711 * which drops the journal access we already had on the block.
6712 */
6713 ret = ocfs2_xattr_bucket_journal_access(handle,
6714 args->new_bucket,
6715 OCFS2_JOURNAL_ACCESS_WRITE);
6716 if (ret) {
6717 mlog_errno(ret);
6718 break;
6719 }
6720
6721 ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
6722 ocfs2_xattr_bucket_relse(args->old_bucket);
6723 ocfs2_xattr_bucket_relse(args->new_bucket);
6724 }
6725
6726 ocfs2_xattr_bucket_relse(args->old_bucket);
6727 ocfs2_xattr_bucket_relse(args->new_bucket);
6728 return ret;
6729}
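A small stand-alone sketch of the loop bounds used above, under made-up geometry (4 blocks per bucket, 4 buckets per cluster). It only shows how num_buckets starts as an upper bound and is corrected from the first bucket's header, as the comment in the loop explains.

#include <stdio.h>

int main(void)
{
	unsigned int clusters = 2;        /* extent length in clusters */
	unsigned int bpc = 4;             /* buckets per cluster (assumed) */
	unsigned int bpb = 4;             /* blocks per bucket (assumed) */
	unsigned long long blkno = 1000;  /* extent's first block (made up) */

	/* upper bound until the real count is read from bucket 0 */
	unsigned int num_buckets = clusters * bpc;

	for (unsigned int i = 0; i < num_buckets; i++, blkno += bpb) {
		if (i == 0)
			num_buckets = 5;  /* pretend xh_num_buckets says 5 */
		printf("bucket %u at block %llu\n", i, blkno);
	}
	return 0;
}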
6730/*
6731 * Create the same xattr extent record in the new inode's xattr tree.
6732 */
6733static int ocfs2_reflink_xattr_rec(struct inode *inode,
6734 struct buffer_head *root_bh,
6735 u64 blkno,
6736 u32 cpos,
6737 u32 len,
6738 void *para)
6739{
6740 int ret, credits = 0;
6741 u32 p_cluster, num_clusters;
6742 u64 new_blkno;
6743 handle_t *handle;
6744 struct ocfs2_reflink_xattr_tree_args *args =
6745 (struct ocfs2_reflink_xattr_tree_args *)para;
6746 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6747 struct ocfs2_alloc_context *meta_ac = NULL;
6748 struct ocfs2_alloc_context *data_ac = NULL;
6749 struct ocfs2_extent_tree et;
6750
6751 ocfs2_init_xattr_tree_extent_tree(&et,
6752 INODE_CACHE(args->reflink->new_inode),
6753 args->new_blk_bh);
6754
6755 ret = ocfs2_lock_reflink_xattr_rec_allocators(args, &et, blkno,
6756 len, &credits,
6757 &meta_ac, &data_ac);
6758 if (ret) {
6759 mlog_errno(ret);
6760 goto out;
6761 }
6762
6763 handle = ocfs2_start_trans(osb, credits);
6764 if (IS_ERR(handle)) {
6765 ret = PTR_ERR(handle);
6766 mlog_errno(ret);
6767 goto out;
6768 }
6769
6770 ret = ocfs2_claim_clusters(osb, handle, data_ac,
6771 len, &p_cluster, &num_clusters);
6772 if (ret) {
6773 mlog_errno(ret);
6774 goto out_commit;
6775 }
6776
6777 new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
6778
6779 mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
6780 (unsigned long long)blkno, (unsigned long long)new_blkno, len);
6781 ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
6782 meta_ac, data_ac, args);
6783 if (ret) {
6784 mlog_errno(ret);
6785 goto out_commit;
6786 }
6787
6788 mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
6789 (unsigned long long)new_blkno, len, cpos);
6790 ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
6791 len, 0, meta_ac);
6792 if (ret)
6793 mlog_errno(ret);
6794
6795out_commit:
6796 ocfs2_commit_trans(osb, handle);
6797
6798out:
6799 if (meta_ac)
6800 ocfs2_free_alloc_context(meta_ac);
6801 if (data_ac)
6802 ocfs2_free_alloc_context(data_ac);
6803 return ret;
6804}
6805
6806/*
6807 * Create reflinked xattr buckets.
6808 * We add the buckets one by one, and bump the refcount on all the
6809 * xattr values in each bucket that are stored outside of it.
6810 */
6811static int ocfs2_reflink_xattr_tree(struct ocfs2_xattr_reflink *args,
6812 struct buffer_head *blk_bh,
6813 struct buffer_head *new_blk_bh)
6814{
6815 int ret;
6816 struct ocfs2_reflink_xattr_tree_args para;
6817
6818 memset(&para, 0, sizeof(para));
6819 para.reflink = args;
6820 para.old_blk_bh = blk_bh;
6821 para.new_blk_bh = new_blk_bh;
6822
6823 para.old_bucket = ocfs2_xattr_bucket_new(args->old_inode);
6824 if (!para.old_bucket) {
6825 mlog_errno(-ENOMEM);
6826 return -ENOMEM;
6827 }
6828
6829 para.new_bucket = ocfs2_xattr_bucket_new(args->new_inode);
6830 if (!para.new_bucket) {
6831 ret = -ENOMEM;
6832 mlog_errno(ret);
6833 goto out;
6834 }
6835
6836 ret = ocfs2_iterate_xattr_index_block(args->old_inode, blk_bh,
6837 ocfs2_reflink_xattr_rec,
6838 &para);
6839 if (ret)
6840 mlog_errno(ret);
6841
6842out:
6843 ocfs2_xattr_bucket_free(para.old_bucket);
6844 ocfs2_xattr_bucket_free(para.new_bucket);
6845 return ret;
6846}
6847
6848static int ocfs2_reflink_xattr_in_block(struct ocfs2_xattr_reflink *args,
6849 struct buffer_head *blk_bh)
6850{
6851 int ret, indexed = 0;
6852 struct buffer_head *new_blk_bh = NULL;
6853 struct ocfs2_xattr_block *xb =
6854 (struct ocfs2_xattr_block *)blk_bh->b_data;
6855
6856
6857 if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)
6858 indexed = 1;
6859
6860 ret = ocfs2_create_empty_xattr_block(args->new_inode, args->new_bh,
6861 &new_blk_bh, indexed);
6862 if (ret) {
6863 mlog_errno(ret);
6864 goto out;
6865 }
6866
6867 if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED))
6868 ret = ocfs2_reflink_xattr_block(args, blk_bh, new_blk_bh);
6869 else
6870 ret = ocfs2_reflink_xattr_tree(args, blk_bh, new_blk_bh);
6871 if (ret)
6872 mlog_errno(ret);
6873
6874out:
6875 brelse(new_blk_bh);
6876 return ret;
6877}
6878
6879static int ocfs2_reflink_xattr_no_security(struct ocfs2_xattr_entry *xe)
6880{
6881 int type = ocfs2_xattr_get_type(xe);
6882
6883 return type != OCFS2_XATTR_INDEX_SECURITY &&
6884 type != OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS &&
6885 type != OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
6886}
6887
6888int ocfs2_reflink_xattrs(struct inode *old_inode,
6889 struct buffer_head *old_bh,
6890 struct inode *new_inode,
6891 struct buffer_head *new_bh,
6892 bool preserve_security)
6893{
6894 int ret;
6895 struct ocfs2_xattr_reflink args;
6896 struct ocfs2_inode_info *oi = OCFS2_I(old_inode);
6897 struct ocfs2_dinode *di = (struct ocfs2_dinode *)old_bh->b_data;
6898 struct buffer_head *blk_bh = NULL;
6899 struct ocfs2_cached_dealloc_ctxt dealloc;
6900 struct ocfs2_refcount_tree *ref_tree;
6901 struct buffer_head *ref_root_bh = NULL;
6902
6903 ret = ocfs2_lock_refcount_tree(OCFS2_SB(old_inode->i_sb),
6904 le64_to_cpu(di->i_refcount_loc),
6905 1, &ref_tree, &ref_root_bh);
6906 if (ret) {
6907 mlog_errno(ret);
6908 goto out;
6909 }
6910
6911 ocfs2_init_dealloc_ctxt(&dealloc);
6912
6913 args.old_inode = old_inode;
6914 args.new_inode = new_inode;
6915 args.old_bh = old_bh;
6916 args.new_bh = new_bh;
6917 args.ref_ci = &ref_tree->rf_ci;
6918 args.ref_root_bh = ref_root_bh;
6919 args.dealloc = &dealloc;
6920 if (preserve_security)
6921 args.xattr_reflinked = NULL;
6922 else
6923 args.xattr_reflinked = ocfs2_reflink_xattr_no_security;
6924
6925 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
6926 ret = ocfs2_reflink_xattr_inline(&args);
6927 if (ret) {
6928 mlog_errno(ret);
6929 goto out_unlock;
6930 }
6931 }
6932
6933 if (!di->i_xattr_loc)
6934 goto out_unlock;
6935
6936 ret = ocfs2_read_xattr_block(old_inode, le64_to_cpu(di->i_xattr_loc),
6937 &blk_bh);
6938 if (ret < 0) {
6939 mlog_errno(ret);
6940 goto out_unlock;
6941 }
6942
6943 ret = ocfs2_reflink_xattr_in_block(&args, blk_bh);
6944 if (ret)
6945 mlog_errno(ret);
6946
6947 brelse(blk_bh);
6948
6949out_unlock:
6950 ocfs2_unlock_refcount_tree(OCFS2_SB(old_inode->i_sb),
6951 ref_tree, 1);
6952 brelse(ref_root_bh);
6953
6954 if (ocfs2_dealloc_has_cluster(&dealloc)) {
6955 ocfs2_schedule_truncate_log_flush(OCFS2_SB(old_inode->i_sb), 1);
6956 ocfs2_run_deallocs(OCFS2_SB(old_inode->i_sb), &dealloc);
6957 }
6958
6959out:
@@ -5306,6 +6961,51 @@ out:
6961}
6962
6963/*
6964 * Initialize the security and acl attributes for an already created inode.
6965 * Used when reflinking a non-preserve-security file.
6966 *
6967 * It uses common helpers such as ocfs2_xattr_set, so the caller
6968 * must not hold any lock except i_mutex.
6969 */
6970int ocfs2_init_security_and_acl(struct inode *dir,
6971 struct inode *inode)
6972{
6973 int ret = 0;
6974 struct buffer_head *dir_bh = NULL;
6975 struct ocfs2_security_xattr_info si = {
6976 .enable = 1,
6977 };
6978
6979 ret = ocfs2_init_security_get(inode, dir, &si);
6980 if (!ret) {
6981 ret = ocfs2_xattr_security_set(inode, si.name,
6982 si.value, si.value_len,
6983 XATTR_CREATE);
6984 if (ret) {
6985 mlog_errno(ret);
6986 goto leave;
6987 }
6988 } else if (ret != -EOPNOTSUPP) {
6989 mlog_errno(ret);
6990 goto leave;
6991 }
6992
6993 ret = ocfs2_inode_lock(dir, &dir_bh, 0);
6994 if (ret) {
6995 mlog_errno(ret);
6996 goto leave;
6997 }
6998
6999 ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
7000 if (ret)
7001 mlog_errno(ret);
7002
7003 ocfs2_inode_unlock(dir, 0);
7004 brelse(dir_bh);
7005leave:
7006 return ret;
7007}
7008/*
7009 * 'security' attributes support
7010 */
7011static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1ca7e9a1b7bc..08e36389f56d 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -55,6 +55,8 @@ int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
55 int, const char *, const void *, size_t, int, 55 int, const char *, const void *, size_t, int,
56 struct ocfs2_alloc_context *, 56 struct ocfs2_alloc_context *,
57 struct ocfs2_alloc_context *); 57 struct ocfs2_alloc_context *);
58int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
59 struct ocfs2_dinode *di);
58int ocfs2_xattr_remove(struct inode *, struct buffer_head *); 60int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
59int ocfs2_init_security_get(struct inode *, struct inode *, 61int ocfs2_init_security_get(struct inode *, struct inode *,
60 struct ocfs2_security_xattr_info *); 62 struct ocfs2_security_xattr_info *);
@@ -83,5 +85,16 @@ struct ocfs2_xattr_value_buf {
83 struct ocfs2_xattr_value_root *vb_xv; 85 struct ocfs2_xattr_value_root *vb_xv;
84}; 86};
85 87
86 88int ocfs2_xattr_attach_refcount_tree(struct inode *inode,
89 struct buffer_head *fe_bh,
90 struct ocfs2_caching_info *ref_ci,
91 struct buffer_head *ref_root_bh,
92 struct ocfs2_cached_dealloc_ctxt *dealloc);
93int ocfs2_reflink_xattrs(struct inode *old_inode,
94 struct buffer_head *old_bh,
95 struct inode *new_inode,
96 struct buffer_head *new_bh,
97 bool preserve_security);
98int ocfs2_init_security_and_acl(struct inode *dir,
99 struct inode *inode);
87#endif /* OCFS2_XATTR_H */ 100#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index c7275cfbdcfb..3680bae335b5 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -489,7 +489,7 @@ out:
489 return ret; 489 return ret;
490} 490}
491 491
492struct inode_operations omfs_dir_inops = { 492const struct inode_operations omfs_dir_inops = {
493 .lookup = omfs_lookup, 493 .lookup = omfs_lookup,
494 .mkdir = omfs_mkdir, 494 .mkdir = omfs_mkdir,
495 .rename = omfs_rename, 495 .rename = omfs_rename,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index d17e774eaf45..4845fbb18e6e 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -333,11 +333,11 @@ struct file_operations omfs_file_operations = {
333 .splice_read = generic_file_splice_read, 333 .splice_read = generic_file_splice_read,
334}; 334};
335 335
336struct inode_operations omfs_file_inops = { 336const struct inode_operations omfs_file_inops = {
337 .truncate = omfs_truncate 337 .truncate = omfs_truncate
338}; 338};
339 339
340struct address_space_operations omfs_aops = { 340const struct address_space_operations omfs_aops = {
341 .readpage = omfs_readpage, 341 .readpage = omfs_readpage,
342 .readpages = omfs_readpages, 342 .readpages = omfs_readpages,
343 .writepage = omfs_writepage, 343 .writepage = omfs_writepage,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 379ae5fb4411..f3b7c1541f3a 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -278,7 +278,7 @@ static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
278 return 0; 278 return 0;
279} 279}
280 280
281static struct super_operations omfs_sops = { 281static const struct super_operations omfs_sops = {
282 .write_inode = omfs_write_inode, 282 .write_inode = omfs_write_inode,
283 .delete_inode = omfs_delete_inode, 283 .delete_inode = omfs_delete_inode,
284 .put_super = omfs_put_super, 284 .put_super = omfs_put_super,
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index 2bc0f0670406..df71039945ac 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -45,15 +45,15 @@ extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
45 45
46/* dir.c */ 46/* dir.c */
47extern struct file_operations omfs_dir_operations; 47extern struct file_operations omfs_dir_operations;
48extern struct inode_operations omfs_dir_inops; 48extern const struct inode_operations omfs_dir_inops;
49extern int omfs_make_empty(struct inode *inode, struct super_block *sb); 49extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
50extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, 50extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
51 u64 fsblock); 51 u64 fsblock);
52 52
53/* file.c */ 53/* file.c */
54extern struct file_operations omfs_file_operations; 54extern struct file_operations omfs_file_operations;
55extern struct inode_operations omfs_file_inops; 55extern const struct inode_operations omfs_file_inops;
56extern struct address_space_operations omfs_aops; 56extern const struct address_space_operations omfs_aops;
57extern void omfs_make_empty_table(struct buffer_head *bh, int offset); 57extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
58extern int omfs_shrink_inode(struct inode *inode); 58extern int omfs_shrink_inode(struct inode *inode);
59 59
diff --git a/fs/open.c b/fs/open.c
index 31191bf513e4..4f01e06227c6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -290,10 +290,9 @@ out:
290 return error; 290 return error;
291} 291}
292 292
293SYSCALL_DEFINE2(truncate, const char __user *, path, unsigned long, length) 293SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
294{ 294{
295 /* on 32-bit boxen it will cut the range 2^31--2^32-1 off */ 295 return do_sys_truncate(path, length);
296 return do_sys_truncate(path, (long)length);
297} 296}
298 297
299static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) 298static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
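The point of the signature change above, in a stand-alone sketch: a length in the upper half of the 32-bit unsigned range turns negative once viewed as a signed value, so do_sys_truncate() can reject it instead of silently cutting the range off. Fixed-width types stand in for a 32-bit long here; the conversion result assumes a two's-complement target.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t ulen = 0x80000000u;    /* 2^31: huge as unsigned */
	int32_t len = (int32_t)ulen;    /* negative as a 32-bit long */

	if (len < 0)
		printf("rejected: length %d is negative\n", len);
	return 0;
}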
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 619ba99dfe39..7b685e10cbad 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -312,7 +312,7 @@ static struct attribute_group part_attr_group = {
312 .attrs = part_attrs, 312 .attrs = part_attrs,
313}; 313};
314 314
315static struct attribute_group *part_attr_groups[] = { 315static const struct attribute_group *part_attr_groups[] = {
316 &part_attr_group, 316 &part_attr_group,
317#ifdef CONFIG_BLK_DEV_IO_TRACE 317#ifdef CONFIG_BLK_DEV_IO_TRACE
318 &blk_trace_attr_group, 318 &blk_trace_attr_group,
@@ -581,7 +581,7 @@ try_scan:
581 } 581 }
582 582
583 if (from + size > get_capacity(disk)) { 583 if (from + size > get_capacity(disk)) {
584 struct block_device_operations *bdops = disk->fops; 584 const struct block_device_operations *bdops = disk->fops;
585 unsigned long long capacity; 585 unsigned long long capacity;
586 586
587 printk(KERN_WARNING 587 printk(KERN_WARNING
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 725a650bbbb8..0c6bc602e6c4 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -82,6 +82,7 @@
82#include <linux/pid_namespace.h> 82#include <linux/pid_namespace.h>
83#include <linux/ptrace.h> 83#include <linux/ptrace.h>
84#include <linux/tracehook.h> 84#include <linux/tracehook.h>
85#include <linux/swapops.h>
85 86
86#include <asm/pgtable.h> 87#include <asm/pgtable.h>
87#include <asm/processor.h> 88#include <asm/processor.h>
@@ -321,6 +322,87 @@ static inline void task_context_switch_counts(struct seq_file *m,
321 p->nivcsw); 322 p->nivcsw);
322} 323}
323 324
325struct stack_stats {
326 struct vm_area_struct *vma;
327 unsigned long startpage;
328 unsigned long usage;
329};
330
331static int stack_usage_pte_range(pmd_t *pmd, unsigned long addr,
332 unsigned long end, struct mm_walk *walk)
333{
334 struct stack_stats *ss = walk->private;
335 struct vm_area_struct *vma = ss->vma;
336 pte_t *pte, ptent;
337 spinlock_t *ptl;
338 int ret = 0;
339
340 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
341 for (; addr != end; pte++, addr += PAGE_SIZE) {
342 ptent = *pte;
343
344#ifdef CONFIG_STACK_GROWSUP
345 if (pte_present(ptent) || is_swap_pte(ptent))
346 ss->usage = addr - ss->startpage + PAGE_SIZE;
347#else
348 if (pte_present(ptent) || is_swap_pte(ptent)) {
349 ss->usage = ss->startpage - addr + PAGE_SIZE;
350 pte++;
351 ret = 1;
352 break;
353 }
354#endif
355 }
356 pte_unmap_unlock(pte - 1, ptl);
357 cond_resched();
358 return ret;
359}
360
361static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma,
362 struct task_struct *task)
363{
364 struct stack_stats ss;
365 struct mm_walk stack_walk = {
366 .pmd_entry = stack_usage_pte_range,
367 .mm = vma->vm_mm,
368 .private = &ss,
369 };
370
371 if (!vma->vm_mm || is_vm_hugetlb_page(vma))
372 return 0;
373
374 ss.vma = vma;
375 ss.startpage = task->stack_start & PAGE_MASK;
376 ss.usage = 0;
377
378#ifdef CONFIG_STACK_GROWSUP
379 walk_page_range(KSTK_ESP(task) & PAGE_MASK, vma->vm_end,
380 &stack_walk);
381#else
382 walk_page_range(vma->vm_start, (KSTK_ESP(task) & PAGE_MASK) + PAGE_SIZE,
383 &stack_walk);
384#endif
385 return ss.usage;
386}
387
388static inline void task_show_stack_usage(struct seq_file *m,
389 struct task_struct *task)
390{
391 struct vm_area_struct *vma;
392 struct mm_struct *mm = get_task_mm(task);
393
394 if (mm) {
395 down_read(&mm->mmap_sem);
396 vma = find_vma(mm, task->stack_start);
397 if (vma)
398 seq_printf(m, "Stack usage:\t%lu kB\n",
399 get_stack_usage_in_bytes(vma, task) >> 10);
400
401 up_read(&mm->mmap_sem);
402 mmput(mm);
403 }
404}
405
324int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 406int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
325 struct pid *pid, struct task_struct *task) 407 struct pid *pid, struct task_struct *task)
326{ 408{
@@ -340,6 +422,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
340 task_show_regs(m, task); 422 task_show_regs(m, task);
341#endif 423#endif
342 task_context_switch_counts(m, task); 424 task_context_switch_counts(m, task);
425 task_show_stack_usage(m, task);
343 return 0; 426 return 0;
344} 427}
345 428
@@ -481,7 +564,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
481 rsslim, 564 rsslim,
482 mm ? mm->start_code : 0, 565 mm ? mm->start_code : 0,
483 mm ? mm->end_code : 0, 566 mm ? mm->end_code : 0,
484 (permitted && mm) ? mm->start_stack : 0, 567 (permitted) ? task->stack_start : 0,
485 esp, 568 esp,
486 eip, 569 eip,
487 /* The signal information here is obsolete. 570 /* The signal information here is obsolete.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6f742f6658a9..837469a96598 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -447,7 +447,7 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
447 447
448 do_posix_clock_monotonic_gettime(&uptime); 448 do_posix_clock_monotonic_gettime(&uptime);
449 read_lock(&tasklist_lock); 449 read_lock(&tasklist_lock);
450 points = badness(task, uptime.tv_sec); 450 points = badness(task->group_leader, uptime.tv_sec);
451 read_unlock(&tasklist_lock); 451 read_unlock(&tasklist_lock);
452 return sprintf(buffer, "%lu\n", points); 452 return sprintf(buffer, "%lu\n", points);
453} 453}
@@ -458,7 +458,7 @@ struct limit_names {
458}; 458};
459 459
460static const struct limit_names lnames[RLIM_NLIMITS] = { 460static const struct limit_names lnames[RLIM_NLIMITS] = {
461 [RLIMIT_CPU] = {"Max cpu time", "ms"}, 461 [RLIMIT_CPU] = {"Max cpu time", "seconds"},
462 [RLIMIT_FSIZE] = {"Max file size", "bytes"}, 462 [RLIMIT_FSIZE] = {"Max file size", "bytes"},
463 [RLIMIT_DATA] = {"Max data size", "bytes"}, 463 [RLIMIT_DATA] = {"Max data size", "bytes"},
464 [RLIMIT_STACK] = {"Max stack size", "bytes"}, 464 [RLIMIT_STACK] = {"Max stack size", "bytes"},
@@ -999,11 +999,17 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
999 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 999 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
1000 char buffer[PROC_NUMBUF]; 1000 char buffer[PROC_NUMBUF];
1001 size_t len; 1001 size_t len;
1002 int oom_adjust; 1002 int oom_adjust = OOM_DISABLE;
1003 unsigned long flags;
1003 1004
1004 if (!task) 1005 if (!task)
1005 return -ESRCH; 1006 return -ESRCH;
1006 oom_adjust = task->oomkilladj; 1007
1008 if (lock_task_sighand(task, &flags)) {
1009 oom_adjust = task->signal->oom_adj;
1010 unlock_task_sighand(task, &flags);
1011 }
1012
1007 put_task_struct(task); 1013 put_task_struct(task);
1008 1014
1009 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); 1015 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1015,32 +1021,44 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1015 size_t count, loff_t *ppos) 1021 size_t count, loff_t *ppos)
1016{ 1022{
1017 struct task_struct *task; 1023 struct task_struct *task;
1018 char buffer[PROC_NUMBUF], *end; 1024 char buffer[PROC_NUMBUF];
1019 int oom_adjust; 1025 long oom_adjust;
1026 unsigned long flags;
1027 int err;
1020 1028
1021 memset(buffer, 0, sizeof(buffer)); 1029 memset(buffer, 0, sizeof(buffer));
1022 if (count > sizeof(buffer) - 1) 1030 if (count > sizeof(buffer) - 1)
1023 count = sizeof(buffer) - 1; 1031 count = sizeof(buffer) - 1;
1024 if (copy_from_user(buffer, buf, count)) 1032 if (copy_from_user(buffer, buf, count))
1025 return -EFAULT; 1033 return -EFAULT;
1026 oom_adjust = simple_strtol(buffer, &end, 0); 1034
1035 err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
1036 if (err)
1037 return -EINVAL;
1027 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && 1038 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
1028 oom_adjust != OOM_DISABLE) 1039 oom_adjust != OOM_DISABLE)
1029 return -EINVAL; 1040 return -EINVAL;
1030 if (*end == '\n') 1041
1031 end++;
1032 task = get_proc_task(file->f_path.dentry->d_inode); 1042 task = get_proc_task(file->f_path.dentry->d_inode);
1033 if (!task) 1043 if (!task)
1034 return -ESRCH; 1044 return -ESRCH;
1035 if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { 1045 if (!lock_task_sighand(task, &flags)) {
1046 put_task_struct(task);
1047 return -ESRCH;
1048 }
1049
1050 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
1051 unlock_task_sighand(task, &flags);
1036 put_task_struct(task); 1052 put_task_struct(task);
1037 return -EACCES; 1053 return -EACCES;
1038 } 1054 }
1039 task->oomkilladj = oom_adjust; 1055
1056 task->signal->oom_adj = oom_adjust;
1057
1058 unlock_task_sighand(task, &flags);
1040 put_task_struct(task); 1059 put_task_struct(task);
1041 if (end - buffer == 0) 1060
1042 return -EIO; 1061 return count;
1043 return end - buffer;
1044} 1062}
1045 1063
1046static const struct file_operations proc_oom_adjust_operations = { 1064static const struct file_operations proc_oom_adjust_operations = {
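The strict_strtol(strstrip(...)) pattern adopted above rejects trailing garbage that the old simple_strtol code silently accepted. A rough user-space analogue for intuition (strict_strtol itself is a kernel helper):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* rough user-space analogue of the kernel's strict_strtol() */
static int strict_parse_long(const char *s, int base, long *res)
{
	char *end;

	errno = 0;
	*res = strtol(s, &end, base);
	if (errno || end == s || *end != '\0')
		return -EINVAL;
	return 0;
}

int main(void)
{
	long v;

	printf("\"42\"    -> %d\n", strict_parse_long("42", 0, &v));    /* 0 */
	printf("\"42abc\" -> %d\n", strict_parse_long("42abc", 0, &v)); /* -EINVAL */
	return 0;
}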
@@ -1169,17 +1187,16 @@ static ssize_t proc_fault_inject_write(struct file * file,
1169 count = sizeof(buffer) - 1; 1187 count = sizeof(buffer) - 1;
1170 if (copy_from_user(buffer, buf, count)) 1188 if (copy_from_user(buffer, buf, count))
1171 return -EFAULT; 1189 return -EFAULT;
1172 make_it_fail = simple_strtol(buffer, &end, 0); 1190 make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
1173 if (*end == '\n') 1191 if (*end)
1174 end++; 1192 return -EINVAL;
1175 task = get_proc_task(file->f_dentry->d_inode); 1193 task = get_proc_task(file->f_dentry->d_inode);
1176 if (!task) 1194 if (!task)
1177 return -ESRCH; 1195 return -ESRCH;
1178 task->make_it_fail = make_it_fail; 1196 task->make_it_fail = make_it_fail;
1179 put_task_struct(task); 1197 put_task_struct(task);
1180 if (end - buffer == 0) 1198
1181 return -EIO; 1199 return count;
1182 return end - buffer;
1183} 1200}
1184 1201
1185static const struct file_operations proc_fault_inject_operations = { 1202static const struct file_operations proc_fault_inject_operations = {
@@ -2586,9 +2603,6 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
2586 dput(dentry); 2603 dput(dentry);
2587 } 2604 }
2588 2605
2589 if (tgid == 0)
2590 goto out;
2591
2592 name.name = buf; 2606 name.name = buf;
2593 name.len = snprintf(buf, sizeof(buf), "%d", tgid); 2607 name.len = snprintf(buf, sizeof(buf), "%d", tgid);
2594 leader = d_hash_and_lookup(mnt->mnt_root, &name); 2608 leader = d_hash_and_lookup(mnt->mnt_root, &name);
@@ -2645,17 +2659,16 @@ out:
2645void proc_flush_task(struct task_struct *task) 2659void proc_flush_task(struct task_struct *task)
2646{ 2660{
2647 int i; 2661 int i;
2648 struct pid *pid, *tgid = NULL; 2662 struct pid *pid, *tgid;
2649 struct upid *upid; 2663 struct upid *upid;
2650 2664
2651 pid = task_pid(task); 2665 pid = task_pid(task);
2652 if (thread_group_leader(task)) 2666 tgid = task_tgid(task);
2653 tgid = task_tgid(task);
2654 2667
2655 for (i = 0; i <= pid->level; i++) { 2668 for (i = 0; i <= pid->level; i++) {
2656 upid = &pid->numbers[i]; 2669 upid = &pid->numbers[i];
2657 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, 2670 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
2658 tgid ? tgid->numbers[i].nr : 0); 2671 tgid->numbers[i].nr);
2659 } 2672 }
2660 2673
2661 upid = &pid->numbers[pid->level]; 2674 upid = &pid->numbers[pid->level];
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 59b43a068872..56013371f9f3 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -17,9 +17,15 @@
17#include <linux/elfcore.h> 17#include <linux/elfcore.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/highmem.h> 19#include <linux/highmem.h>
20#include <linux/bootmem.h>
20#include <linux/init.h> 21#include <linux/init.h>
21#include <asm/uaccess.h> 22#include <asm/uaccess.h>
22#include <asm/io.h> 23#include <asm/io.h>
24#include <linux/list.h>
25#include <linux/ioport.h>
26#include <linux/mm.h>
27#include <linux/memory.h>
28#include <asm/sections.h>
23 29
24#define CORE_STR "CORE" 30#define CORE_STR "CORE"
25 31
@@ -29,17 +35,6 @@
29 35
30static struct proc_dir_entry *proc_root_kcore; 36static struct proc_dir_entry *proc_root_kcore;
31 37
32static int open_kcore(struct inode * inode, struct file * filp)
33{
34 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
35}
36
37static ssize_t read_kcore(struct file *, char __user *, size_t, loff_t *);
38
39static const struct file_operations proc_kcore_operations = {
40 .read = read_kcore,
41 .open = open_kcore,
42};
43 38
44#ifndef kc_vaddr_to_offset 39#ifndef kc_vaddr_to_offset
45#define kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET) 40#define kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET)
@@ -57,18 +52,19 @@ struct memelfnote
57 void *data; 52 void *data;
58}; 53};
59 54
60static struct kcore_list *kclist; 55static LIST_HEAD(kclist_head);
61static DEFINE_RWLOCK(kclist_lock); 56static DEFINE_RWLOCK(kclist_lock);
57static int kcore_need_update = 1;
62 58
63void 59void
64kclist_add(struct kcore_list *new, void *addr, size_t size) 60kclist_add(struct kcore_list *new, void *addr, size_t size, int type)
65{ 61{
66 new->addr = (unsigned long)addr; 62 new->addr = (unsigned long)addr;
67 new->size = size; 63 new->size = size;
64 new->type = type;
68 65
69 write_lock(&kclist_lock); 66 write_lock(&kclist_lock);
70 new->next = kclist; 67 list_add_tail(&new->list, &kclist_head);
71 kclist = new;
72 write_unlock(&kclist_lock); 68 write_unlock(&kclist_lock);
73} 69}
74 70
@@ -80,7 +76,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
80 *nphdr = 1; /* PT_NOTE */ 76 *nphdr = 1; /* PT_NOTE */
81 size = 0; 77 size = 0;
82 78
83 for (m=kclist; m; m=m->next) { 79 list_for_each_entry(m, &kclist_head, list) {
84 try = kc_vaddr_to_offset((size_t)m->addr + m->size); 80 try = kc_vaddr_to_offset((size_t)m->addr + m->size);
85 if (try > size) 81 if (try > size)
86 size = try; 82 size = try;
@@ -97,6 +93,177 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
97 return size + *elf_buflen; 93 return size + *elf_buflen;
98} 94}
99 95
96static void free_kclist_ents(struct list_head *head)
97{
98 struct kcore_list *tmp, *pos;
99
100 list_for_each_entry_safe(pos, tmp, head, list) {
101 list_del(&pos->list);
102 kfree(pos);
103 }
104}
105/*
106 * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list.
107 */
108static void __kcore_update_ram(struct list_head *list)
109{
110 int nphdr;
111 size_t size;
112 struct kcore_list *tmp, *pos;
113 LIST_HEAD(garbage);
114
115 write_lock(&kclist_lock);
116 if (kcore_need_update) {
117 list_for_each_entry_safe(pos, tmp, &kclist_head, list) {
118 if (pos->type == KCORE_RAM
119 || pos->type == KCORE_VMEMMAP)
120 list_move(&pos->list, &garbage);
121 }
122 list_splice_tail(list, &kclist_head);
123 } else
124 list_splice(list, &garbage);
125 kcore_need_update = 0;
126 proc_root_kcore->size = get_kcore_size(&nphdr, &size);
127 write_unlock(&kclist_lock);
128
129 free_kclist_ents(&garbage);
130}
131
132
133#ifdef CONFIG_HIGHMEM
134/*
135 * If no highmem, we can assume [0...max_low_pfn) continuous range of memory
136 * because the memory holes are not as big as in the !HIGHMEM case.
137 * (HIGHMEM is special because part of memory is _invisible_ from the kernel.)
138 */
139static int kcore_update_ram(void)
140{
141 LIST_HEAD(head);
142 struct kcore_list *ent;
143 int ret = 0;
144
145 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
146 if (!ent)
147 return -ENOMEM;
148 ent->addr = (unsigned long)__va(0);
149 ent->size = max_low_pfn << PAGE_SHIFT;
150 ent->type = KCORE_RAM;
151 list_add(&ent->list, &head);
152 __kcore_update_ram(&head);
153 return ret;
154}
155
156#else /* !CONFIG_HIGHMEM */
157
158#ifdef CONFIG_SPARSEMEM_VMEMMAP
159/* calculate vmemmap's address from given system ram pfn and register it */
160int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
161{
162 unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
163 unsigned long nr_pages = ent->size >> PAGE_SHIFT;
164 unsigned long start, end;
165 struct kcore_list *vmm, *tmp;
166
167
168 start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK;
169 end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1;
170 end = ALIGN(end, PAGE_SIZE);
171 /* overlap check (because we have to align the pages) */
172 list_for_each_entry(tmp, head, list) {
173 if (tmp->type != KCORE_VMEMMAP)
174 continue;
175 if (start < tmp->addr + tmp->size)
176 if (end > tmp->addr)
177 end = tmp->addr;
178 }
179 if (start < end) {
180 vmm = kmalloc(sizeof(*vmm), GFP_KERNEL);
181 if (!vmm)
182 return 0;
183 vmm->addr = start;
184 vmm->size = end - start;
185 vmm->type = KCORE_VMEMMAP;
186 list_add_tail(&vmm->list, head);
187 }
188 return 1;
189
190}
191#else
192int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
193{
194 return 1;
195}
196
197#endif
198
199static int
200kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg)
201{
202 struct list_head *head = (struct list_head *)arg;
203 struct kcore_list *ent;
204
205 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
206 if (!ent)
207 return -ENOMEM;
208 ent->addr = (unsigned long)__va((pfn << PAGE_SHIFT));
209 ent->size = nr_pages << PAGE_SHIFT;
210
211 /* Sanity check: Can happen in 32bit arch...maybe */
212 if (ent->addr < (unsigned long) __va(0))
213 goto free_out;
214
215 /* cut not-mapped area. ....from ppc-32 code. */
216 if (ULONG_MAX - ent->addr < ent->size)
217 ent->size = ULONG_MAX - ent->addr;
218
219 /* cut when vmalloc() area is higher than direct-map area */
220 if (VMALLOC_START > (unsigned long)__va(0)) {
221 if (ent->addr > VMALLOC_START)
222 goto free_out;
223 if (VMALLOC_START - ent->addr < ent->size)
224 ent->size = VMALLOC_START - ent->addr;
225 }
226
227 ent->type = KCORE_RAM;
228 list_add_tail(&ent->list, head);
229
230 if (!get_sparsemem_vmemmap_info(ent, head)) {
231 list_del(&ent->list);
232 goto free_out;
233 }
234
235 return 0;
236free_out:
237 kfree(ent);
238 return 1;
239}
240
241static int kcore_update_ram(void)
242{
243 int nid, ret;
244 unsigned long end_pfn;
245 LIST_HEAD(head);
246
247 /* Not initialized....update now */
248 /* find out "max pfn" */
249 end_pfn = 0;
250 for_each_node_state(nid, N_HIGH_MEMORY) {
251 unsigned long node_end;
252 node_end = NODE_DATA(nid)->node_start_pfn +
253 NODE_DATA(nid)->node_spanned_pages;
254 if (end_pfn < node_end)
255 end_pfn = node_end;
256 }
257 /* scan 0 to max_pfn */
258 ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private);
259 if (ret) {
260 free_kclist_ents(&head);
261 return -ENOMEM;
262 }
263 __kcore_update_ram(&head);
264 return ret;
265}
266#endif /* CONFIG_HIGHMEM */
100 267
101/*****************************************************************************/ 268/*****************************************************************************/
102/* 269/*
@@ -192,7 +359,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
192 nhdr->p_align = 0; 359 nhdr->p_align = 0;
193 360
194 /* setup ELF PT_LOAD program header for every area */ 361 /* setup ELF PT_LOAD program header for every area */
195 for (m=kclist; m; m=m->next) { 362 list_for_each_entry(m, &kclist_head, list) {
196 phdr = (struct elf_phdr *) bufp; 363 phdr = (struct elf_phdr *) bufp;
197 bufp += sizeof(struct elf_phdr); 364 bufp += sizeof(struct elf_phdr);
198 offset += sizeof(struct elf_phdr); 365 offset += sizeof(struct elf_phdr);
@@ -265,7 +432,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
265 unsigned long start; 432 unsigned long start;
266 433
267 read_lock(&kclist_lock); 434 read_lock(&kclist_lock);
268 proc_root_kcore->size = size = get_kcore_size(&nphdr, &elf_buflen); 435 size = get_kcore_size(&nphdr, &elf_buflen);
436
269 if (buflen == 0 || *fpos >= size) { 437 if (buflen == 0 || *fpos >= size) {
270 read_unlock(&kclist_lock); 438 read_unlock(&kclist_lock);
271 return 0; 439 return 0;
@@ -317,7 +485,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
317 struct kcore_list *m; 485 struct kcore_list *m;
318 486
319 read_lock(&kclist_lock); 487 read_lock(&kclist_lock);
320 for (m=kclist; m; m=m->next) { 488 list_for_each_entry(m, &kclist_head, list) {
321 if (start >= m->addr && start < (m->addr+m->size)) 489 if (start >= m->addr && start < (m->addr+m->size))
322 break; 490 break;
323 } 491 }
@@ -326,45 +494,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
326 if (m == NULL) { 494 if (m == NULL) {
327 if (clear_user(buffer, tsz)) 495 if (clear_user(buffer, tsz))
328 return -EFAULT; 496 return -EFAULT;
329 } else if (is_vmalloc_addr((void *)start)) { 497 } else if (is_vmalloc_or_module_addr((void *)start)) {
330 char * elf_buf; 498 char * elf_buf;
331 struct vm_struct *m;
332 unsigned long curstart = start;
333 unsigned long cursize = tsz;
334 499
335 elf_buf = kzalloc(tsz, GFP_KERNEL); 500 elf_buf = kzalloc(tsz, GFP_KERNEL);
336 if (!elf_buf) 501 if (!elf_buf)
337 return -ENOMEM; 502 return -ENOMEM;
338 503 vread(elf_buf, (char *)start, tsz);
339 read_lock(&vmlist_lock); 504 /* we have to zero-fill user buffer even if no read */
340 for (m=vmlist; m && cursize; m=m->next) {
341 unsigned long vmstart;
342 unsigned long vmsize;
343 unsigned long msize = m->size - PAGE_SIZE;
344
345 if (((unsigned long)m->addr + msize) <
346 curstart)
347 continue;
348 if ((unsigned long)m->addr > (curstart +
349 cursize))
350 break;
351 vmstart = (curstart < (unsigned long)m->addr ?
352 (unsigned long)m->addr : curstart);
353 if (((unsigned long)m->addr + msize) >
354 (curstart + cursize))
355 vmsize = curstart + cursize - vmstart;
356 else
357 vmsize = (unsigned long)m->addr +
358 msize - vmstart;
359 curstart = vmstart + vmsize;
360 cursize -= vmsize;
361 /* don't dump ioremap'd stuff! (TA) */
362 if (m->flags & VM_IOREMAP)
363 continue;
364 memcpy(elf_buf + (vmstart - start),
365 (char *)vmstart, vmsize);
366 }
367 read_unlock(&vmlist_lock);
368 if (copy_to_user(buffer, elf_buf, tsz)) { 505 if (copy_to_user(buffer, elf_buf, tsz)) {
369 kfree(elf_buf); 506 kfree(elf_buf);
370 return -EFAULT; 507 return -EFAULT;
@@ -402,12 +539,96 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
402 return acc; 539 return acc;
403} 540}
404 541
542
543static int open_kcore(struct inode *inode, struct file *filp)
544{
545 if (!capable(CAP_SYS_RAWIO))
546 return -EPERM;
547 if (kcore_need_update)
548 kcore_update_ram();
549 if (i_size_read(inode) != proc_root_kcore->size) {
550 mutex_lock(&inode->i_mutex);
551 i_size_write(inode, proc_root_kcore->size);
552 mutex_unlock(&inode->i_mutex);
553 }
554 return 0;
555}
556
557
558static const struct file_operations proc_kcore_operations = {
559 .read = read_kcore,
560 .open = open_kcore,
561};
562
563#ifdef CONFIG_MEMORY_HOTPLUG
564/* just remember that we have to update kcore */
565static int __meminit kcore_callback(struct notifier_block *self,
566 unsigned long action, void *arg)
567{
568 switch (action) {
569 case MEM_ONLINE:
570 case MEM_OFFLINE:
571 write_lock(&kclist_lock);
572 kcore_need_update = 1;
573 write_unlock(&kclist_lock);
574 }
575 return NOTIFY_OK;
576}
577#endif
578
579
580static struct kcore_list kcore_vmalloc;
581
582#ifdef CONFIG_ARCH_PROC_KCORE_TEXT
583static struct kcore_list kcore_text;
584/*
585 * If defined, special segment is used for mapping kernel text instead of
586 * direct-map area. We need to create special TEXT section.
587 */
588static void __init proc_kcore_text_init(void)
589{
590 kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT);
591}
592#else
593static void __init proc_kcore_text_init(void)
594{
595}
596#endif
597
598#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
599/*
600 * MODULES_VADDR has no intersection with VMALLOC_ADDR.
601 */
602struct kcore_list kcore_modules;
603static void __init add_modules_range(void)
604{
605 kclist_add(&kcore_modules, (void *)MODULES_VADDR,
606 MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
607}
608#else
609static void __init add_modules_range(void)
610{
611}
612#endif
613
405static int __init proc_kcore_init(void) 614static int __init proc_kcore_init(void)
406{ 615{
407 proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations); 616 proc_root_kcore = proc_create("kcore", S_IRUSR, NULL,
408 if (proc_root_kcore) 617 &proc_kcore_operations);
409 proc_root_kcore->size = 618 if (!proc_root_kcore) {
410 (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE; 619 printk(KERN_ERR "couldn't create /proc/kcore\n");
620 return 0; /* Always returns 0. */
621 }
622 /* Store text area if it's special */
623 proc_kcore_text_init();
624 /* Store vmalloc area */
625 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
626 VMALLOC_END - VMALLOC_START, KCORE_VMALLOC);
627 add_modules_range();
628 /* Store direct-map area from physical memory map */
629 kcore_update_ram();
630 hotplug_memory_notifier(kcore_callback, 0);
631
411 return 0; 632 return 0;
412} 633}
413module_init(proc_kcore_init); 634module_init(proc_kcore_init);
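For intuition, a simplified stand-alone sketch of how get_kcore_size() treats the registered ranges: the reported size is the largest end offset over all entries, not their sum, since every range sits at a fixed offset in the virtual core file. The addresses are made up, and the real code maps them through kc_vaddr_to_offset() and adds the ELF header length.

#include <stdio.h>

struct range { unsigned long addr, size; };

int main(void)
{
	/* hypothetical registered ranges (direct map, vmalloc, modules) */
	struct range kclist[] = {
		{ 0x1000, 0x4000 },
		{ 0x8000, 0x2000 },
		{ 0x3000, 0x1000 },
	};
	unsigned long size = 0;

	for (unsigned int i = 0; i < sizeof(kclist) / sizeof(kclist[0]); i++) {
		unsigned long end = kclist[i].addr + kclist[i].size;
		if (end > size)
			size = end;
	}
	printf("kcore data size: 0x%lx\n", size); /* 0xa000 */
	return 0;
}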
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 78faedcb0a8d..c7bff4f603ff 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -81,9 +81,11 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
81 "Writeback: %8lu kB\n" 81 "Writeback: %8lu kB\n"
82 "AnonPages: %8lu kB\n" 82 "AnonPages: %8lu kB\n"
83 "Mapped: %8lu kB\n" 83 "Mapped: %8lu kB\n"
84 "Shmem: %8lu kB\n"
84 "Slab: %8lu kB\n" 85 "Slab: %8lu kB\n"
85 "SReclaimable: %8lu kB\n" 86 "SReclaimable: %8lu kB\n"
86 "SUnreclaim: %8lu kB\n" 87 "SUnreclaim: %8lu kB\n"
88 "KernelStack: %8lu kB\n"
87 "PageTables: %8lu kB\n" 89 "PageTables: %8lu kB\n"
88#ifdef CONFIG_QUICKLIST 90#ifdef CONFIG_QUICKLIST
89 "Quicklists: %8lu kB\n" 91 "Quicklists: %8lu kB\n"
@@ -128,10 +130,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
128 K(global_page_state(NR_WRITEBACK)), 130 K(global_page_state(NR_WRITEBACK)),
129 K(global_page_state(NR_ANON_PAGES)), 131 K(global_page_state(NR_ANON_PAGES)),
130 K(global_page_state(NR_FILE_MAPPED)), 132 K(global_page_state(NR_FILE_MAPPED)),
133 K(global_page_state(NR_SHMEM)),
131 K(global_page_state(NR_SLAB_RECLAIMABLE) + 134 K(global_page_state(NR_SLAB_RECLAIMABLE) +
132 global_page_state(NR_SLAB_UNRECLAIMABLE)), 135 global_page_state(NR_SLAB_UNRECLAIMABLE)),
133 K(global_page_state(NR_SLAB_RECLAIMABLE)), 136 K(global_page_state(NR_SLAB_RECLAIMABLE)),
134 K(global_page_state(NR_SLAB_UNRECLAIMABLE)), 137 K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
138 global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
135 K(global_page_state(NR_PAGETABLE)), 139 K(global_page_state(NR_PAGETABLE)),
136#ifdef CONFIG_QUICKLIST 140#ifdef CONFIG_QUICKLIST
137 K(quicklist_total_size()), 141 K(quicklist_total_size()),
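The new KernelStack line is printed without the K() page-to-kB conversion because NR_KERNEL_STACK counts stacks rather than pages; the kB figure comes from THREAD_SIZE. A sketch of the arithmetic with made-up numbers (8 KiB stacks, 500 tasks):

#include <stdio.h>

int main(void)
{
	unsigned long nr_kernel_stack = 500; /* hypothetical NR_KERNEL_STACK */
	unsigned long thread_size = 8192;    /* hypothetical THREAD_SIZE */

	printf("KernelStack: %8lu kB\n",
	       nr_kernel_stack * thread_size / 1024); /* 4000 kB */
	return 0;
}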
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 7e14d1a04001..9fe7d7ebe115 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -109,7 +109,7 @@ static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
109 return rb_next((struct rb_node *) v); 109 return rb_next((struct rb_node *) v);
110} 110}
111 111
112static struct seq_operations proc_nommu_region_list_seqop = { 112static const struct seq_operations proc_nommu_region_list_seqop = {
113 .start = nommu_region_list_start, 113 .start = nommu_region_list_start,
114 .next = nommu_region_list_next, 114 .next = nommu_region_list_next,
115 .stop = nommu_region_list_stop, 115 .stop = nommu_region_list_stop,
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2707c6c7a20f..2281c2cbfe2b 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -2,6 +2,7 @@
2#include <linux/compiler.h> 2#include <linux/compiler.h>
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/init.h> 4#include <linux/init.h>
5#include <linux/ksm.h>
5#include <linux/mm.h> 6#include <linux/mm.h>
6#include <linux/mmzone.h> 7#include <linux/mmzone.h>
7#include <linux/proc_fs.h> 8#include <linux/proc_fs.h>
@@ -95,6 +96,8 @@ static const struct file_operations proc_kpagecount_operations = {
95#define KPF_UNEVICTABLE 18 96#define KPF_UNEVICTABLE 18
96#define KPF_NOPAGE 20 97#define KPF_NOPAGE 20
97 98
99#define KPF_KSM 21
100
98/* kernel hacking assistances 101/* kernel hacking assistances
99 * WARNING: subject to change, never rely on them! 102 * WARNING: subject to change, never rely on them!
100 */ 103 */
@@ -137,6 +140,8 @@ static u64 get_uflags(struct page *page)
137 u |= 1 << KPF_MMAP; 140 u |= 1 << KPF_MMAP;
138 if (PageAnon(page)) 141 if (PageAnon(page))
139 u |= 1 << KPF_ANON; 142 u |= 1 << KPF_ANON;
143 if (PageKsm(page))
144 u |= 1 << KPF_KSM;
140 145
141 /* 146 /*
142 * compound pages: export both head/tail info 147 * compound pages: export both head/tail info
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 9b1e4e9a16bf..f667e8aeabdf 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -153,7 +153,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
153 153
154 /* careful: calling conventions are nasty here */ 154 /* careful: calling conventions are nasty here */
155 res = count; 155 res = count;
156 error = table->proc_handler(table, write, filp, buf, &res, ppos); 156 error = table->proc_handler(table, write, buf, &res, ppos);
157 if (!error) 157 if (!error)
158 error = res; 158 error = res;
159out: 159out:
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9bd8be1d235c..2a1bef9203c6 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -243,6 +243,25 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
243 } else if (vma->vm_start <= mm->start_stack && 243 } else if (vma->vm_start <= mm->start_stack &&
244 vma->vm_end >= mm->start_stack) { 244 vma->vm_end >= mm->start_stack) {
245 name = "[stack]"; 245 name = "[stack]";
246 } else {
247 unsigned long stack_start;
248 struct proc_maps_private *pmp;
249
250 pmp = m->private;
251 stack_start = pmp->task->stack_start;
252
253 if (vma->vm_start <= stack_start &&
254 vma->vm_end >= stack_start) {
255 pad_len_spaces(m, len);
256 seq_printf(m,
257 "[threadstack:%08lx]",
258#ifdef CONFIG_STACK_GROWSUP
259 vma->vm_end - stack_start
260#else
261 stack_start - vma->vm_start
262#endif
263 );
264 }
246 } 265 }
247 } else { 266 } else {
248 name = "[vdso]"; 267 name = "[vdso]";
@@ -465,23 +484,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
465 return 0; 484 return 0;
466} 485}
467 486
487#define CLEAR_REFS_ALL 1
488#define CLEAR_REFS_ANON 2
489#define CLEAR_REFS_MAPPED 3
490
468static ssize_t clear_refs_write(struct file *file, const char __user *buf, 491static ssize_t clear_refs_write(struct file *file, const char __user *buf,
469 size_t count, loff_t *ppos) 492 size_t count, loff_t *ppos)
470{ 493{
471 struct task_struct *task; 494 struct task_struct *task;
472 char buffer[PROC_NUMBUF], *end; 495 char buffer[PROC_NUMBUF];
473 struct mm_struct *mm; 496 struct mm_struct *mm;
474 struct vm_area_struct *vma; 497 struct vm_area_struct *vma;
498 long type;
475 499
476 memset(buffer, 0, sizeof(buffer)); 500 memset(buffer, 0, sizeof(buffer));
477 if (count > sizeof(buffer) - 1) 501 if (count > sizeof(buffer) - 1)
478 count = sizeof(buffer) - 1; 502 count = sizeof(buffer) - 1;
479 if (copy_from_user(buffer, buf, count)) 503 if (copy_from_user(buffer, buf, count))
480 return -EFAULT; 504 return -EFAULT;
481 if (!simple_strtol(buffer, &end, 0)) 505 if (strict_strtol(strstrip(buffer), 10, &type))
506 return -EINVAL;
507 if (type < CLEAR_REFS_ALL || type > CLEAR_REFS_MAPPED)
482 return -EINVAL; 508 return -EINVAL;
483 if (*end == '\n')
484 end++;
485 task = get_proc_task(file->f_path.dentry->d_inode); 509 task = get_proc_task(file->f_path.dentry->d_inode);
486 if (!task) 510 if (!task)
487 return -ESRCH; 511 return -ESRCH;
@@ -494,18 +518,31 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		down_read(&mm->mmap_sem);
 		for (vma = mm->mmap; vma; vma = vma->vm_next) {
 			clear_refs_walk.private = vma;
-			if (!is_vm_hugetlb_page(vma))
-				walk_page_range(vma->vm_start, vma->vm_end,
-						&clear_refs_walk);
+			if (is_vm_hugetlb_page(vma))
+				continue;
+			/*
+			 * Writing 1 to /proc/pid/clear_refs affects all pages.
+			 *
+			 * Writing 2 to /proc/pid/clear_refs only affects
+			 * Anonymous pages.
+			 *
+			 * Writing 3 to /proc/pid/clear_refs only affects file
+			 * mapped pages.
+			 */
+			if (type == CLEAR_REFS_ANON && vma->vm_file)
+				continue;
+			if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
+				continue;
+			walk_page_range(vma->vm_start, vma->vm_end,
+					&clear_refs_walk);
 		}
 		flush_tlb_mm(mm);
 		up_read(&mm->mmap_sem);
 		mmput(mm);
 	}
 	put_task_struct(task);
-	if (end - buffer == 0)
-		return -EIO;
-	return end - buffer;
+
+	return count;
 }
 
 const struct file_operations proc_clear_refs_operations = {
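
With the change above, /proc/<pid>/clear_refs stops treating any non-zero integer as "clear everything" and instead accepts exactly 1 (all pages), 2 (anonymous only) or 3 (file-mapped only). A small user-space sketch of driving the new interface; names and error handling are illustrative only:

	#include <sys/types.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Clear the referenced bits of a process's anonymous pages by
	 * writing "2"; "1" would clear all pages and "3" only file-mapped
	 * ones, matching the CLEAR_REFS_* constants added above.
	 */
	int clear_anon_refs(pid_t pid)
	{
		char path[64];
		int fd;

		snprintf(path, sizeof(path), "/proc/%d/clear_refs", (int)pid);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;
		if (write(fd, "2\n", 2) != 2) {
			close(fd);
			return -1;
		}
		return close(fd);
	}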
diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig
index be8e0e1445b6..5f6089994042 100644
--- a/fs/qnx4/Kconfig
+++ b/fs/qnx4/Kconfig
@@ -6,20 +6,9 @@ config QNX4FS_FS
 	  QNX 4 and QNX 6 (the latter is also called QNX RTP).
 	  Further information is available at <http://www.qnx.com/>.
 	  Say Y if you intend to mount QNX hard disks or floppies.
-	  Unless you say Y to "QNX4FS read-write support" below, you will
-	  only be able to read these file systems.
 
 	  To compile this file system support as a module, choose M here: the
 	  module will be called qnx4.
 
 	  If you don't know whether you need it, then you don't need it:
 	  answer N.
-
-config QNX4FS_RW
-	bool "QNX4FS write support (DANGEROUS)"
-	depends on QNX4FS_FS && EXPERIMENTAL && BROKEN
-	help
-	  Say Y if you want to test write support for QNX4 file systems.
-
-	  It's currently broken, so for now:
-	  answer N.
diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile
index e4d408cc5473..4a283b3f87f8 100644
--- a/fs/qnx4/Makefile
+++ b/fs/qnx4/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_QNX4FS_FS) += qnx4.o
 
-qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o
+qnx4-objs := inode.o dir.o namei.o bitmap.o
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index e1cd061a25f7..0afba069d567 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -78,84 +78,3 @@ unsigned long qnx4_count_free_blocks(struct super_block *sb)
 
 	return total_free;
 }
-
-#ifdef CONFIG_QNX4FS_RW
-
-int qnx4_is_free(struct super_block *sb, long block)
-{
-	int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1;
-	int size = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size);
-	struct buffer_head *bh;
-	const char *g;
-	int ret = -EIO;
-
-	start += block / (QNX4_BLOCK_SIZE * 8);
-	QNX4DEBUG(("qnx4: is_free requesting block [%lu], bitmap in block [%lu]\n",
-		   (unsigned long) block, (unsigned long) start));
-	(void) size;	/* CHECKME */
-	bh = sb_bread(sb, start);
-	if (bh == NULL) {
-		return -EIO;
-	}
-	g = bh->b_data + (block % QNX4_BLOCK_SIZE);
-	if (((*g) & (1 << (block % 8))) == 0) {
-		QNX4DEBUG(("qnx4: is_free -> block is free\n"));
-		ret = 1;
-	} else {
-		QNX4DEBUG(("qnx4: is_free -> block is busy\n"));
-		ret = 0;
-	}
-	brelse(bh);
-
-	return ret;
-}
-
-int qnx4_set_bitmap(struct super_block *sb, long block, int busy)
-{
-	int start = le32_to_cpu(qnx4_sb(sb)->BitMap->di_first_xtnt.xtnt_blk) - 1;
-	int size = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size);
-	struct buffer_head *bh;
-	char *g;
-
-	start += block / (QNX4_BLOCK_SIZE * 8);
-	QNX4DEBUG(("qnx4: set_bitmap requesting block [%lu], bitmap in block [%lu]\n",
-		   (unsigned long) block, (unsigned long) start));
-	(void) size;	/* CHECKME */
-	bh = sb_bread(sb, start);
-	if (bh == NULL) {
-		return -EIO;
-	}
-	g = bh->b_data + (block % QNX4_BLOCK_SIZE);
-	if (busy == 0) {
-		(*g) &= ~(1 << (block % 8));
-	} else {
-		(*g) |= (1 << (block % 8));
-	}
-	mark_buffer_dirty(bh);
-	brelse(bh);
-
-	return 0;
-}
-
-static void qnx4_clear_inode(struct inode *inode)
-{
-	struct qnx4_inode_entry *qnx4_ino = qnx4_raw_inode(inode);
-	/* What for? */
-	memset(qnx4_ino->di_fname, 0, sizeof qnx4_ino->di_fname);
-	qnx4_ino->di_size = 0;
-	qnx4_ino->di_num_xtnts = 0;
-	qnx4_ino->di_mode = 0;
-	qnx4_ino->di_status = 0;
-}
-
-void qnx4_free_inode(struct inode *inode)
-{
-	if (inode->i_ino < 1) {
-		printk("free_inode: inode 0 or nonexistent inode\n");
-		return;
-	}
-	qnx4_clear_inode(inode);
-	clear_inode(inode);
-}
-
-#endif
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 003c68f3238b..86cc39cb1398 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -85,9 +85,4 @@ const struct file_operations qnx4_dir_operations =
 const struct inode_operations qnx4_dir_inode_operations =
 {
 	.lookup = qnx4_lookup,
-#ifdef CONFIG_QNX4FS_RW
-	.create = qnx4_create,
-	.unlink = qnx4_unlink,
-	.rmdir = qnx4_rmdir,
-#endif
 };
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
deleted file mode 100644
index 09b170ac936c..000000000000
--- a/fs/qnx4/file.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * QNX4 file system, Linux implementation.
- *
- * Version : 0.2.1
- *
- * Using parts of the xiafs filesystem.
- *
- * History :
- *
- * 25-05-1998 by Richard Frowijn : first release.
- * 21-06-1998 by Frank Denis : wrote qnx4_readpage to use generic_file_read.
- * 27-06-1998 by Frank Denis : file overwriting.
- */
-
-#include "qnx4.h"
-
-/*
- * We have mostly NULL's here: the current defaults are ok for
- * the qnx4 filesystem.
- */
-const struct file_operations qnx4_file_operations =
-{
-	.llseek		= generic_file_llseek,
-	.read		= do_sync_read,
-	.aio_read	= generic_file_aio_read,
-	.mmap		= generic_file_mmap,
-	.splice_read	= generic_file_splice_read,
-#ifdef CONFIG_QNX4FS_RW
-	.write		= do_sync_write,
-	.aio_write	= generic_file_aio_write,
-	.fsync		= simple_fsync,
-#endif
-};
-
-const struct inode_operations qnx4_file_inode_operations =
-{
-#ifdef CONFIG_QNX4FS_RW
-	.truncate	= qnx4_truncate,
-#endif
-};
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 681df5fcd161..d2cd1798d8c4 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -28,73 +28,6 @@
 
 static const struct super_operations qnx4_sops;
 
-#ifdef CONFIG_QNX4FS_RW
-
-static void qnx4_delete_inode(struct inode *inode)
-{
-	QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino));
-	truncate_inode_pages(&inode->i_data, 0);
-	inode->i_size = 0;
-	qnx4_truncate(inode);
-	lock_kernel();
-	qnx4_free_inode(inode);
-	unlock_kernel();
-}
-
-static int qnx4_write_inode(struct inode *inode, int do_sync)
-{
-	struct qnx4_inode_entry *raw_inode;
-	int block, ino;
-	struct buffer_head *bh;
-	ino = inode->i_ino;
-
-	QNX4DEBUG(("qnx4: write inode 1.\n"));
-	if (inode->i_nlink == 0) {
-		return 0;
-	}
-	if (!ino) {
-		printk("qnx4: bad inode number on dev %s: %d is out of range\n",
-		       inode->i_sb->s_id, ino);
-		return -EIO;
-	}
-	QNX4DEBUG(("qnx4: write inode 2.\n"));
-	block = ino / QNX4_INODES_PER_BLOCK;
-	lock_kernel();
-	if (!(bh = sb_bread(inode->i_sb, block))) {
-		printk("qnx4: major problem: unable to read inode from dev "
-		       "%s\n", inode->i_sb->s_id);
-		unlock_kernel();
-		return -EIO;
-	}
-	raw_inode = ((struct qnx4_inode_entry *) bh->b_data) +
-		(ino % QNX4_INODES_PER_BLOCK);
-	raw_inode->di_mode = cpu_to_le16(inode->i_mode);
-	raw_inode->di_uid = cpu_to_le16(fs_high2lowuid(inode->i_uid));
-	raw_inode->di_gid = cpu_to_le16(fs_high2lowgid(inode->i_gid));
-	raw_inode->di_nlink = cpu_to_le16(inode->i_nlink);
-	raw_inode->di_size = cpu_to_le32(inode->i_size);
-	raw_inode->di_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
-	raw_inode->di_atime = cpu_to_le32(inode->i_atime.tv_sec);
-	raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
-	raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks);
-	mark_buffer_dirty(bh);
-	if (do_sync) {
-		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh)) {
-			printk("qnx4: IO error syncing inode [%s:%08x]\n",
-			       inode->i_sb->s_id, ino);
-			brelse(bh);
-			unlock_kernel();
-			return -EIO;
-		}
-	}
-	brelse(bh);
-	unlock_kernel();
-	return 0;
-}
-
-#endif
-
 static void qnx4_put_super(struct super_block *sb);
 static struct inode *qnx4_alloc_inode(struct super_block *sb);
 static void qnx4_destroy_inode(struct inode *inode);
@@ -108,10 +41,6 @@ static const struct super_operations qnx4_sops =
 	.put_super = qnx4_put_super,
 	.statfs = qnx4_statfs,
 	.remount_fs = qnx4_remount,
-#ifdef CONFIG_QNX4FS_RW
-	.write_inode = qnx4_write_inode,
-	.delete_inode = qnx4_delete_inode,
-#endif
 };
 
 static int qnx4_remount(struct super_block *sb, int *flags, char *data)
@@ -120,15 +49,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)
 
 	qs = qnx4_sb(sb);
 	qs->Version = QNX4_VERSION;
-#ifndef CONFIG_QNX4FS_RW
 	*flags |= MS_RDONLY;
-#endif
-	if (*flags & MS_RDONLY) {
-		return 0;
-	}
-
-	mark_buffer_dirty(qs->sb_buf);
-
 	return 0;
 }
 
@@ -354,9 +275,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 	}
 	s->s_op = &qnx4_sops;
 	s->s_magic = QNX4_SUPER_MAGIC;
-#ifndef CONFIG_QNX4FS_RW
 	s->s_flags |= MS_RDONLY;	/* Yup, read-only yet */
-#endif
 	qnx4_sb(s)->sb_buf = bh;
 	qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data;
 
@@ -489,8 +408,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
 
 	memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE);
 	if (S_ISREG(inode->i_mode)) {
-		inode->i_op = &qnx4_file_inode_operations;
-		inode->i_fop = &qnx4_file_operations;
+		inode->i_fop = &generic_ro_fops;
 		inode->i_mapping->a_ops = &qnx4_aops;
 		qnx4_i(inode)->mmu_private = inode->i_size;
 	} else if (S_ISDIR(inode->i_mode)) {
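
Since the filesystem is now unconditionally read-only, regular files can be served by the VFS's stock read-only file operations rather than a qnx4-private table. As a point of reference, generic_ro_fops is roughly the read-side half of the deleted qnx4_file_operations; a sketch of its shape (the exact initializer lives in fs/read_write.c and may vary by kernel version, and the name example_ro_fops is illustrative):

	const struct file_operations example_ro_fops = {
		.llseek		= generic_file_llseek,
		.read		= do_sync_read,
		.aio_read	= generic_file_aio_read,
		.mmap		= generic_file_readonly_mmap,
		.splice_read	= generic_file_splice_read,
	};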
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 5972ed214937..ae1e7edbacd6 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -134,108 +134,3 @@ out:
 
 	return NULL;
 }
-
-#ifdef CONFIG_QNX4FS_RW
-int qnx4_create(struct inode *dir, struct dentry *dentry, int mode,
-		struct nameidata *nd)
-{
-	QNX4DEBUG(("qnx4: qnx4_create\n"));
-	if (dir == NULL) {
-		return -ENOENT;
-	}
-	return -ENOSPC;
-}
-
-int qnx4_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct buffer_head *bh;
-	struct qnx4_inode_entry *de;
-	struct inode *inode;
-	int retval;
-	int ino;
-
-	QNX4DEBUG(("qnx4: qnx4_rmdir [%s]\n", dentry->d_name.name));
-	lock_kernel();
-	bh = qnx4_find_entry(dentry->d_name.len, dir, dentry->d_name.name,
-			     &de, &ino);
-	if (bh == NULL) {
-		unlock_kernel();
-		return -ENOENT;
-	}
-	inode = dentry->d_inode;
-	if (inode->i_ino != ino) {
-		retval = -EIO;
-		goto end_rmdir;
-	}
-#if 0
-	if (!empty_dir(inode)) {
-		retval = -ENOTEMPTY;
-		goto end_rmdir;
-	}
-#endif
-	if (inode->i_nlink != 2) {
-		QNX4DEBUG(("empty directory has nlink!=2 (%d)\n", inode->i_nlink));
-	}
-	QNX4DEBUG(("qnx4: deleting directory\n"));
-	de->di_status = 0;
-	memset(de->di_fname, 0, sizeof de->di_fname);
-	de->di_mode = 0;
-	mark_buffer_dirty_inode(bh, dir);
-	clear_nlink(inode);
-	mark_inode_dirty(inode);
-	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
-	inode_dec_link_count(dir);
-	retval = 0;
-
-end_rmdir:
-	brelse(bh);
-
-	unlock_kernel();
-	return retval;
-}
-
-int qnx4_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct buffer_head *bh;
-	struct qnx4_inode_entry *de;
-	struct inode *inode;
-	int retval;
-	int ino;
-
-	QNX4DEBUG(("qnx4: qnx4_unlink [%s]\n", dentry->d_name.name));
-	lock_kernel();
-	bh = qnx4_find_entry(dentry->d_name.len, dir, dentry->d_name.name,
-			     &de, &ino);
-	if (bh == NULL) {
-		unlock_kernel();
-		return -ENOENT;
-	}
-	inode = dentry->d_inode;
-	if (inode->i_ino != ino) {
-		retval = -EIO;
-		goto end_unlink;
-	}
-	retval = -EPERM;
-	if (!inode->i_nlink) {
-		QNX4DEBUG(("Deleting nonexistent file (%s:%lu), %d\n",
-			   inode->i_sb->s_id,
-			   inode->i_ino, inode->i_nlink));
-		inode->i_nlink = 1;
-	}
-	de->di_status = 0;
-	memset(de->di_fname, 0, sizeof de->di_fname);
-	de->di_mode = 0;
-	mark_buffer_dirty_inode(bh, dir);
-	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
-	mark_inode_dirty(dir);
-	inode->i_ctime = dir->i_ctime;
-	inode_dec_link_count(inode);
-	retval = 0;
-
-end_unlink:
-	unlock_kernel();
-	brelse(bh);
-
-	return retval;
-}
-#endif
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
index 9efc089454f6..33a60858203b 100644
--- a/fs/qnx4/qnx4.h
+++ b/fs/qnx4/qnx4.h
@@ -29,17 +29,9 @@ extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
 
 extern struct buffer_head *qnx4_bread(struct inode *, int, int);
 
-extern const struct inode_operations qnx4_file_inode_operations;
 extern const struct inode_operations qnx4_dir_inode_operations;
-extern const struct file_operations qnx4_file_operations;
 extern const struct file_operations qnx4_dir_operations;
 extern int qnx4_is_free(struct super_block *sb, long block);
-extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy);
-extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd);
-extern void qnx4_truncate(struct inode *inode);
-extern void qnx4_free_inode(struct inode *inode);
-extern int qnx4_unlink(struct inode *dir, struct dentry *dentry);
-extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
 
 static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
 {
diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c
deleted file mode 100644
index d94d9ee241fe..000000000000
--- a/fs/qnx4/truncate.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * QNX4 file system, Linux implementation.
- *
- * Version : 0.1
- *
- * Using parts of the xiafs filesystem.
- *
- * History :
- *
- * 30-06-1998 by Frank DENIS : ugly filler.
- */
-
-#include <linux/smp_lock.h>
-#include "qnx4.h"
-
-#ifdef CONFIG_QNX4FS_RW
-
-void qnx4_truncate(struct inode *inode)
-{
-	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-	      S_ISLNK(inode->i_mode))) {
-		return;
-	}
-	lock_kernel();
-	if (!(S_ISDIR(inode->i_mode))) {
-		/* TODO */
-	}
-	QNX4DEBUG(("qnx4: qnx4_truncate called\n"));
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
-	mark_inode_dirty(inode);
-	unlock_kernel();
-}
-
-#endif
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 38f7bd559f35..39b49c42a7ed 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1839,7 +1839,7 @@ EXPORT_SYMBOL(dquot_commit_info);
 /*
  * Definitions of diskquota operations.
  */
-struct dquot_operations dquot_operations = {
+const struct dquot_operations dquot_operations = {
 	.initialize	= dquot_initialize,
 	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
@@ -2461,7 +2461,7 @@ out:
 }
 EXPORT_SYMBOL(vfs_set_dqinfo);
 
-struct quotactl_ops vfs_quotactl_ops = {
+const struct quotactl_ops vfs_quotactl_ops = {
 	.quota_on	= vfs_quota_on,
 	.quota_off	= vfs_quota_off,
 	.quota_sync	= vfs_quota_sync,
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a7f0110fca4c..a6090aa1a7c1 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -34,12 +34,10 @@
 #include <linux/ramfs.h>
 #include <linux/sched.h>
 #include <linux/parser.h>
+#include <linux/magic.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
-/* some random number */
-#define RAMFS_MAGIC	0x858458f6
-
 #define RAMFS_DEFAULT_MODE	0755
 
 static const struct super_operations ramfs_ops;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7adea74d6a8a..f0ad05f38022 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -612,7 +612,7 @@ static int reiserfs_mark_dquot_dirty(struct dquot *);
 static int reiserfs_write_info(struct super_block *, int);
 static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
 
-static struct dquot_operations reiserfs_quota_operations = {
+static const struct dquot_operations reiserfs_quota_operations = {
 	.initialize	= dquot_initialize,
 	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
@@ -629,7 +629,7 @@ static struct dquot_operations reiserfs_quota_operations = {
 	.destroy_dquot	= dquot_destroy,
 };
 
-static struct quotactl_ops reiserfs_qctl_operations = {
+static const struct quotactl_ops reiserfs_qctl_operations = {
 	.quota_on	= reiserfs_quota_on,
 	.quota_off	= vfs_quota_off,
 	.quota_sync	= vfs_quota_sync,
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 4ab3c03d8f95..c117fa80d1e9 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -284,7 +284,7 @@ static const struct file_operations romfs_dir_operations = {
284 .readdir = romfs_readdir, 284 .readdir = romfs_readdir,
285}; 285};
286 286
287static struct inode_operations romfs_dir_inode_operations = { 287static const struct inode_operations romfs_dir_inode_operations = {
288 .lookup = romfs_lookup, 288 .lookup = romfs_lookup,
289}; 289};
290 290
@@ -528,7 +528,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
 	pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
 
 	root = romfs_iget(sb, pos);
-	if (!root)
+	if (IS_ERR(root))
 		goto error;
 
 	sb->s_root = d_alloc_root(root);
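
The one-liner above is a real bug fix, not a cleanup: romfs_iget() reports failure through an ERR_PTR-encoded pointer, never NULL, so the old !root test could not catch a failed lookup. A sketch of the general pattern; the caller shown here is hypothetical:

	#include <linux/err.h>

	/* Functions that encode an errno into the returned pointer must
	 * be checked with IS_ERR(); the failure case is never NULL.
	 */
	static int example_mount_root(struct super_block *sb, unsigned long pos)
	{
		struct inode *root = romfs_iget(sb, pos);

		if (IS_ERR(root))
			return PTR_ERR(root);	/* recover the negative errno */

		sb->s_root = d_alloc_root(root);
		if (!sb->s_root) {
			iput(root);
			return -ENOMEM;
		}
		return 0;
	}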
diff --git a/fs/select.c b/fs/select.c
index 8084834e123e..a201fc370223 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -41,22 +41,28 @@
  * better solutions..
  */
 
+#define MAX_SLACK	(100 * NSEC_PER_MSEC)
+
 static long __estimate_accuracy(struct timespec *tv)
 {
 	long slack;
 	int divfactor = 1000;
 
+	if (tv->tv_sec < 0)
+		return 0;
+
 	if (task_nice(current) > 0)
 		divfactor = divfactor / 5;
 
+	if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
+		return MAX_SLACK;
+
 	slack = tv->tv_nsec / divfactor;
 	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);
 
-	if (slack > 100 * NSEC_PER_MSEC)
-		slack = 100 * NSEC_PER_MSEC;
+	if (slack > MAX_SLACK)
+		return MAX_SLACK;
 
-	if (slack < 0)
-		slack = 0;
 	return slack;
 }
 
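
To see what the rewritten __estimate_accuracy() computes: the slack is roughly the timeout divided by divfactor (1000, or 200 for positive-nice tasks), clamped to MAX_SLACK, with the new range checks catching negative timeouts and avoiding signed overflow for huge ones. A user-space sketch mirroring the arithmetic; the constants and the nice flag are stand-ins for the kernel's values:

	#include <stdio.h>

	#define NSEC_PER_SEC	1000000000L
	#define NSEC_PER_MSEC	1000000L
	#define MAX_SLACK	(100 * NSEC_PER_MSEC)

	/* ~0.1% of the timeout (0.5% when niced), capped at 100 ms.
	 * The early checks replace the old "clamp negatives to 0" fixup.
	 */
	static long estimate_accuracy(long sec, long nsec, int nice)
	{
		int divfactor = nice ? 1000 / 5 : 1000;
		long slack;

		if (sec < 0)
			return 0;
		if (sec > MAX_SLACK / (NSEC_PER_SEC / divfactor))
			return MAX_SLACK;

		slack = nsec / divfactor + sec * (NSEC_PER_SEC / divfactor);
		return slack > MAX_SLACK ? MAX_SLACK : slack;
	}

	int main(void)
	{
		/* A 10 ms timeout yields 10 us of slack; waits of 100 s
		 * or more hit the 100 ms cap.
		 */
		printf("%ld\n", estimate_accuracy(0, 10 * NSEC_PER_MSEC, 0));
		printf("%ld\n", estimate_accuracy(200, 0, 0));
		return 0;
	}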
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index 9468168b9af5..71c29b6670b4 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -509,7 +509,7 @@ date_unix2dos(struct smb_sb_info *server,
509 month = 2; 509 month = 2;
510 } else { 510 } else {
511 nl_day = (year & 3) || day <= 59 ? day : day - 1; 511 nl_day = (year & 3) || day <= 59 ? day : day - 1;
512 for (month = 0; month < 12; month++) 512 for (month = 1; month < 12; month++)
513 if (day_n[month] > nl_day) 513 if (day_n[month] > nl_day)
514 break; 514 break;
515 } 515 }
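
The smbfs change looks cosmetic but apparently is not: day_n[] holds cumulative day-of-year offsets with entry 0 equal to 0, so for an out-of-range (negative) day the old scan could break immediately with month == 0, and the caller's subsequent day_n[month - 1] indexed before the table; starting at 1 pins the result to a valid month. A self-contained model of the corrected lookup (table layout simplified to the non-leap case):

	#include <stdio.h>

	/* Cumulative day-of-year on which each month begins. */
	static const int day_n[] = {
		0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334
	};

	static int month_of(int nl_day)
	{
		int month;

		for (month = 1; month < 12; month++)
			if (day_n[month] > nl_day)
				break;
		return month;	/* 1..12; callers index day_n[month - 1] */
	}

	int main(void)
	{
		printf("%d\n", month_of(0));	/* 1 (January) */
		printf("%d\n", month_of(59));	/* 3 (March 1, non-leap) */
		printf("%d\n", month_of(-5));	/* 1, never 0 */
		return 0;
	}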
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index cb5fc57e370b..6c197ef53add 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -44,7 +44,7 @@
44#include "squashfs.h" 44#include "squashfs.h"
45 45
46static struct file_system_type squashfs_fs_type; 46static struct file_system_type squashfs_fs_type;
47static struct super_operations squashfs_super_ops; 47static const struct super_operations squashfs_super_ops;
48 48
49static int supported_squashfs_filesystem(short major, short minor, short comp) 49static int supported_squashfs_filesystem(short major, short minor, short comp)
50{ 50{
@@ -444,7 +444,7 @@ static struct file_system_type squashfs_fs_type = {
 	.fs_flags = FS_REQUIRES_DEV
 };
 
-static struct super_operations squashfs_super_ops = {
+static const struct super_operations squashfs_super_ops = {
 	.alloc_inode = squashfs_alloc_inode,
 	.destroy_inode = squashfs_destroy_inode,
 	.statfs = squashfs_statfs,
diff --git a/fs/super.c b/fs/super.c
index 9cda337ddae2..0e7207b9815c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,7 @@ DEFINE_SPINLOCK(sb_lock);
 static struct super_block *alloc_super(struct file_system_type *type)
 {
 	struct super_block *s = kzalloc(sizeof(struct super_block),  GFP_USER);
-	static struct super_operations default_op;
+	static const struct super_operations default_op;
 
 	if (s) {
 		if (security_sb_alloc(s)) {
@@ -707,6 +707,12 @@ static int set_bdev_super(struct super_block *s, void *data)
 {
 	s->s_bdev = data;
 	s->s_dev = s->s_bdev->bd_dev;
+
+	/*
+	 * We set the bdi here to the queue backing, file systems can
+	 * overwrite this in ->fill_super()
+	 */
+	s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
 	return 0;
 }
 
diff --git a/fs/sync.c b/fs/sync.c
index 192340930bb4..d104591b066b 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,6 +27,13 @@
  */
 static int __sync_filesystem(struct super_block *sb, int wait)
 {
+	/*
+	 * This should be safe, as we require bdi backing to actually
+	 * write out data in the first place
+	 */
+	if (!sb->s_bdi)
+		return 0;
+
 	/* Avoid doing twice syncing and cache pruning for quota sync */
 	if (!wait) {
 		writeout_quota_sb(sb, -1);
@@ -101,7 +108,7 @@ restart:
 		spin_unlock(&sb_lock);
 
 		down_read(&sb->s_umount);
-		if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
+		if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
 			__sync_filesystem(sb, wait);
 		up_read(&sb->s_umount);
 
@@ -176,6 +183,7 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 		ret = err;
 	return ret;
 }
+EXPORT_SYMBOL(file_fsync);
 
 /**
  * vfs_fsync_range - helper to sync a range of data & metadata to disk
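
Together with the super.c hunk above, which seeds sb->s_bdi from the block device's queue, these changes let the sync paths treat a NULL s_bdi as "no backing store, nothing to write". A hedged sketch of the resulting shape; the real __sync_filesystem() also handles quota writeout, which is elided here:

	/* Skip super blocks without backing-device info: there is
	 * nothing for write-back to do and no bdi to queue work on.
	 */
	static int example_sync_one_sb(struct super_block *sb, int wait)
	{
		if (!sb->s_bdi)
			return 0;

		sync_inodes_sb(sb);		/* write back dirty inodes */
		if (wait && sb->s_bdev)
			sync_blockdev(sb->s_bdev);
		return 0;
	}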
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 1c8991b0db13..076ca50e9933 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -54,29 +54,15 @@
  * @nr_to_write: how many dirty pages to write-back
  *
  * This function shrinks UBIFS liability by means of writing back some amount
- * of dirty inodes and their pages. Returns the amount of pages which were
- * written back. The returned value does not include dirty inodes which were
- * synchronized.
+ * of dirty inodes and their pages.
  *
  * Note, this function synchronizes even VFS inodes which are locked
  * (@i_mutex) by the caller of the budgeting function, because write-back does
  * not touch @i_mutex.
  */
-static int shrink_liability(struct ubifs_info *c, int nr_to_write)
+static void shrink_liability(struct ubifs_info *c, int nr_to_write)
 {
-	int nr_written;
-
-	nr_written = writeback_inodes_sb(c->vfs_sb);
-	if (!nr_written) {
-		/*
-		 * Re-try again but wait on pages/inodes which are being
-		 * written-back concurrently (e.g., by pdflush).
-		 */
-		nr_written = sync_inodes_sb(c->vfs_sb);
-	}
-
-	dbg_budg("%d pages were written back", nr_written);
-	return nr_written;
+	writeback_inodes_sb(c->vfs_sb);
 }
 
82/** 68/**
@@ -729,7 +715,7 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
  * ubifs_get_free_space - return amount of free space.
  * @c: UBIFS file-system description object
  *
- * This function calculates and retuns amount of free space to report to
+ * This function calculates and returns amount of free space to report to
  * user-space.
  */
 long long ubifs_get_free_space(struct ubifs_info *c)
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index f3a7945527fb..4775af401167 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -510,7 +510,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
510 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; 510 int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
511 int first = 1, iip; 511 int first = 1, iip;
512 struct ubifs_debug_info *d = c->dbg; 512 struct ubifs_debug_info *d = c->dbg;
513 union ubifs_key lower_key, upper_key, l_key, u_key; 513 union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key;
514 unsigned long long uninitialized_var(last_sqnum); 514 unsigned long long uninitialized_var(last_sqnum);
515 struct ubifs_idx_node *idx; 515 struct ubifs_idx_node *idx;
516 struct list_head list; 516 struct list_head list;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index ce2cd8343618..dbc093afd946 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -210,6 +210,20 @@ const char *dbg_cstate(int cmt_state)
 	}
 }
 
+const char *dbg_jhead(int jhead)
+{
+	switch (jhead) {
+	case GCHD:
+		return "0 (GC)";
+	case BASEHD:
+		return "1 (base)";
+	case DATAHD:
+		return "2 (data)";
+	default:
+		return "unknown journal head";
+	}
+}
+
 static void dump_ch(const struct ubifs_ch *ch)
 {
 	printk(KERN_DEBUG "\tmagic          %#x\n", le32_to_cpu(ch->magic));
@@ -623,8 +637,9 @@ void dbg_dump_budg(struct ubifs_info *c)
 	/* If we are in R/O mode, journal heads do not exist */
 	if (c->jheads)
 		for (i = 0; i < c->jhead_cnt; i++)
-			printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
-			       c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
+			printk(KERN_DEBUG "\tjhead %s\t LEB %d\n",
+			       dbg_jhead(c->jheads[i].wbuf.jhead),
+			       c->jheads[i].wbuf.lnum);
 	for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
 		bud = rb_entry(rb, struct ubifs_bud, rb);
 		printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
@@ -648,9 +663,90 @@ void dbg_dump_budg(struct ubifs_info *c)
 
 void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
 {
-	printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), "
-	       "flags %#x\n", lp->lnum, lp->free, lp->dirty,
-	       c->leb_size - lp->free - lp->dirty, lp->flags);
+	int i, spc, dark = 0, dead = 0;
+	struct rb_node *rb;
+	struct ubifs_bud *bud;
+
+	spc = lp->free + lp->dirty;
+	if (spc < c->dead_wm)
+		dead = spc;
+	else
+		dark = ubifs_calc_dark(c, spc);
+
+	if (lp->flags & LPROPS_INDEX)
+		printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
+		       "free + dirty %-8d flags %#x (", lp->lnum, lp->free,
+		       lp->dirty, c->leb_size - spc, spc, lp->flags);
+	else
+		printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
+		       "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d "
+		       "flags %#-4x (", lp->lnum, lp->free, lp->dirty,
+		       c->leb_size - spc, spc, dark, dead,
+		       (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags);
+
+	if (lp->flags & LPROPS_TAKEN) {
+		if (lp->flags & LPROPS_INDEX)
+			printk(KERN_CONT "index, taken");
+		else
+			printk(KERN_CONT "taken");
+	} else {
+		const char *s;
+
+		if (lp->flags & LPROPS_INDEX) {
+			switch (lp->flags & LPROPS_CAT_MASK) {
+			case LPROPS_DIRTY_IDX:
+				s = "dirty index";
+				break;
+			case LPROPS_FRDI_IDX:
+				s = "freeable index";
+				break;
+			default:
+				s = "index";
+			}
+		} else {
+			switch (lp->flags & LPROPS_CAT_MASK) {
+			case LPROPS_UNCAT:
+				s = "not categorized";
+				break;
+			case LPROPS_DIRTY:
+				s = "dirty";
+				break;
+			case LPROPS_FREE:
+				s = "free";
+				break;
+			case LPROPS_EMPTY:
+				s = "empty";
+				break;
+			case LPROPS_FREEABLE:
+				s = "freeable";
+				break;
+			default:
+				s = NULL;
+				break;
+			}
+		}
+		printk(KERN_CONT "%s", s);
+	}
+
+	for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) {
+		bud = rb_entry(rb, struct ubifs_bud, rb);
+		if (bud->lnum == lp->lnum) {
+			int head = 0;
+			for (i = 0; i < c->jhead_cnt; i++) {
+				if (lp->lnum == c->jheads[i].wbuf.lnum) {
+					printk(KERN_CONT ", jhead %s",
+					       dbg_jhead(i));
+					head = 1;
+				}
+			}
+			if (!head)
+				printk(KERN_CONT ", bud of jhead %s",
+				       dbg_jhead(bud->jhead));
+		}
+	}
+	if (lp->lnum == c->gc_lnum)
+		printk(KERN_CONT ", GC LEB");
+	printk(KERN_CONT ")\n");
 }
 
 void dbg_dump_lprops(struct ubifs_info *c)
@@ -724,7 +820,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
 
 	printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
 	       current->pid, lnum);
-	sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
+	sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
 	if (IS_ERR(sleb)) {
 		ubifs_err("scan error %d", (int)PTR_ERR(sleb));
 		return;
@@ -909,8 +1005,10 @@ out:
 	ubifs_msg("saved lprops statistics dump");
 	dbg_dump_lstats(&d->saved_lst);
 	ubifs_get_lp_stats(c, &lst);
+
 	ubifs_msg("current lprops statistics dump");
-	dbg_dump_lstats(&d->saved_lst);
+	dbg_dump_lstats(&lst);
+
 	spin_lock(&c->space_lock);
 	dbg_dump_budg(c);
 	spin_unlock(&c->space_lock);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index c1cd73b2e06e..29d960101ea6 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -271,6 +271,7 @@ void ubifs_debugging_exit(struct ubifs_info *c);
 /* Dump functions */
 const char *dbg_ntype(int type);
 const char *dbg_cstate(int cmt_state);
+const char *dbg_jhead(int jhead);
 const char *dbg_get_key_dump(const struct ubifs_info *c,
 			     const union ubifs_key *key);
 void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
@@ -321,6 +322,8 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
 int dbg_check_lprops(struct ubifs_info *c);
 int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
 			int row, int col);
+int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
+			 loff_t size);
 
 /* Force the use of in-the-gaps method for testing */
 
@@ -425,6 +428,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
 
 #define dbg_ntype(type)                       ""
 #define dbg_cstate(cmt_state)                 ""
+#define dbg_jhead(jhead)                      ""
 #define dbg_get_key_dump(c, key)              ({})
 #define dbg_dump_inode(c, inode)              ({})
 #define dbg_dump_node(c, node)                ({})
@@ -460,6 +464,7 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
 #define dbg_check_heap(c, heap, cat, add_pos) ({})
 #define dbg_check_lprops(c)                   0
 #define dbg_check_lpt_nodes(c, cnode, row, col) 0
+#define dbg_check_inode_size(c, inode, size)  0
 #define dbg_force_in_the_gaps_enabled         0
 #define dbg_force_in_the_gaps()               0
 #define dbg_failure_mode                      0
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 6d34dc7e33e1..2e6481a7701c 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -21,34 +21,32 @@
  */
 
 /*
- * This file implements VFS file and inode operations of regular files, device
+ * This file implements VFS file and inode operations for regular files, device
  * nodes and symlinks as well as address space operations.
  *
- * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the
- * page is dirty and is used for budgeting purposes - dirty pages should not be
- * budgeted. The PG_checked flag is set if full budgeting is required for the
- * page e.g., when it corresponds to a file hole or it is just beyond the file
- * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to
- * fail in this function, and the budget is released in 'ubifs_write_end()'. So
- * the PG_private and PG_checked flags carry the information about how the page
- * was budgeted, to make it possible to release the budget properly.
+ * UBIFS uses 2 page flags: @PG_private and @PG_checked. @PG_private is set if
+ * the page is dirty and is used for optimization purposes - dirty pages are
+ * not budgeted so the flag shows that 'ubifs_write_end()' should not release
+ * the budget for this page. The @PG_checked flag is set if full budgeting is
+ * required for the page e.g., when it corresponds to a file hole or it is
+ * beyond the file size. The budgeting is done in 'ubifs_write_begin()', because
+ * it is OK to fail in this function, and the budget is released in
+ * 'ubifs_write_end()'. So the @PG_private and @PG_checked flags carry
+ * information about how the page was budgeted, to make it possible to release
+ * the budget properly.
  *
- * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations
- * we implement. However, this is not true for '->writepage()', which might be
- * called with 'i_mutex' unlocked. For example, when pdflush is performing
- * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the
- * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is
- * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim
- * path'. So, in '->writepage()' we are only guaranteed that the page is
- * locked.
+ * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we
+ * implement. However, this is not true for 'ubifs_writepage()', which may be
+ * called with @i_mutex unlocked. For example, when pdflush is doing background
+ * write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. At "normal"
+ * work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. in the
+ * "sys_write -> alloc_pages -> direct reclaim path". So, in 'ubifs_writepage()'
+ * we are only guaranteed that the page is locked.
  *
- * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g.,
- * readahead path does not have it locked ("sys_read -> generic_file_aio_read
- * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is
- * not set as well. However, UBIFS disables readahead.
- *
- * This, for example means that there might be 2 concurrent '->writepage()'
- * calls for the same inode, but different inode dirty pages.
+ * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
+ * read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
+ * ondemand_readahead -> readpage"). In case of readahead, @I_LOCK flag is not
+ * set as well. However, UBIFS disables readahead.
  */
53 51
54#include "ubifs.h" 52#include "ubifs.h"
@@ -449,9 +447,9 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 		/*
 		 * We change whole page so no need to load it. But we
 		 * have to set the @PG_checked flag to make the further
-		 * code the page is new. This might be not true, but it
-		 * is better to budget more that to read the page from
-		 * the media.
+		 * code know that the page is new. This might be not
+		 * true, but it is better to budget more than to read
+		 * the page from the media.
 		 */
 		SetPageChecked(page);
 		skipped_read = 1;
@@ -497,8 +495,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	}
 
 	/*
-	 * Whee, we aquired budgeting quickly - without involving
-	 * garbage-collection, committing or forceing write-back. We return
+	 * Whee, we acquired budgeting quickly - without involving
+	 * garbage-collection, committing or forcing write-back. We return
 	 * with @ui->ui_mutex locked if we are appending pages, and unlocked
 	 * otherwise. This is an optimization (slightly hacky though).
 	 */
@@ -562,7 +560,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
 
 	/*
 	 * Return 0 to force VFS to repeat the whole operation, or the
-	 * error code if 'do_readpage()' failes.
+	 * error code if 'do_readpage()' fails.
 	 */
 	copied = do_readpage(page);
 	goto out;
@@ -1175,11 +1173,11 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 	ui->ui_size = inode->i_size;
 	/* Truncation changes inode [mc]time */
 	inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
-	/* The other attributes may be changed at the same time as well */
+	/* Other attributes may be changed at the same time as well */
 	do_attr_changes(inode, attr);
-
 	err = ubifs_jnl_truncate(c, inode, old_size, new_size);
 	mutex_unlock(&ui->ui_mutex);
+
 out_budg:
 	if (budgeted)
 		ubifs_release_budget(c, &req);
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index f0f5f15d384e..618c2701d3a7 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -529,7 +529,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
529 * We scan the entire LEB even though we only really need to scan up to 529 * We scan the entire LEB even though we only really need to scan up to
530 * (c->leb_size - lp->free). 530 * (c->leb_size - lp->free).
531 */ 531 */
532 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 532 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
533 if (IS_ERR(sleb)) 533 if (IS_ERR(sleb))
534 return PTR_ERR(sleb); 534 return PTR_ERR(sleb);
535 535
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 762a7d6cec73..e589fedaf1ef 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -297,7 +297,7 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer)
 {
 	struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer);
 
-	dbg_io("jhead %d", wbuf->jhead);
+	dbg_io("jhead %s", dbg_jhead(wbuf->jhead));
 	wbuf->need_sync = 1;
 	wbuf->c->need_wbuf_sync = 1;
 	ubifs_wake_up_bgt(wbuf->c);
@@ -314,7 +314,8 @@ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
 
 	if (wbuf->no_timer)
 		return;
-	dbg_io("set timer for jhead %d, %llu-%llu millisecs", wbuf->jhead,
+	dbg_io("set timer for jhead %s, %llu-%llu millisecs",
+	       dbg_jhead(wbuf->jhead),
 	       div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC),
 	       div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta,
 		       USEC_PER_SEC));
@@ -351,8 +352,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
 		/* Write-buffer is empty or not seeked */
 		return 0;
 
-	dbg_io("LEB %d:%d, %d bytes, jhead %d",
-	       wbuf->lnum, wbuf->offs, wbuf->used, wbuf->jhead);
+	dbg_io("LEB %d:%d, %d bytes, jhead %s",
+	       wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
 	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
 	ubifs_assert(!(wbuf->avail & 7));
 	ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
@@ -401,7 +402,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
 {
 	const struct ubifs_info *c = wbuf->c;
 
-	dbg_io("LEB %d:%d, jhead %d", lnum, offs, wbuf->jhead);
+	dbg_io("LEB %d:%d, jhead %s", lnum, offs, dbg_jhead(wbuf->jhead));
 	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
 	ubifs_assert(offs >= 0 && offs <= c->leb_size);
 	ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
@@ -508,9 +509,9 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	struct ubifs_info *c = wbuf->c;
 	int err, written, n, aligned_len = ALIGN(len, 8), offs;
 
-	dbg_io("%d bytes (%s) to jhead %d wbuf at LEB %d:%d", len,
-	       dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->jhead,
-	       wbuf->lnum, wbuf->offs + wbuf->used);
+	dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len,
+	       dbg_ntype(((struct ubifs_ch *)buf)->node_type),
+	       dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs + wbuf->used);
 	ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
 	ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
 	ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
@@ -535,8 +536,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	memcpy(wbuf->buf + wbuf->used, buf, len);
 
 	if (aligned_len == wbuf->avail) {
-		dbg_io("flush jhead %d wbuf to LEB %d:%d",
-		       wbuf->jhead, wbuf->lnum, wbuf->offs);
+		dbg_io("flush jhead %s wbuf to LEB %d:%d",
+		       dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
 		err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
 				    wbuf->offs, c->min_io_size,
 				    wbuf->dtype);
@@ -564,8 +565,8 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
 	 * minimal I/O unit. We have to fill and flush write-buffer and switch
 	 * to the next min. I/O unit.
 	 */
-	dbg_io("flush jhead %d wbuf to LEB %d:%d",
-	       wbuf->jhead, wbuf->lnum, wbuf->offs);
+	dbg_io("flush jhead %s wbuf to LEB %d:%d",
+	       dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
 	memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
 	err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
 			    c->min_io_size, wbuf->dtype);
@@ -698,8 +699,8 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
 	int err, rlen, overlap;
 	struct ubifs_ch *ch = buf;
 
-	dbg_io("LEB %d:%d, %s, length %d, jhead %d", lnum, offs,
-	       dbg_ntype(type), len, wbuf->jhead);
+	dbg_io("LEB %d:%d, %s, length %d, jhead %s", lnum, offs,
+	       dbg_ntype(type), len, dbg_jhead(wbuf->jhead));
 	ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
 	ubifs_assert(!(offs & 7) && offs < c->leb_size);
 	ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 64b5f3a309f5..d321baeca68d 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -158,7 +158,7 @@ again:
 	 * some. But the write-buffer mutex has to be unlocked because
 	 * GC also takes it.
 	 */
-	dbg_jnl("no free space jhead %d, run GC", jhead);
+	dbg_jnl("no free space in jhead %s, run GC", dbg_jhead(jhead));
 	mutex_unlock(&wbuf->io_mutex);
 
 	lnum = ubifs_garbage_collect(c, 0);
@@ -173,7 +173,8 @@ again:
 		 * because we dropped @wbuf->io_mutex, so try once
 		 * again.
 		 */
-		dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead);
+		dbg_jnl("GC couldn't make a free LEB for jhead %s",
+			dbg_jhead(jhead));
 		if (retries++ < 2) {
 			dbg_jnl("retry (%d)", retries);
 			goto again;
@@ -184,7 +185,7 @@ again:
 	}
 
 	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
-	dbg_jnl("got LEB %d for jhead %d", lnum, jhead);
+	dbg_jnl("got LEB %d for jhead %s", lnum, dbg_jhead(jhead));
 	avail = c->leb_size - wbuf->offs - wbuf->used;
 
 	if (wbuf->lnum != -1 && avail >= len) {
@@ -255,7 +256,8 @@ static int write_node(struct ubifs_info *c, int jhead, void *node, int len,
 	*lnum = c->jheads[jhead].wbuf.lnum;
 	*offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
 
-	dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
+	dbg_jnl("jhead %s, LEB %d:%d, len %d",
+		dbg_jhead(jhead), *lnum, *offs, len);
 	ubifs_prepare_node(c, node, len, 0);
 
 	return ubifs_wbuf_write_nolock(wbuf, node, len);
@@ -285,7 +287,8 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
 
 	*lnum = c->jheads[jhead].wbuf.lnum;
 	*offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
-	dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
+	dbg_jnl("jhead %s, LEB %d:%d, len %d",
+		dbg_jhead(jhead), *lnum, *offs, len);
 
 	err = ubifs_wbuf_write_nolock(wbuf, buf, len);
 	if (err)
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 5fa27ea031ba..0f530c684f0b 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -229,23 +229,6 @@ static inline void xent_key_init(const struct ubifs_info *c,
 }
 
 /**
- * xent_key_init_hash - initialize extended attribute entry key without
- *                      re-calculating hash function.
- * @c: UBIFS file-system description object
- * @key: key to initialize
- * @inum: host inode number
- * @hash: extended attribute entry name hash
- */
-static inline void xent_key_init_hash(const struct ubifs_info *c,
-				      union ubifs_key *key, ino_t inum,
-				      uint32_t hash)
-{
-	ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
-	key->u32[0] = inum;
-	key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS);
-}
-
-/**
  * xent_key_init_flash - initialize on-flash extended attribute entry key.
  * @c: UBIFS file-system description object
  * @k: key to initialize
@@ -295,22 +278,15 @@ static inline void data_key_init(const struct ubifs_info *c,
 }
 
 /**
- * data_key_init_flash - initialize on-flash data key.
+ * highest_data_key - get the highest possible data key for an inode.
  * @c: UBIFS file-system description object
- * @k: key to initialize
+ * @key: key to initialize
  * @inum: inode number
- * @block: block number
  */
-static inline void data_key_init_flash(const struct ubifs_info *c, void *k,
-				       ino_t inum, unsigned int block)
+static inline void highest_data_key(const struct ubifs_info *c,
+				    union ubifs_key *key, ino_t inum)
 {
-	union ubifs_key *key = k;
-
-	ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
-	key->j32[0] = cpu_to_le32(inum);
-	key->j32[1] = cpu_to_le32(block |
-				  (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS));
-	memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+	data_key_init(c, key, inum, UBIFS_S_KEY_BLOCK_MASK);
 }
 
 /**
@@ -554,4 +530,5 @@ static inline unsigned long long key_max_inode_size(const struct ubifs_info *c)
 		return 0;
 	}
 }
+
 #endif /* !__UBIFS_KEY_H__ */
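
highest_data_key() produces the largest key a data node of @inum can carry, which makes whole-inode range operations straightforward. A hedged sketch of the intended use, bulk-removing an inode's data nodes from the TNC; error handling and journal interaction are omitted, and ubifs_tnc_remove_range() is assumed to take an inclusive key range:

	/* Delete every data node of an inode by removing the TNC key
	 * range [first data key, highest possible data key].
	 */
	static int example_remove_inode_data(struct ubifs_info *c, ino_t inum)
	{
		union ubifs_key from_key, to_key;

		data_key_init(c, &from_key, inum, 0);	/* block 0 */
		highest_data_key(c, &to_key, inum);	/* highest block */

		return ubifs_tnc_remove_range(c, &from_key, &to_key);
	}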
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 56e33772a1ee..c345e125f42c 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -169,8 +169,8 @@ void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
 	 */
 	c->bud_bytes += c->leb_size - bud->start;
 
-	dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum,
-		bud->start, bud->jhead, c->bud_bytes);
+	dbg_log("LEB %d:%d, jhead %s, bud_bytes %lld", bud->lnum,
+		bud->start, dbg_jhead(bud->jhead), c->bud_bytes);
 	spin_unlock(&c->buds_lock);
 }
 
176 176
@@ -355,16 +355,16 @@ static void remove_buds(struct ubifs_info *c)
 		 * heads (non-closed buds).
 		 */
 		c->cmt_bud_bytes += wbuf->offs - bud->start;
-		dbg_log("preserve %d:%d, jhead %d, bud bytes %d, "
+		dbg_log("preserve %d:%d, jhead %s, bud bytes %d, "
 			"cmt_bud_bytes %lld", bud->lnum, bud->start,
-			bud->jhead, wbuf->offs - bud->start,
+			dbg_jhead(bud->jhead), wbuf->offs - bud->start,
 			c->cmt_bud_bytes);
 		bud->start = wbuf->offs;
 	} else {
 		c->cmt_bud_bytes += c->leb_size - bud->start;
-		dbg_log("remove %d:%d, jhead %d, bud bytes %d, "
+		dbg_log("remove %d:%d, jhead %s, bud bytes %d, "
 			"cmt_bud_bytes %lld", bud->lnum, bud->start,
-			bud->jhead, c->leb_size - bud->start,
+			dbg_jhead(bud->jhead), c->leb_size - bud->start,
 			c->cmt_bud_bytes);
 		rb_erase(p1, &c->buds);
 		/*
@@ -429,7 +429,8 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
 		if (lnum == -1 || offs == c->leb_size)
 			continue;
 
-		dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i);
+		dbg_log("add ref to LEB %d:%d for jhead %s",
+			lnum, offs, dbg_jhead(i));
 		ref = buf + len;
 		ref->ch.node_type = UBIFS_REF_NODE;
 		ref->lnum = cpu_to_le32(lnum);
@@ -695,7 +696,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
695 lnum = c->ltail_lnum; 696 lnum = c->ltail_lnum;
696 write_lnum = lnum; 697 write_lnum = lnum;
697 while (1) { 698 while (1) {
698 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 699 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 0);
699 if (IS_ERR(sleb)) { 700 if (IS_ERR(sleb)) {
700 err = PTR_ERR(sleb); 701 err = PTR_ERR(sleb);
701 goto out_free; 702 goto out_free;
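
These hunks switch the log messages from raw journal head numbers to names via dbg_jhead(). A plausible sketch of such a helper, assuming the conventional GC/base/data head numbering; the exact strings in UBIFS's debug code may differ:

/*
 * Sketch of a dbg_jhead()-style helper: map journal head numbers to
 * human-readable names for log messages. Head numbering follows the
 * GCHD/BASEHD/DATAHD convention; the strings are illustrative.
 */
static const char *dbg_jhead(int jhead)
{
	switch (jhead) {
	case 0:  return "GC head";
	case 1:  return "base head";
	case 2:  return "data head";
	default: return "unknown head";
	}
}
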
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 4cdd284dea56..4d4ca388889b 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -281,7 +281,7 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
281 case LPROPS_FREE: 281 case LPROPS_FREE:
282 if (add_to_lpt_heap(c, lprops, cat)) 282 if (add_to_lpt_heap(c, lprops, cat))
283 break; 283 break;
284 /* No more room on heap so make it uncategorized */ 284 /* No more room on heap so make it un-categorized */
285 cat = LPROPS_UNCAT; 285 cat = LPROPS_UNCAT;
286 /* Fall through */ 286 /* Fall through */
287 case LPROPS_UNCAT: 287 case LPROPS_UNCAT:
@@ -375,8 +375,8 @@ void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
375 * @lprops: LEB properties 375 * @lprops: LEB properties
376 * 376 *
377 * A LEB may have fallen off of the bottom of a heap, and ended up as 377 * A LEB may have fallen off of the bottom of a heap, and ended up as
378 * uncategorized even though it has enough space for us now. If that is the case 378 * un-categorized even though it has enough space for us now. If that is the
379 * this function will put the LEB back onto a heap. 379 * case this function will put the LEB back onto a heap.
380 */ 380 */
381void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops) 381void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops)
382{ 382{
@@ -436,10 +436,10 @@ int ubifs_categorize_lprops(const struct ubifs_info *c,
436/** 436/**
437 * change_category - change LEB properties category. 437 * change_category - change LEB properties category.
438 * @c: UBIFS file-system description object 438 * @c: UBIFS file-system description object
439 * @lprops: LEB properties to recategorize 439 * @lprops: LEB properties to re-categorize
440 * 440 *
441 * LEB properties are categorized to enable fast find operations. When the LEB 441 * LEB properties are categorized to enable fast find operations. When the LEB
442 * properties change they must be recategorized. 442 * properties change they must be re-categorized.
443 */ 443 */
444static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) 444static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
445{ 445{
@@ -461,21 +461,18 @@ static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
461} 461}
462 462
463/** 463/**
464 * calc_dark - calculate LEB dark space size. 464 * ubifs_calc_dark - calculate LEB dark space size.
465 * @c: the UBIFS file-system description object 465 * @c: the UBIFS file-system description object
466 * @spc: amount of free and dirty space in the LEB 466 * @spc: amount of free and dirty space in the LEB
467 * 467 *
468 * This function calculates amount of dark space in an LEB which has @spc bytes 468 * This function calculates and returns amount of dark space in an LEB which
469 * of free and dirty space. Returns the calculations result. 469 * has @spc bytes of free and dirty space.
470 * 470 *
 471 * Dark space is the space which is not always usable - it depends on which 471 * UBIFS is trying to account for the space which might not be usable, and
472 * nodes are written in which order. E.g., if an LEB has only 512 free bytes, 472 * space is called "dark space". For example, if an LEB has only %512 free
473 * it is dark space, because it cannot fit a large data node. So UBIFS cannot 473 * bytes, it is dark space, because it cannot fit a large data node.
474 * count on this LEB and treat these 512 bytes as usable because it is not true
475 * if, for example, only big chunks of uncompressible data will be written to
476 * the FS.
477 */ 474 */
478static int calc_dark(struct ubifs_info *c, int spc) 475int ubifs_calc_dark(const struct ubifs_info *c, int spc)
479{ 476{
480 ubifs_assert(!(spc & 7)); 477 ubifs_assert(!(spc & 7));
481 478
@@ -518,7 +515,7 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
518 * @free: new free space amount 515 * @free: new free space amount
519 * @dirty: new dirty space amount 516 * @dirty: new dirty space amount
520 * @flags: new flags 517 * @flags: new flags
521 * @idx_gc_cnt: change to the count of idx_gc list 518 * @idx_gc_cnt: change to the count of @idx_gc list
522 * 519 *
523 * This function changes LEB properties (@free, @dirty or @flag). However, the 520 * This function changes LEB properties (@free, @dirty or @flag). However, the
524 * property which has the %LPROPS_NC value is not changed. Returns a pointer to 521 * property which has the %LPROPS_NC value is not changed. Returns a pointer to
@@ -535,7 +532,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
535{ 532{
536 /* 533 /*
537 * This is the only function that is allowed to change lprops, so we 534 * This is the only function that is allowed to change lprops, so we
538 * discard the const qualifier. 535 * discard the "const" qualifier.
539 */ 536 */
540 struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp; 537 struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp;
541 538
@@ -575,7 +572,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
575 if (old_spc < c->dead_wm) 572 if (old_spc < c->dead_wm)
576 c->lst.total_dead -= old_spc; 573 c->lst.total_dead -= old_spc;
577 else 574 else
578 c->lst.total_dark -= calc_dark(c, old_spc); 575 c->lst.total_dark -= ubifs_calc_dark(c, old_spc);
579 576
580 c->lst.total_used -= c->leb_size - old_spc; 577 c->lst.total_used -= c->leb_size - old_spc;
581 } 578 }
@@ -616,7 +613,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
616 if (new_spc < c->dead_wm) 613 if (new_spc < c->dead_wm)
617 c->lst.total_dead += new_spc; 614 c->lst.total_dead += new_spc;
618 else 615 else
619 c->lst.total_dark += calc_dark(c, new_spc); 616 c->lst.total_dark += ubifs_calc_dark(c, new_spc);
620 617
621 c->lst.total_used += c->leb_size - new_spc; 618 c->lst.total_used += c->leb_size - new_spc;
622 } 619 }
@@ -1096,7 +1093,7 @@ static int scan_check_cb(struct ubifs_info *c,
1096 } 1093 }
1097 } 1094 }
1098 1095
1099 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); 1096 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
1100 if (IS_ERR(sleb)) { 1097 if (IS_ERR(sleb)) {
1101 /* 1098 /*
1102 * After an unclean unmount, empty and freeable LEBs 1099 * After an unclean unmount, empty and freeable LEBs
@@ -1107,7 +1104,7 @@ static int scan_check_cb(struct ubifs_info *c,
1107 "- continuing checking"); 1104 "- continuing checking");
1108 lst->empty_lebs += 1; 1105 lst->empty_lebs += 1;
1109 lst->total_free += c->leb_size; 1106 lst->total_free += c->leb_size;
1110 lst->total_dark += calc_dark(c, c->leb_size); 1107 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1111 return LPT_SCAN_CONTINUE; 1108 return LPT_SCAN_CONTINUE;
1112 } 1109 }
1113 1110
@@ -1117,7 +1114,7 @@ static int scan_check_cb(struct ubifs_info *c,
1117 "- continuing checking"); 1114 "- continuing checking");
1118 lst->total_free += lp->free; 1115 lst->total_free += lp->free;
1119 lst->total_dirty += lp->dirty; 1116 lst->total_dirty += lp->dirty;
1120 lst->total_dark += calc_dark(c, c->leb_size); 1117 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1121 return LPT_SCAN_CONTINUE; 1118 return LPT_SCAN_CONTINUE;
1122 } 1119 }
1123 data->err = PTR_ERR(sleb); 1120 data->err = PTR_ERR(sleb);
@@ -1235,7 +1232,7 @@ static int scan_check_cb(struct ubifs_info *c,
1235 if (spc < c->dead_wm) 1232 if (spc < c->dead_wm)
1236 lst->total_dead += spc; 1233 lst->total_dead += spc;
1237 else 1234 else
1238 lst->total_dark += calc_dark(c, spc); 1235 lst->total_dark += ubifs_calc_dark(c, spc);
1239 } 1236 }
1240 1237
1241 ubifs_scan_destroy(sleb); 1238 ubifs_scan_destroy(sleb);
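
ubifs_calc_dark(), now exported from lprops.c, caps how much sub-watermark free space the accounting trusts. A userspace sketch of the shape of that calculation, with illustrative watermark and minimum-write values rather than the ones UBIFS derives from the media geometry:

/*
 * Userspace sketch of the dark-space idea: free space below a watermark
 * cannot be relied upon, because a large node may not fit there.
 * DARK_WM and MIN_WRITE_SZ are assumed example values.
 */
#include <assert.h>

#define DARK_WM      2048	/* assumed dark-space watermark */
#define MIN_WRITE_SZ 8		/* assumed smallest node write */

static int calc_dark(int spc)
{
	assert((spc & 7) == 0);		/* space is 8-byte aligned */

	if (spc < DARK_WM)
		return spc;		/* all of it is dark */
	if (spc - DARK_WM < MIN_WRITE_SZ)
		return spc - MIN_WRITE_SZ;
	return DARK_WM;			/* at most one watermark's worth */
}

int main(void)
{
	/* 1 KiB free: entirely dark; 4 KiB free: capped at the watermark */
	return !(calc_dark(1024) == 1024 && calc_dark(4096) == 2048);
}
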
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index a88f33801b98..28beaeedadc0 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -29,7 +29,8 @@
29 * @c: UBIFS file-system description object 29 * @c: UBIFS file-system description object
30 * 30 *
 31 * This function scans the master node LEBs and searches for the latest master 31 * This function scans the master node LEBs and searches for the latest master
 32 * node. Returns zero in case of success and a negative error code in case of 32 * node. Returns zero in case of success, %-EUCLEAN if the master area is
33 * corrupted and requires recovery, and a negative error code in case of
33 * failure. 34 * failure.
34 */ 35 */
35static int scan_for_master(struct ubifs_info *c) 36static int scan_for_master(struct ubifs_info *c)
@@ -40,7 +41,7 @@ static int scan_for_master(struct ubifs_info *c)
40 41
41 lnum = UBIFS_MST_LNUM; 42 lnum = UBIFS_MST_LNUM;
42 43
43 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 44 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
44 if (IS_ERR(sleb)) 45 if (IS_ERR(sleb))
45 return PTR_ERR(sleb); 46 return PTR_ERR(sleb);
46 nodes_cnt = sleb->nodes_cnt; 47 nodes_cnt = sleb->nodes_cnt;
@@ -48,7 +49,7 @@ static int scan_for_master(struct ubifs_info *c)
48 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, 49 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
49 list); 50 list);
50 if (snod->type != UBIFS_MST_NODE) 51 if (snod->type != UBIFS_MST_NODE)
51 goto out; 52 goto out_dump;
52 memcpy(c->mst_node, snod->node, snod->len); 53 memcpy(c->mst_node, snod->node, snod->len);
53 offs = snod->offs; 54 offs = snod->offs;
54 } 55 }
@@ -56,7 +57,7 @@ static int scan_for_master(struct ubifs_info *c)
56 57
57 lnum += 1; 58 lnum += 1;
58 59
59 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 60 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
60 if (IS_ERR(sleb)) 61 if (IS_ERR(sleb))
61 return PTR_ERR(sleb); 62 return PTR_ERR(sleb);
62 if (sleb->nodes_cnt != nodes_cnt) 63 if (sleb->nodes_cnt != nodes_cnt)
@@ -65,7 +66,7 @@ static int scan_for_master(struct ubifs_info *c)
65 goto out; 66 goto out;
66 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list); 67 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
67 if (snod->type != UBIFS_MST_NODE) 68 if (snod->type != UBIFS_MST_NODE)
68 goto out; 69 goto out_dump;
69 if (snod->offs != offs) 70 if (snod->offs != offs)
70 goto out; 71 goto out;
71 if (memcmp((void *)c->mst_node + UBIFS_CH_SZ, 72 if (memcmp((void *)c->mst_node + UBIFS_CH_SZ,
@@ -78,6 +79,12 @@ static int scan_for_master(struct ubifs_info *c)
78 79
79out: 80out:
80 ubifs_scan_destroy(sleb); 81 ubifs_scan_destroy(sleb);
82 return -EUCLEAN;
83
84out_dump:
85 ubifs_err("unexpected node type %d master LEB %d:%d",
86 snod->type, lnum, snod->offs);
87 ubifs_scan_destroy(sleb);
81 return -EINVAL; 88 return -EINVAL;
82} 89}
83 90
@@ -256,7 +263,8 @@ int ubifs_read_master(struct ubifs_info *c)
256 263
257 err = scan_for_master(c); 264 err = scan_for_master(c);
258 if (err) { 265 if (err) {
259 err = ubifs_recover_master_node(c); 266 if (err == -EUCLEAN)
267 err = ubifs_recover_master_node(c);
260 if (err) 268 if (err)
261 /* 269 /*
262 * Note, we do not free 'c->mst_node' here because the 270 * Note, we do not free 'c->mst_node' here because the
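
The value of returning %-EUCLEAN separately shows in the ubifs_read_master() hunk just above: only a "corrupted but recoverable" scan result should trigger master-node recovery. Condensed into a sketch:

/*
 * Condensed sketch of the flow above: -EUCLEAN from scanning means the
 * master area is corrupted but recoverable; any other error is fatal.
 */
static int read_master_sketch(struct ubifs_info *c)
{
	int err = scan_for_master(c);

	if (err == -EUCLEAN)
		err = ubifs_recover_master_node(c);	/* try recovery */
	return err;					/* 0 or hard failure */
}
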
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 152a7b34a141..82009c74b6a3 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -670,9 +670,10 @@ static int kill_orphans(struct ubifs_info *c)
670 struct ubifs_scan_leb *sleb; 670 struct ubifs_scan_leb *sleb;
671 671
672 dbg_rcvry("LEB %d", lnum); 672 dbg_rcvry("LEB %d", lnum);
673 sleb = ubifs_scan(c, lnum, 0, c->sbuf); 673 sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
674 if (IS_ERR(sleb)) { 674 if (IS_ERR(sleb)) {
675 sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); 675 if (PTR_ERR(sleb) == -EUCLEAN)
676 sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
676 if (IS_ERR(sleb)) { 677 if (IS_ERR(sleb)) {
677 err = PTR_ERR(sleb); 678 err = PTR_ERR(sleb);
678 break; 679 break;
@@ -899,7 +900,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
899 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { 900 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
900 struct ubifs_scan_leb *sleb; 901 struct ubifs_scan_leb *sleb;
901 902
902 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); 903 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0);
903 if (IS_ERR(sleb)) { 904 if (IS_ERR(sleb)) {
904 err = PTR_ERR(sleb); 905 err = PTR_ERR(sleb);
905 break; 906 break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index e5f6cf8a1155..f94ddf7efba0 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -286,7 +286,7 @@ int ubifs_recover_master_node(struct ubifs_info *c)
286 mst = mst2; 286 mst = mst2;
287 } 287 }
288 288
289 dbg_rcvry("recovered master node from LEB %d", 289 ubifs_msg("recovered master node from LEB %d",
290 (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1)); 290 (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1));
291 291
292 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); 292 memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
@@ -790,7 +790,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
790 * We can only recover at the end of the log, so check that the 790 * We can only recover at the end of the log, so check that the
791 * next log LEB is empty or out of date. 791 * next log LEB is empty or out of date.
792 */ 792 */
793 sleb = ubifs_scan(c, next_lnum, 0, sbuf); 793 sleb = ubifs_scan(c, next_lnum, 0, sbuf, 0);
794 if (IS_ERR(sleb)) 794 if (IS_ERR(sleb))
795 return sleb; 795 return sleb;
796 if (sleb->nodes_cnt) { 796 if (sleb->nodes_cnt) {
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 2970500f32df..5c2d6d759a3e 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -506,7 +506,7 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
506 if (c->need_recovery) 506 if (c->need_recovery)
507 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD); 507 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
508 else 508 else
509 sleb = ubifs_scan(c, lnum, offs, c->sbuf); 509 sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
510 if (IS_ERR(sleb)) 510 if (IS_ERR(sleb))
511 return PTR_ERR(sleb); 511 return PTR_ERR(sleb);
512 512
@@ -836,8 +836,8 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
836 const struct ubifs_cs_node *node; 836 const struct ubifs_cs_node *node;
837 837
838 dbg_mnt("replay log LEB %d:%d", lnum, offs); 838 dbg_mnt("replay log LEB %d:%d", lnum, offs);
839 sleb = ubifs_scan(c, lnum, offs, sbuf); 839 sleb = ubifs_scan(c, lnum, offs, sbuf, c->need_recovery);
840 if (IS_ERR(sleb) ) { 840 if (IS_ERR(sleb)) {
841 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery) 841 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
842 return PTR_ERR(sleb); 842 return PTR_ERR(sleb);
843 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); 843 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 892ebfee4fe5..96c525384191 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -108,10 +108,9 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
108 108
109 /* Make the node pads to 8-byte boundary */ 109 /* Make the node pads to 8-byte boundary */
110 if ((node_len + pad_len) & 7) { 110 if ((node_len + pad_len) & 7) {
111 if (!quiet) { 111 if (!quiet)
112 dbg_err("bad padding length %d - %d", 112 dbg_err("bad padding length %d - %d",
113 offs, offs + node_len + pad_len); 113 offs, offs + node_len + pad_len);
114 }
115 return SCANNED_A_BAD_PAD_NODE; 114 return SCANNED_A_BAD_PAD_NODE;
116 } 115 }
117 116
@@ -253,15 +252,19 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
253 * @c: UBIFS file-system description object 252 * @c: UBIFS file-system description object
254 * @lnum: logical eraseblock number 253 * @lnum: logical eraseblock number
255 * @offs: offset to start at (usually zero) 254 * @offs: offset to start at (usually zero)
256 * @sbuf: scan buffer (must be c->leb_size) 255 * @sbuf: scan buffer (must be of @c->leb_size bytes in size)
256 * @quiet: print no messages
257 * 257 *
258 * This function scans LEB number @lnum and returns complete information about 258 * This function scans LEB number @lnum and returns complete information about
259 * its contents. Returns the scaned information in case of success and, 259 * its contents. Returns the scaned information in case of success and,
260 * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case 260 * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case
261 * of failure. 261 * of failure.
262 *
263 * If @quiet is non-zero, this function does not print large and scary
264 * error messages and flash dumps in case of errors.
262 */ 265 */
263struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, 266struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
264 int offs, void *sbuf) 267 int offs, void *sbuf, int quiet)
265{ 268{
266 void *buf = sbuf + offs; 269 void *buf = sbuf + offs;
267 int err, len = c->leb_size - offs; 270 int err, len = c->leb_size - offs;
@@ -280,7 +283,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
280 283
281 cond_resched(); 284 cond_resched();
282 285
283 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); 286 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
284 if (ret > 0) { 287 if (ret > 0) {
285 /* Padding bytes or a valid padding node */ 288 /* Padding bytes or a valid padding node */
286 offs += ret; 289 offs += ret;
@@ -320,7 +323,9 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
320 } 323 }
321 324
322 if (offs % c->min_io_size) { 325 if (offs % c->min_io_size) {
323 ubifs_err("empty space starts at non-aligned offset %d", offs); 326 if (!quiet)
327 ubifs_err("empty space starts at non-aligned offset %d",
328 offs);
 324 goto corrupted; 329 goto corrupted;
325 } 330 }
326 331
@@ -331,18 +336,25 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
331 break; 336 break;
332 for (; len; offs++, buf++, len--) 337 for (; len; offs++, buf++, len--)
333 if (*(uint8_t *)buf != 0xff) { 338 if (*(uint8_t *)buf != 0xff) {
334 ubifs_err("corrupt empty space at LEB %d:%d", 339 if (!quiet)
335 lnum, offs); 340 ubifs_err("corrupt empty space at LEB %d:%d",
341 lnum, offs);
336 goto corrupted; 342 goto corrupted;
337 } 343 }
338 344
339 return sleb; 345 return sleb;
340 346
341corrupted: 347corrupted:
342 ubifs_scanned_corruption(c, lnum, offs, buf); 348 if (!quiet) {
349 ubifs_scanned_corruption(c, lnum, offs, buf);
350 ubifs_err("LEB %d scanning failed", lnum);
351 }
343 err = -EUCLEAN; 352 err = -EUCLEAN;
353 ubifs_scan_destroy(sleb);
354 return ERR_PTR(err);
355
344error: 356error:
345 ubifs_err("LEB %d scanning failed", lnum); 357 ubifs_err("LEB %d scanning failed, error %d", lnum, err);
346 ubifs_scan_destroy(sleb); 358 ubifs_scan_destroy(sleb);
347 return ERR_PTR(err); 359 return ERR_PTR(err);
348} 360}
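
The @quiet argument added here lets callers probe a LEB without the large error dumps, deferring those to the recovery path, as the kill_orphans() hunk earlier in this series does. The pattern, condensed:

/*
 * Condensed sketch of the new scan/recover pattern: probe a LEB quietly
 * first, and only fall back to recovery (with full error output) when
 * the scan reports -EUCLEAN.
 */
static struct ubifs_scan_leb *scan_or_recover(struct ubifs_info *c,
					      int lnum, void *sbuf)
{
	struct ubifs_scan_leb *sleb;

	sleb = ubifs_scan(c, lnum, 0, sbuf, 1);		/* quiet scan */
	if (IS_ERR(sleb) && PTR_ERR(sleb) == -EUCLEAN)
		sleb = ubifs_recover_leb(c, lnum, 0, sbuf, 0);
	return sleb;
}
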
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 51763aa8f4de..333e181ee987 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,7 +36,6 @@
36#include <linux/mount.h> 36#include <linux/mount.h>
37#include <linux/math64.h> 37#include <linux/math64.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/smp_lock.h>
40#include "ubifs.h" 39#include "ubifs.h"
41 40
42/* 41/*
@@ -318,6 +317,8 @@ static int ubifs_write_inode(struct inode *inode, int wait)
318 if (err) 317 if (err)
319 ubifs_err("can't write inode %lu, error %d", 318 ubifs_err("can't write inode %lu, error %d",
320 inode->i_ino, err); 319 inode->i_ino, err);
320 else
321 err = dbg_check_inode_size(c, inode, ui->ui_size);
321 } 322 }
322 323
323 ui->dirty = 0; 324 ui->dirty = 0;
@@ -448,17 +449,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
448 return 0; 449 return 0;
449 450
450 /* 451 /*
451 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
452 * pages, so synchronize them first, then commit the journal. Strictly
453 * speaking, it is not necessary to commit the journal here,
454 * synchronizing write-buffers would be enough. But committing makes
455 * UBIFS free space predictions much more accurate, so we want to let
456 * the user be able to get more accurate results of 'statfs()' after
457 * they synchronize the file system.
458 */
459 sync_inodes_sb(sb);
460
461 /*
462 * Synchronize write buffers, because 'ubifs_run_commit()' does not 452 * Synchronize write buffers, because 'ubifs_run_commit()' does not
463 * do this if it waits for an already running commit. 453 * do this if it waits for an already running commit.
464 */ 454 */
@@ -468,6 +458,13 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
468 return err; 458 return err;
469 } 459 }
470 460
461 /*
462 * Strictly speaking, it is not necessary to commit the journal here,
463 * synchronizing write-buffers would be enough. But committing makes
464 * UBIFS free space predictions much more accurate, so we want to let
465 * the user be able to get more accurate results of 'statfs()' after
466 * they synchronize the file system.
467 */
471 err = ubifs_run_commit(c); 468 err = ubifs_run_commit(c);
472 if (err) 469 if (err)
473 return err; 470 return err;
@@ -1720,8 +1717,6 @@ static void ubifs_put_super(struct super_block *sb)
1720 ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, 1717 ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
1721 c->vi.vol_id); 1718 c->vi.vol_id);
1722 1719
1723 lock_kernel();
1724
1725 /* 1720 /*
1726 * The following asserts are only valid if there has not been a failure 1721 * The following asserts are only valid if there has not been a failure
1727 * of the media. For example, there will be dirty inodes if we failed 1722 * of the media. For example, there will be dirty inodes if we failed
@@ -1786,8 +1781,6 @@ static void ubifs_put_super(struct super_block *sb)
1786 ubi_close_volume(c->ubi); 1781 ubi_close_volume(c->ubi);
1787 mutex_unlock(&c->umount_mutex); 1782 mutex_unlock(&c->umount_mutex);
1788 kfree(c); 1783 kfree(c);
1789
1790 unlock_kernel();
1791} 1784}
1792 1785
1793static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) 1786static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
@@ -1803,22 +1796,17 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1803 return err; 1796 return err;
1804 } 1797 }
1805 1798
1806 lock_kernel();
1807 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 1799 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
1808 if (c->ro_media) { 1800 if (c->ro_media) {
1809 ubifs_msg("cannot re-mount due to prior errors"); 1801 ubifs_msg("cannot re-mount due to prior errors");
1810 unlock_kernel();
1811 return -EROFS; 1802 return -EROFS;
1812 } 1803 }
1813 err = ubifs_remount_rw(c); 1804 err = ubifs_remount_rw(c);
1814 if (err) { 1805 if (err)
1815 unlock_kernel();
1816 return err; 1806 return err;
1817 }
1818 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { 1807 } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
1819 if (c->ro_media) { 1808 if (c->ro_media) {
1820 ubifs_msg("cannot re-mount due to prior errors"); 1809 ubifs_msg("cannot re-mount due to prior errors");
1821 unlock_kernel();
1822 return -EROFS; 1810 return -EROFS;
1823 } 1811 }
1824 ubifs_remount_ro(c); 1812 ubifs_remount_ro(c);
@@ -1833,7 +1821,6 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1833 } 1821 }
1834 1822
1835 ubifs_assert(c->lst.taken_empty_lebs > 0); 1823 ubifs_assert(c->lst.taken_empty_lebs > 0);
1836 unlock_kernel();
1837 return 0; 1824 return 0;
1838} 1825}
1839 1826
@@ -1980,6 +1967,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1980 if (err) 1967 if (err)
1981 goto out_bdi; 1968 goto out_bdi;
1982 1969
1970 sb->s_bdi = &c->bdi;
1983 sb->s_fs_info = c; 1971 sb->s_fs_info = c;
1984 sb->s_magic = UBIFS_SUPER_MAGIC; 1972 sb->s_magic = UBIFS_SUPER_MAGIC;
1985 sb->s_blocksize = UBIFS_BLOCK_SIZE; 1973 sb->s_blocksize = UBIFS_BLOCK_SIZE;
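
The super.c changes drop sync_inodes_sb() and the big kernel lock and register the per-filesystem backing device via sb->s_bdi; the VFS now synchronizes dirty inodes before calling ->sync_fs(), so only the write-buffers and a commit remain. A condensed sketch of the resulting order (read-only and other early-exit checks omitted):

/*
 * Condensed sketch of ubifs_sync_fs() after this change: pages were
 * already written by the VFS, so sync the per-journal-head write-buffers
 * and then commit.
 */
static int sync_fs_sketch(struct super_block *sb, int wait)
{
	struct ubifs_info *c = sb->s_fs_info;
	int i, err;

	if (!wait)
		return 0;	/* the second, waiting call does the work */

	for (i = 0; i < c->jhead_cnt; i++) {
		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
		if (err)
			return err;
	}

	/* commit so statfs() free-space predictions stay accurate */
	return ubifs_run_commit(c);
}
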
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f249f7b0d656..e5b1a7d00fa0 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1159,8 +1159,8 @@ static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
1159 * o exact match, i.e. the found zero-level znode contains key @key, then %1 1159 * o exact match, i.e. the found zero-level znode contains key @key, then %1
1160 * is returned and slot number of the matched branch is stored in @n; 1160 * is returned and slot number of the matched branch is stored in @n;
1161 * o not exact match, which means that zero-level znode does not contain 1161 * o not exact match, which means that zero-level znode does not contain
1162 * @key, then %0 is returned and slot number of the closed branch is stored 1162 * @key, then %0 is returned and slot number of the closest branch is stored
1163 * in @n; 1163 * in @n;
1164 * o @key is so small that it is even less than the lowest key of the 1164 * o @key is so small that it is even less than the lowest key of the
1165 * leftmost zero-level node, then %0 is returned and %0 is stored in @n. 1165 * leftmost zero-level node, then %0 is returned and %0 is stored in @n.
1166 * 1166 *
@@ -1433,7 +1433,7 @@ static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
1433 * @lnum: LEB number is returned here 1433 * @lnum: LEB number is returned here
1434 * @offs: offset is returned here 1434 * @offs: offset is returned here
1435 * 1435 *
1436 * This function look up and reads node with key @key. The caller has to make 1436 * This function looks up and reads node with key @key. The caller has to make
1437 * sure the @node buffer is large enough to fit the node. Returns zero in case 1437 * sure the @node buffer is large enough to fit the node. Returns zero in case
1438 * of success, %-ENOENT if the node was not found, and a negative error code in 1438 * of success, %-ENOENT if the node was not found, and a negative error code in
1439 * case of failure. The node location can be returned in @lnum and @offs. 1439 * case of failure. The node location can be returned in @lnum and @offs.
@@ -3268,3 +3268,73 @@ out_unlock:
3268 mutex_unlock(&c->tnc_mutex); 3268 mutex_unlock(&c->tnc_mutex);
3269 return err; 3269 return err;
3270} 3270}
3271
3272#ifdef CONFIG_UBIFS_FS_DEBUG
3273
3274/**
3275 * dbg_check_inode_size - check if inode size is correct.
3276 * @c: UBIFS file-system description object
3277 * @inum: inode number
3278 * @size: inode size
3279 *
3280 * This function makes sure that the inode size (@size) is correct and it does
3281 * not have any pages beyond @size. Returns zero if the inode is OK, %-EINVAL
3282 * if it has a data page beyond @size, and other negative error code in case of
3283 * other errors.
3284 */
3285int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
3286 loff_t size)
3287{
3288 int err, n;
3289 union ubifs_key from_key, to_key, *key;
3290 struct ubifs_znode *znode;
3291 unsigned int block;
3292
3293 if (!S_ISREG(inode->i_mode))
3294 return 0;
3295 if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
3296 return 0;
3297
3298 block = (size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
3299 data_key_init(c, &from_key, inode->i_ino, block);
3300 highest_data_key(c, &to_key, inode->i_ino);
3301
3302 mutex_lock(&c->tnc_mutex);
3303 err = ubifs_lookup_level0(c, &from_key, &znode, &n);
3304 if (err < 0)
3305 goto out_unlock;
3306
3307 if (err) {
3308 err = -EINVAL;
3309 key = &from_key;
3310 goto out_dump;
3311 }
3312
3313 err = tnc_next(c, &znode, &n);
3314 if (err == -ENOENT) {
3315 err = 0;
3316 goto out_unlock;
3317 }
3318 if (err < 0)
3319 goto out_unlock;
3320
3321 ubifs_assert(err == 0);
3322 key = &znode->zbranch[n].key;
3323 if (!key_in_range(c, key, &from_key, &to_key))
3324 goto out_unlock;
3325
3326out_dump:
3327 block = key_block(c, key);
3328 ubifs_err("inode %lu has size %lld, but there are data at offset %lld "
3329 "(data key %s)", (unsigned long)inode->i_ino, size,
3330 ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key));
3331 dbg_dump_inode(c, inode);
3332 dbg_dump_stack();
3333 err = -EINVAL;
3334
3335out_unlock:
3336 mutex_unlock(&c->tnc_mutex);
3337 return err;
3338}
3339
3340#endif /* CONFIG_UBIFS_FS_DEBUG */
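
dbg_check_inode_size() starts the range check at the first block lying wholly beyond @size. A quick userspace check of that rounding (UBIFS_BLOCK_SIZE is 4096, i.e. 1 << UBIFS_BLOCK_SHIFT):

/*
 * Userspace sketch of the block rounding used by dbg_check_inode_size():
 * the first data block that must not exist is the one covering offsets
 * >= size, i.e. size rounded up to the next block boundary.
 */
#include <stdio.h>

#define UBIFS_BLOCK_SHIFT 12
#define UBIFS_BLOCK_SIZE  (1 << UBIFS_BLOCK_SHIFT)	/* 4096 */

int main(void)
{
	long long size = 10000;	/* example inode size */
	unsigned int block =
		(size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;

	/* size 10000 occupies blocks 0..2; block 3 would be out of range */
	printf("first out-of-range block: %u\n", block);
	return 0;
}
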
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index fde8d127c768..53288e5d604e 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -245,7 +245,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
245 * it is more comprehensive and less efficient than is needed for this 245 * it is more comprehensive and less efficient than is needed for this
246 * purpose. 246 * purpose.
247 */ 247 */
248 sleb = ubifs_scan(c, lnum, 0, c->ileb_buf); 248 sleb = ubifs_scan(c, lnum, 0, c->ileb_buf, 0);
249 c->ileb_len = 0; 249 c->ileb_len = 0;
250 if (IS_ERR(sleb)) 250 if (IS_ERR(sleb))
251 return PTR_ERR(sleb); 251 return PTR_ERR(sleb);
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 3eee07e0c495..191ca7863fe7 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -135,6 +135,13 @@
135/* The key is always at the same position in all keyed nodes */ 135/* The key is always at the same position in all keyed nodes */
136#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key) 136#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key)
137 137
138/* Garbage collector journal head number */
139#define UBIFS_GC_HEAD 0
140/* Base journal head number */
141#define UBIFS_BASE_HEAD 1
142/* Data journal head number */
143#define UBIFS_DATA_HEAD 2
144
138/* 145/*
139 * LEB Properties Tree node types. 146 * LEB Properties Tree node types.
140 * 147 *
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a29349094422..b2d976366a46 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -105,12 +105,10 @@
105/* Number of non-data journal heads */ 105/* Number of non-data journal heads */
106#define NONDATA_JHEADS_CNT 2 106#define NONDATA_JHEADS_CNT 2
107 107
108/* Garbage collector head */ 108/* Shorter names for journal head numbers for internal usage */
109#define GCHD 0 109#define GCHD UBIFS_GC_HEAD
110/* Base journal head number */ 110#define BASEHD UBIFS_BASE_HEAD
111#define BASEHD 1 111#define DATAHD UBIFS_DATA_HEAD
112/* First "general purpose" journal head */
113#define DATAHD 2
114 112
115/* 'No change' value for 'ubifs_change_lp()' */ 113/* 'No change' value for 'ubifs_change_lp()' */
116#define LPROPS_NC 0x80000001 114#define LPROPS_NC 0x80000001
@@ -1451,7 +1449,7 @@ int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode);
1451 1449
1452/* scan.c */ 1450/* scan.c */
1453struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, 1451struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
1454 int offs, void *sbuf); 1452 int offs, void *sbuf, int quiet);
1455void ubifs_scan_destroy(struct ubifs_scan_leb *sleb); 1453void ubifs_scan_destroy(struct ubifs_scan_leb *sleb);
1456int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, 1454int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
1457 int offs, int quiet); 1455 int offs, int quiet);
@@ -1676,6 +1674,7 @@ const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c);
1676const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c); 1674const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c);
1677const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c); 1675const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c);
1678const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); 1676const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
1677int ubifs_calc_dark(const struct ubifs_info *c, int spc);
1679 1678
1680/* file.c */ 1679/* file.c */
1681int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); 1680int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index adafcf556531..195830f47569 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -78,9 +78,9 @@ enum {
78 SECURITY_XATTR, 78 SECURITY_XATTR,
79}; 79};
80 80
81static struct inode_operations none_inode_operations; 81static const struct inode_operations none_inode_operations;
82static struct address_space_operations none_address_operations; 82static const struct address_space_operations none_address_operations;
83static struct file_operations none_file_operations; 83static const struct file_operations none_file_operations;
84 84
85/** 85/**
86 * create_xattr - create an extended attribute. 86 * create_xattr - create an extended attribute.
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 52f3fc63571a..381854461b28 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -216,7 +216,6 @@ xfs_setfilesize(
216 if (ip->i_d.di_size < isize) { 216 if (ip->i_d.di_size < isize) {
217 ip->i_d.di_size = isize; 217 ip->i_d.di_size = isize;
218 ip->i_update_core = 1; 218 ip->i_update_core = 1;
219 ip->i_update_size = 1;
220 xfs_mark_inode_dirty_sync(ip); 219 xfs_mark_inode_dirty_sync(ip);
221 } 220 }
222 221
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 0542fd507649..988d8f87bc0f 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -172,12 +172,21 @@ xfs_file_release(
172 */ 172 */
173STATIC int 173STATIC int
174xfs_file_fsync( 174xfs_file_fsync(
175 struct file *filp, 175 struct file *file,
176 struct dentry *dentry, 176 struct dentry *dentry,
177 int datasync) 177 int datasync)
178{ 178{
179 xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); 179 struct inode *inode = dentry->d_inode;
180 return -xfs_fsync(XFS_I(dentry->d_inode)); 180 struct xfs_inode *ip = XFS_I(inode);
181 int error;
182
183 /* capture size updates in I/O completion before writing the inode. */
184 error = filemap_fdatawait(inode->i_mapping);
185 if (error)
186 return error;
187
188 xfs_iflags_clear(ip, XFS_ITRUNCATED);
189 return -xfs_fsync(ip);
181} 190}
182 191
183STATIC int 192STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 6c32f1d63d8c..da0159d99f82 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -43,7 +43,6 @@
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_itable.h" 44#include "xfs_itable.h"
45#include "xfs_rw.h" 45#include "xfs_rw.h"
46#include "xfs_acl.h"
47#include "xfs_attr.h" 46#include "xfs_attr.h"
48#include "xfs_buf_item.h" 47#include "xfs_buf_item.h"
49#include "xfs_utils.h" 48#include "xfs_utils.h"
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index fde63a3c4ecc..49e4a6aea73c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -812,19 +812,21 @@ write_retry:
812 812
813 /* Handle various SYNC-type writes */ 813 /* Handle various SYNC-type writes */
814 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { 814 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
815 loff_t end = pos + ret - 1;
815 int error2; 816 int error2;
816 817
817 xfs_iunlock(xip, iolock); 818 xfs_iunlock(xip, iolock);
818 if (need_i_mutex) 819 if (need_i_mutex)
819 mutex_unlock(&inode->i_mutex); 820 mutex_unlock(&inode->i_mutex);
820 error2 = filemap_write_and_wait_range(mapping, pos, 821
821 pos + ret - 1); 822 error2 = filemap_write_and_wait_range(mapping, pos, end);
822 if (!error) 823 if (!error)
823 error = error2; 824 error = error2;
824 if (need_i_mutex) 825 if (need_i_mutex)
825 mutex_lock(&inode->i_mutex); 826 mutex_lock(&inode->i_mutex);
826 xfs_ilock(xip, iolock); 827 xfs_ilock(xip, iolock);
827 error2 = xfs_write_sync_logforce(mp, xip); 828
829 error2 = xfs_fsync(xip);
828 if (!error) 830 if (!error)
829 error = error2; 831 error = error2;
830 } 832 }
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index cb6e2cca214f..9e41f91aa269 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -150,7 +150,7 @@ xfs_fs_set_xquota(
150 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq); 150 return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
151} 151}
152 152
153struct quotactl_ops xfs_quotactl_operations = { 153const struct quotactl_ops xfs_quotactl_operations = {
154 .quota_sync = xfs_fs_quota_sync, 154 .quota_sync = xfs_fs_quota_sync,
155 .get_xstate = xfs_fs_get_xstate, 155 .get_xstate = xfs_fs_get_xstate,
156 .set_xstate = xfs_fs_set_xstate, 156 .set_xstate = xfs_fs_set_xstate,
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index c3526d445f6a..76fdc5861932 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -20,16 +20,9 @@
20 20
21DEFINE_PER_CPU(struct xfsstats, xfsstats); 21DEFINE_PER_CPU(struct xfsstats, xfsstats);
22 22
23STATIC int 23static int xfs_stat_proc_show(struct seq_file *m, void *v)
24xfs_read_xfsstats(
25 char *buffer,
26 char **start,
27 off_t offset,
28 int count,
29 int *eof,
30 void *data)
31{ 24{
32 int c, i, j, len, val; 25 int c, i, j, val;
33 __uint64_t xs_xstrat_bytes = 0; 26 __uint64_t xs_xstrat_bytes = 0;
34 __uint64_t xs_write_bytes = 0; 27 __uint64_t xs_write_bytes = 0;
35 __uint64_t xs_read_bytes = 0; 28 __uint64_t xs_read_bytes = 0;
@@ -60,18 +53,18 @@ xfs_read_xfsstats(
60 }; 53 };
61 54
62 /* Loop over all stats groups */ 55 /* Loop over all stats groups */
63 for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) { 56 for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
64 len += sprintf(buffer + len, "%s", xstats[i].desc); 57 seq_printf(m, "%s", xstats[i].desc);
65 /* inner loop does each group */ 58 /* inner loop does each group */
66 while (j < xstats[i].endpoint) { 59 while (j < xstats[i].endpoint) {
67 val = 0; 60 val = 0;
68 /* sum over all cpus */ 61 /* sum over all cpus */
69 for_each_possible_cpu(c) 62 for_each_possible_cpu(c)
70 val += *(((__u32*)&per_cpu(xfsstats, c) + j)); 63 val += *(((__u32*)&per_cpu(xfsstats, c) + j));
71 len += sprintf(buffer + len, " %u", val); 64 seq_printf(m, " %u", val);
72 j++; 65 j++;
73 } 66 }
74 buffer[len++] = '\n'; 67 seq_putc(m, '\n');
75 } 68 }
76 /* extra precision counters */ 69 /* extra precision counters */
77 for_each_possible_cpu(i) { 70 for_each_possible_cpu(i) {
@@ -80,36 +73,38 @@ xfs_read_xfsstats(
80 xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; 73 xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
81 } 74 }
82 75
83 len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n", 76 seq_printf(m, "xpc %Lu %Lu %Lu\n",
84 xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); 77 xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
85 len += sprintf(buffer + len, "debug %u\n", 78 seq_printf(m, "debug %u\n",
86#if defined(DEBUG) 79#if defined(DEBUG)
87 1); 80 1);
88#else 81#else
89 0); 82 0);
90#endif 83#endif
84 return 0;
85}
91 86
92 if (offset >= len) { 87static int xfs_stat_proc_open(struct inode *inode, struct file *file)
93 *start = buffer; 88{
94 *eof = 1; 89 return single_open(file, xfs_stat_proc_show, NULL);
95 return 0;
96 }
97 *start = buffer + offset;
98 if ((len -= offset) > count)
99 return count;
100 *eof = 1;
101
102 return len;
103} 90}
104 91
92static const struct file_operations xfs_stat_proc_fops = {
93 .owner = THIS_MODULE,
94 .open = xfs_stat_proc_open,
95 .read = seq_read,
96 .llseek = seq_lseek,
97 .release = single_release,
98};
99
105int 100int
106xfs_init_procfs(void) 101xfs_init_procfs(void)
107{ 102{
108 if (!proc_mkdir("fs/xfs", NULL)) 103 if (!proc_mkdir("fs/xfs", NULL))
109 goto out; 104 goto out;
110 105
111 if (!create_proc_read_entry("fs/xfs/stat", 0, NULL, 106 if (!proc_create("fs/xfs/stat", 0, NULL,
112 xfs_read_xfsstats, NULL)) 107 &xfs_stat_proc_fops))
113 goto out_remove_entry; 108 goto out_remove_entry;
114 return 0; 109 return 0;
115 110
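
The conversion above replaces a hand-rolled read_proc handler, with its offset/count/eof bookkeeping, by the seq_file single_open() pattern. A self-contained sketch of that pattern for a hypothetical "demo_stats" file, against the same-era procfs API:

/*
 * Minimal sketch of the single_open() pattern used by the conversion
 * above: a show() callback prints the whole file via seq_printf(), and
 * read/llseek/release come from the seq_file library. The "demo" names
 * are illustrative.
 */
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "value %u\n", 42);
	return 0;
}

static int demo_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, demo_proc_show, NULL);
}

static const struct file_operations demo_proc_fops = {
	.owner   = THIS_MODULE,
	.open    = demo_proc_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init demo_init(void)
{
	if (!proc_create("demo_stats", 0, NULL, &demo_proc_fops))
		return -ENOMEM;
	return 0;
}

static void __exit demo_exit(void)
{
	remove_proc_entry("demo_stats", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
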
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a220d36f789b..bdd41c8c342f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -67,7 +67,7 @@
67#include <linux/freezer.h> 67#include <linux/freezer.h>
68#include <linux/parser.h> 68#include <linux/parser.h>
69 69
70static struct super_operations xfs_super_operations; 70static const struct super_operations xfs_super_operations;
71static kmem_zone_t *xfs_ioend_zone; 71static kmem_zone_t *xfs_ioend_zone;
72mempool_t *xfs_ioend_pool; 72mempool_t *xfs_ioend_pool;
73 73
@@ -579,15 +579,19 @@ xfs_showargs(
579 else if (mp->m_qflags & XFS_UQUOTA_ACCT) 579 else if (mp->m_qflags & XFS_UQUOTA_ACCT)
580 seq_puts(m, "," MNTOPT_UQUOTANOENF); 580 seq_puts(m, "," MNTOPT_UQUOTANOENF);
581 581
582 if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) 582 /* Either project or group quotas can be active, not both */
583 seq_puts(m, "," MNTOPT_PRJQUOTA); 583
584 else if (mp->m_qflags & XFS_PQUOTA_ACCT) 584 if (mp->m_qflags & XFS_PQUOTA_ACCT) {
585 seq_puts(m, "," MNTOPT_PQUOTANOENF); 585 if (mp->m_qflags & XFS_OQUOTA_ENFD)
586 586 seq_puts(m, "," MNTOPT_PRJQUOTA);
587 if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD)) 587 else
588 seq_puts(m, "," MNTOPT_GRPQUOTA); 588 seq_puts(m, "," MNTOPT_PQUOTANOENF);
589 else if (mp->m_qflags & XFS_GQUOTA_ACCT) 589 } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
590 seq_puts(m, "," MNTOPT_GQUOTANOENF); 590 if (mp->m_qflags & XFS_OQUOTA_ENFD)
591 seq_puts(m, "," MNTOPT_GRPQUOTA);
592 else
593 seq_puts(m, "," MNTOPT_GQUOTANOENF);
594 }
591 595
592 if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) 596 if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
593 seq_puts(m, "," MNTOPT_NOQUOTA); 597 seq_puts(m, "," MNTOPT_NOQUOTA);
@@ -687,7 +691,7 @@ xfs_barrier_test(
687 return error; 691 return error;
688} 692}
689 693
690void 694STATIC void
691xfs_mountfs_check_barriers(xfs_mount_t *mp) 695xfs_mountfs_check_barriers(xfs_mount_t *mp)
692{ 696{
693 int error; 697 int error;
@@ -1532,7 +1536,7 @@ xfs_fs_get_sb(
1532 mnt); 1536 mnt);
1533} 1537}
1534 1538
1535static struct super_operations xfs_super_operations = { 1539static const struct super_operations xfs_super_operations = {
1536 .alloc_inode = xfs_fs_alloc_inode, 1540 .alloc_inode = xfs_fs_alloc_inode,
1537 .destroy_inode = xfs_fs_destroy_inode, 1541 .destroy_inode = xfs_fs_destroy_inode,
1538 .write_inode = xfs_fs_write_inode, 1542 .write_inode = xfs_fs_write_inode,
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 5a2ea3a21781..18175ebd58ed 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -93,7 +93,7 @@ extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
93 93
94extern const struct export_operations xfs_export_operations; 94extern const struct export_operations xfs_export_operations;
95extern struct xattr_handler *xfs_xattr_handlers[]; 95extern struct xattr_handler *xfs_xattr_handlers[];
96extern struct quotactl_ops xfs_quotactl_operations; 96extern const struct quotactl_ops xfs_quotactl_operations;
97 97
98#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 98#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
99 99
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 98ef624d9baf..320be6aea492 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -749,21 +749,6 @@ __xfs_inode_clear_reclaim_tag(
749 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 749 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
750} 750}
751 751
752void
753xfs_inode_clear_reclaim_tag(
754 xfs_inode_t *ip)
755{
756 xfs_mount_t *mp = ip->i_mount;
757 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
758
759 read_lock(&pag->pag_ici_lock);
760 spin_lock(&ip->i_flags_lock);
761 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
762 spin_unlock(&ip->i_flags_lock);
763 read_unlock(&pag->pag_ici_lock);
764 xfs_put_perag(mp, pag);
765}
766
767STATIC int 752STATIC int
768xfs_reclaim_inode_now( 753xfs_reclaim_inode_now(
769 struct xfs_inode *ip, 754 struct xfs_inode *ip,
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 59120602588a..27920eb7a820 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -49,7 +49,6 @@ int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
49 49
50void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 50void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
51void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); 51void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
52void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
53void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 52void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
54 struct xfs_inode *ip); 53 struct xfs_inode *ip);
55 54
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 916c0ffb6083..c5bc67c4e3bb 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -26,7 +26,6 @@ STATIC int
26xfs_stats_clear_proc_handler( 26xfs_stats_clear_proc_handler(
27 ctl_table *ctl, 27 ctl_table *ctl,
28 int write, 28 int write,
29 struct file *filp,
30 void __user *buffer, 29 void __user *buffer,
31 size_t *lenp, 30 size_t *lenp,
32 loff_t *ppos) 31 loff_t *ppos)
@@ -34,7 +33,7 @@ xfs_stats_clear_proc_handler(
34 int c, ret, *valp = ctl->data; 33 int c, ret, *valp = ctl->data;
35 __uint32_t vn_active; 34 __uint32_t vn_active;
36 35
37 ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos); 36 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
38 37
39 if (!ret && write && *valp) { 38 if (!ret && write && *valp) {
40 printk("XFS Clearing xfsstats\n"); 39 printk("XFS Clearing xfsstats\n");
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 21b08c0396a1..83e7ea3e25fa 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -48,50 +48,34 @@
48 48
49struct xqmstats xqmstats; 49struct xqmstats xqmstats;
50 50
51STATIC int 51static int xqm_proc_show(struct seq_file *m, void *v)
52xfs_qm_read_xfsquota(
53 char *buffer,
54 char **start,
55 off_t offset,
56 int count,
57 int *eof,
58 void *data)
59{ 52{
60 int len;
61
62 /* maximum; incore; ratio free to inuse; freelist */ 53 /* maximum; incore; ratio free to inuse; freelist */
63 len = sprintf(buffer, "%d\t%d\t%d\t%u\n", 54 seq_printf(m, "%d\t%d\t%d\t%u\n",
64 ndquot, 55 ndquot,
65 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, 56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
66 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, 57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
67 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); 58 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
68 59 return 0;
69 if (offset >= len) {
70 *start = buffer;
71 *eof = 1;
72 return 0;
73 }
74 *start = buffer + offset;
75 if ((len -= offset) > count)
76 return count;
77 *eof = 1;
78
79 return len;
80} 60}
81 61
82STATIC int 62static int xqm_proc_open(struct inode *inode, struct file *file)
83xfs_qm_read_stats(
84 char *buffer,
85 char **start,
86 off_t offset,
87 int count,
88 int *eof,
89 void *data)
90{ 63{
91 int len; 64 return single_open(file, xqm_proc_show, NULL);
65}
66
67static const struct file_operations xqm_proc_fops = {
68 .owner = THIS_MODULE,
69 .open = xqm_proc_open,
70 .read = seq_read,
71 .llseek = seq_lseek,
72 .release = single_release,
73};
92 74
75static int xqmstat_proc_show(struct seq_file *m, void *v)
76{
93 /* quota performance statistics */ 77 /* quota performance statistics */
94 len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n", 78 seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
95 xqmstats.xs_qm_dqreclaims, 79 xqmstats.xs_qm_dqreclaims,
96 xqmstats.xs_qm_dqreclaim_misses, 80 xqmstats.xs_qm_dqreclaim_misses,
97 xqmstats.xs_qm_dquot_dups, 81 xqmstats.xs_qm_dquot_dups,
@@ -100,25 +84,27 @@ xfs_qm_read_stats(
100 xqmstats.xs_qm_dqwants, 84 xqmstats.xs_qm_dqwants,
101 xqmstats.xs_qm_dqshake_reclaims, 85 xqmstats.xs_qm_dqshake_reclaims,
102 xqmstats.xs_qm_dqinact_reclaims); 86 xqmstats.xs_qm_dqinact_reclaims);
87 return 0;
88}
103 89
104 if (offset >= len) { 90static int xqmstat_proc_open(struct inode *inode, struct file *file)
105 *start = buffer; 91{
106 *eof = 1; 92 return single_open(file, xqmstat_proc_show, NULL);
107 return 0;
108 }
109 *start = buffer + offset;
110 if ((len -= offset) > count)
111 return count;
112 *eof = 1;
113
114 return len;
115} 93}
116 94
95static const struct file_operations xqmstat_proc_fops = {
96 .owner = THIS_MODULE,
97 .open = xqmstat_proc_open,
98 .read = seq_read,
99 .llseek = seq_lseek,
100 .release = single_release,
101};
102
117void 103void
118xfs_qm_init_procfs(void) 104xfs_qm_init_procfs(void)
119{ 105{
120 create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL); 106 proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
121 create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL); 107 proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
122} 108}
123 109
124void 110void
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f24b50b68d03..a5d54bf4931b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -198,6 +198,15 @@ typedef struct xfs_perag
198 xfs_agino_t pagi_count; /* number of allocated inodes */ 198 xfs_agino_t pagi_count; /* number of allocated inodes */
199 int pagb_count; /* pagb slots in use */ 199 int pagb_count; /* pagb slots in use */
200 xfs_perag_busy_t *pagb_list; /* unstable blocks */ 200 xfs_perag_busy_t *pagb_list; /* unstable blocks */
201
202 /*
203 * Inode allocation search lookup optimisation.
204 * If the pagino matches, the search for new inodes
205 * doesn't need to search the near ones again straight away
206 */
207 xfs_agino_t pagl_pagino;
208 xfs_agino_t pagl_leftrec;
209 xfs_agino_t pagl_rightrec;
201#ifdef __KERNEL__ 210#ifdef __KERNEL__
202 spinlock_t pagb_lock; /* lock for pagb_list */ 211 spinlock_t pagb_lock; /* lock for pagb_list */
203 212
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8ee5b5a76a2a..8971fb09d387 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3713,7 +3713,7 @@ done:
3713 * entry (null if none). Else, *lastxp will be set to the index 3713 * entry (null if none). Else, *lastxp will be set to the index
3714 * of the found entry; *gotp will contain the entry. 3714 * of the found entry; *gotp will contain the entry.
3715 */ 3715 */
3716xfs_bmbt_rec_host_t * /* pointer to found extent entry */ 3716STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
3717xfs_bmap_search_multi_extents( 3717xfs_bmap_search_multi_extents(
3718 xfs_ifork_t *ifp, /* inode fork pointer */ 3718 xfs_ifork_t *ifp, /* inode fork pointer */
3719 xfs_fileoff_t bno, /* block number searched for */ 3719 xfs_fileoff_t bno, /* block number searched for */
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 1b8ff9256bd0..56f62d2edc35 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -392,17 +392,6 @@ xfs_bmap_count_blocks(
392 int whichfork, 392 int whichfork,
393 int *count); 393 int *count);
394 394
395/*
396 * Search the extent records for the entry containing block bno.
397 * If bno lies in a hole, point to the next entry. If bno lies
398 * past eof, *eofp will be set, and *prevp will contain the last
399 * entry (null if none). Else, *lastxp will be set to the index
400 * of the found entry; *gotp will contain the entry.
401 */
402xfs_bmbt_rec_host_t *
403xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *,
404 xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *);
405
406#endif /* __KERNEL__ */ 395#endif /* __KERNEL__ */
407 396
408#endif /* __XFS_BMAP_H__ */ 397#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 5c1ade06578e..eb7b702d0690 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -202,16 +202,6 @@ xfs_bmbt_get_state(
202 ext_flag); 202 ext_flag);
203} 203}
204 204
205/* Endian flipping versions of the bmbt extraction functions */
206void
207xfs_bmbt_disk_get_all(
208 xfs_bmbt_rec_t *r,
209 xfs_bmbt_irec_t *s)
210{
211 __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
212 get_unaligned_be64(&r->l1), s);
213}
214
215/* 205/*
216 * Extract the blockcount field from an on disk bmap extent record. 206 * Extract the blockcount field from an on disk bmap extent record.
217 */ 207 */
@@ -816,6 +806,16 @@ xfs_bmbt_trace_key(
816 *l1 = 0; 806 *l1 = 0;
817} 807}
818 808
809/* Endian flipping versions of the bmbt extraction functions */
810STATIC void
811xfs_bmbt_disk_get_all(
812 xfs_bmbt_rec_t *r,
813 xfs_bmbt_irec_t *s)
814{
815 __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
816 get_unaligned_be64(&r->l1), s);
817}
818
819STATIC void 819STATIC void
820xfs_bmbt_trace_record( 820xfs_bmbt_trace_record(
821 struct xfs_btree_cur *cur, 821 struct xfs_btree_cur *cur,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e8df007615e..5549d495947f 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -220,7 +220,6 @@ extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
220extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); 220extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
221extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); 221extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
222 222
223extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
224extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); 223extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
225extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); 224extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
226 225
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 26717388acf5..52b5f14d0c32 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -646,46 +646,6 @@ xfs_btree_read_bufl(
646} 646}
647 647
648/* 648/*
649 * Get a buffer for the block, return it read in.
650 * Short-form addressing.
651 */
652int /* error */
653xfs_btree_read_bufs(
654 xfs_mount_t *mp, /* file system mount point */
655 xfs_trans_t *tp, /* transaction pointer */
656 xfs_agnumber_t agno, /* allocation group number */
657 xfs_agblock_t agbno, /* allocation group block number */
658 uint lock, /* lock flags for read_buf */
659 xfs_buf_t **bpp, /* buffer for agno/agbno */
660 int refval) /* ref count value for buffer */
661{
662 xfs_buf_t *bp; /* return value */
663 xfs_daddr_t d; /* real disk block address */
664 int error;
665
666 ASSERT(agno != NULLAGNUMBER);
667 ASSERT(agbno != NULLAGBLOCK);
668 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
669 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
670 mp->m_bsize, lock, &bp))) {
671 return error;
672 }
673 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
674 if (bp != NULL) {
675 switch (refval) {
676 case XFS_ALLOC_BTREE_REF:
677 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
678 break;
679 case XFS_INO_BTREE_REF:
680 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
681 break;
682 }
683 }
684 *bpp = bp;
685 return 0;
686}
687
688/*
689 * Read-ahead the block, don't wait for it, don't return a buffer. 649 * Read-ahead the block, don't wait for it, don't return a buffer.
690 * Long-form addressing. 650 * Long-form addressing.
691 */ 651 */
@@ -2951,7 +2911,7 @@ error0:
2951 * inode we have to copy the single block it was pointing to into the 2911 * inode we have to copy the single block it was pointing to into the
2952 * inode. 2912 * inode.
2953 */ 2913 */
2954int 2914STATIC int
2955xfs_btree_kill_iroot( 2915xfs_btree_kill_iroot(
2956 struct xfs_btree_cur *cur) 2916 struct xfs_btree_cur *cur)
2957{ 2917{
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 4f852b735b96..7fa07062bdda 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -379,20 +379,6 @@ xfs_btree_read_bufl(
379 int refval);/* ref count value for buffer */ 379 int refval);/* ref count value for buffer */
380 380
381/* 381/*
382 * Get a buffer for the block, return it read in.
383 * Short-form addressing.
384 */
385int /* error */
386xfs_btree_read_bufs(
387 struct xfs_mount *mp, /* file system mount point */
388 struct xfs_trans *tp, /* transaction pointer */
389 xfs_agnumber_t agno, /* allocation group number */
390 xfs_agblock_t agbno, /* allocation group block number */
391 uint lock, /* lock flags for read_buf */
392 struct xfs_buf **bpp, /* buffer for agno/agbno */
393 int refval);/* ref count value for buffer */
394
395/*
396 * Read-ahead the block, don't wait for it, don't return a buffer. 382 * Read-ahead the block, don't wait for it, don't return a buffer.
397 * Long-form addressing. 383 * Long-form addressing.
398 */ 384 */
@@ -432,7 +418,6 @@ int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
432int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); 418int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
433int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); 419int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
434int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); 420int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
435int xfs_btree_kill_iroot(struct xfs_btree_cur *);
436int xfs_btree_insert(struct xfs_btree_cur *, int *); 421int xfs_btree_insert(struct xfs_btree_cur *, int *);
437int xfs_btree_delete(struct xfs_btree_cur *, int *); 422int xfs_btree_delete(struct xfs_btree_cur *, int *);
438int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); 423int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c4ea51b55dce..f52ac276277e 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -117,7 +117,7 @@ struct getbmapx {
117#define BMV_IF_VALID \ 117#define BMV_IF_VALID \
118 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC) 118 (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
119 119
120/* bmv_oflags values - returned for for each non-header segment */ 120/* bmv_oflags values - returned for each non-header segment */
121#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ 121#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
122#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */ 122#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
123#define BMV_OF_LAST 0x4 /* segment is the last in the file */ 123#define BMV_OF_LAST 0x4 /* segment is the last in the file */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 3120a3a5e20f..ab64f3efb43b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -57,75 +57,35 @@ xfs_ialloc_cluster_alignment(
57} 57}
58 58
59/* 59/*
60 * Lookup the record equal to ino in the btree given by cur. 60 * Lookup a record by ino in the btree given by cur.
61 */
62STATIC int /* error */
63xfs_inobt_lookup_eq(
64 struct xfs_btree_cur *cur, /* btree cursor */
65 xfs_agino_t ino, /* starting inode of chunk */
66 __int32_t fcnt, /* free inode count */
67 xfs_inofree_t free, /* free inode mask */
68 int *stat) /* success/failure */
69{
70 cur->bc_rec.i.ir_startino = ino;
71 cur->bc_rec.i.ir_freecount = fcnt;
72 cur->bc_rec.i.ir_free = free;
73 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
74}
75
76/*
77 * Lookup the first record greater than or equal to ino
78 * in the btree given by cur.
79 */ 61 */
80int /* error */ 62int /* error */
81xfs_inobt_lookup_ge( 63xfs_inobt_lookup(
82 struct xfs_btree_cur *cur, /* btree cursor */ 64 struct xfs_btree_cur *cur, /* btree cursor */
83 xfs_agino_t ino, /* starting inode of chunk */ 65 xfs_agino_t ino, /* starting inode of chunk */
84 __int32_t fcnt, /* free inode count */ 66 xfs_lookup_t dir, /* <=, >=, == */
85 xfs_inofree_t free, /* free inode mask */
86 int *stat) /* success/failure */ 67 int *stat) /* success/failure */
87{ 68{
88 cur->bc_rec.i.ir_startino = ino; 69 cur->bc_rec.i.ir_startino = ino;
89 cur->bc_rec.i.ir_freecount = fcnt; 70 cur->bc_rec.i.ir_freecount = 0;
90 cur->bc_rec.i.ir_free = free; 71 cur->bc_rec.i.ir_free = 0;
91 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); 72 return xfs_btree_lookup(cur, dir, stat);
92} 73}
93 74
94/* 75/*
95 * Lookup the first record less than or equal to ino 76 * Update the record referred to by cur to the value given.
96 * in the btree given by cur.
97 */
98int /* error */
99xfs_inobt_lookup_le(
100 struct xfs_btree_cur *cur, /* btree cursor */
101 xfs_agino_t ino, /* starting inode of chunk */
102 __int32_t fcnt, /* free inode count */
103 xfs_inofree_t free, /* free inode mask */
104 int *stat) /* success/failure */
105{
106 cur->bc_rec.i.ir_startino = ino;
107 cur->bc_rec.i.ir_freecount = fcnt;
108 cur->bc_rec.i.ir_free = free;
109 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
110}
111
112/*
113 * Update the record referred to by cur to the value given
114 * by [ino, fcnt, free].
115 * This either works (return 0) or gets an EFSCORRUPTED error. 77 * This either works (return 0) or gets an EFSCORRUPTED error.
116 */ 78 */
117STATIC int /* error */ 79STATIC int /* error */
118xfs_inobt_update( 80xfs_inobt_update(
119 struct xfs_btree_cur *cur, /* btree cursor */ 81 struct xfs_btree_cur *cur, /* btree cursor */
120 xfs_agino_t ino, /* starting inode of chunk */ 82 xfs_inobt_rec_incore_t *irec) /* btree record */
121 __int32_t fcnt, /* free inode count */
122 xfs_inofree_t free) /* free inode mask */
123{ 83{
124 union xfs_btree_rec rec; 84 union xfs_btree_rec rec;
125 85
126 rec.inobt.ir_startino = cpu_to_be32(ino); 86 rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
127 rec.inobt.ir_freecount = cpu_to_be32(fcnt); 87 rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
128 rec.inobt.ir_free = cpu_to_be64(free); 88 rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
129 return xfs_btree_update(cur, &rec); 89 return xfs_btree_update(cur, &rec);
130} 90}
131 91
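
The three direction-specific lookup helpers collapse into a single xfs_inobt_lookup() that takes the search direction as an xfs_lookup_t. As a rough user-space model of the XFS_LOOKUP_LE/GE/EQ semantics over a sorted key array (names, types, and scaffolding here are illustrative, not kernel code):

    /*
     * Model of directional btree-record lookup on a sorted array.
     * LOOKUP_LE finds the last key <= target, LOOKUP_GE the first
     * key >= target, LOOKUP_EQ an exact match.  Illustrative only.
     */
    #include <stdio.h>

    typedef enum { LOOKUP_LE, LOOKUP_GE, LOOKUP_EQ } lookup_t;

    /* Find a record index for target; *stat = 1 on success, 0 on miss. */
    static int lookup(const unsigned *keys, int n, unsigned target,
                      lookup_t dir, int *stat)
    {
        int lo = 0, hi = n;

        while (lo < hi) {            /* binary search: first key >= target */
            int mid = lo + (hi - lo) / 2;
            if (keys[mid] < target)
                lo = mid + 1;
            else
                hi = mid;
        }

        switch (dir) {
        case LOOKUP_EQ:              /* exact match only */
            *stat = lo < n && keys[lo] == target;
            return *stat ? lo : -1;
        case LOOKUP_GE:              /* first record at or after target */
            *stat = lo < n;
            return *stat ? lo : -1;
        case LOOKUP_LE:              /* last record at or before target */
            if (lo < n && keys[lo] == target) {
                *stat = 1;
                return lo;
            }
            *stat = lo > 0;
            return *stat ? lo - 1 : -1;
        }
        *stat = 0;
        return -1;
    }

    int main(void)
    {
        unsigned startinos[] = { 0, 64, 128, 192 };  /* chunk start inodes */
        int stat;
        int i = lookup(startinos, 4, 100, LOOKUP_LE, &stat);

        if (stat)
            printf("LE(100) -> chunk starting at %u\n", startinos[i]);
        return 0;
    }

The kernel helper above just primes cur->bc_rec.i with the target startino and defers the direction handling to xfs_btree_lookup(), so each of the three old helpers reduces to one call with a different xfs_lookup_t.
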
@@ -135,9 +95,7 @@ xfs_inobt_update(
135int /* error */ 95int /* error */
136xfs_inobt_get_rec( 96xfs_inobt_get_rec(
137 struct xfs_btree_cur *cur, /* btree cursor */ 97 struct xfs_btree_cur *cur, /* btree cursor */
138 xfs_agino_t *ino, /* output: starting inode of chunk */ 98 xfs_inobt_rec_incore_t *irec, /* btree record */
139 __int32_t *fcnt, /* output: number of free inodes */
140 xfs_inofree_t *free, /* output: free inode mask */
141 int *stat) /* output: success/failure */ 99 int *stat) /* output: success/failure */
142{ 100{
143 union xfs_btree_rec *rec; 101 union xfs_btree_rec *rec;
@@ -145,14 +103,136 @@ xfs_inobt_get_rec(
145 103
146 error = xfs_btree_get_rec(cur, &rec, stat); 104 error = xfs_btree_get_rec(cur, &rec, stat);
147 if (!error && *stat == 1) { 105 if (!error && *stat == 1) {
148 *ino = be32_to_cpu(rec->inobt.ir_startino); 106 irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
149 *fcnt = be32_to_cpu(rec->inobt.ir_freecount); 107 irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
150 *free = be64_to_cpu(rec->inobt.ir_free); 108 irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
151 } 109 }
152 return error; 110 return error;
153} 111}
154 112
155/* 113/*
114 * Verify that the number of free inodes in the AGI is correct.
115 */
116#ifdef DEBUG
117STATIC int
118xfs_check_agi_freecount(
119 struct xfs_btree_cur *cur,
120 struct xfs_agi *agi)
121{
122 if (cur->bc_nlevels == 1) {
123 xfs_inobt_rec_incore_t rec;
124 int freecount = 0;
125 int error;
126 int i;
127
128 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
129 if (error)
130 return error;
131
132 do {
133 error = xfs_inobt_get_rec(cur, &rec, &i);
134 if (error)
135 return error;
136
137 if (i) {
138 freecount += rec.ir_freecount;
139 error = xfs_btree_increment(cur, 0, &i);
140 if (error)
141 return error;
142 }
143 } while (i == 1);
144
145 if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
146 ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
147 }
148 return 0;
149}
150#else
151#define xfs_check_agi_freecount(cur, agi) 0
152#endif
153
154/*
155 * Initialise a new set of inodes.
156 */
157STATIC void
158xfs_ialloc_inode_init(
159 struct xfs_mount *mp,
160 struct xfs_trans *tp,
161 xfs_agnumber_t agno,
162 xfs_agblock_t agbno,
163 xfs_agblock_t length,
164 unsigned int gen)
165{
166 struct xfs_buf *fbuf;
167 struct xfs_dinode *free;
168 int blks_per_cluster, nbufs, ninodes;
169 int version;
170 int i, j;
171 xfs_daddr_t d;
172
173 /*
174 * Loop over the new block(s), filling in the inodes.
175 * For small block sizes, manipulate the inodes in buffers
176 * which are multiples of the block size.
177 */
178 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
179 blks_per_cluster = 1;
180 nbufs = length;
181 ninodes = mp->m_sb.sb_inopblock;
182 } else {
183 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
184 mp->m_sb.sb_blocksize;
185 nbufs = length / blks_per_cluster;
186 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
187 }
188
189 /*
190 * Figure out what version number to use in the inodes we create.
191 * If the superblock version has caught up to the one that supports
192 * the new inode format, then use the new inode version. Otherwise
193 * use the old version so that old kernels will continue to be
194 * able to use the file system.
195 */
196 if (xfs_sb_version_hasnlink(&mp->m_sb))
197 version = 2;
198 else
199 version = 1;
200
201 for (j = 0; j < nbufs; j++) {
202 /*
203 * Get the block.
204 */
205 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
206 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
207 mp->m_bsize * blks_per_cluster,
208 XFS_BUF_LOCK);
209 ASSERT(fbuf);
210 ASSERT(!XFS_BUF_GETERROR(fbuf));
211
212 /*
213 * Initialize all inodes in this buffer and then log them.
214 *
215 * XXX: It would be much better if we had just one transaction
216 * to log a whole cluster of inodes instead of all the
217 * individual transactions causing a lot of log traffic.
218 */
219 xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
220 for (i = 0; i < ninodes; i++) {
221 int ioffset = i << mp->m_sb.sb_inodelog;
222 uint isize = sizeof(struct xfs_dinode);
223
224 free = xfs_make_iptr(mp, fbuf, i);
225 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
226 free->di_version = version;
227 free->di_gen = cpu_to_be32(gen);
228 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
229 xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
230 }
231 xfs_trans_inode_alloc_buf(tp, fbuf);
232 }
233}
234
235/*
156 * Allocate new inodes in the allocation group specified by agbp. 236 * Allocate new inodes in the allocation group specified by agbp.
157 * Return 0 for success, else error code. 237 * Return 0 for success, else error code.
158 */ 238 */
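
The buffer sizing at the top of xfs_ialloc_inode_init() splits a new chunk into whole-cluster buffers. A minimal worked example of that arithmetic, using made-up geometry (the constants are sample values, not on-disk defaults):

    /*
     * Sketch of the cluster sizing in xfs_ialloc_inode_init: when the
     * inode cluster spans more than one fs block, inodes are written
     * in buffers of blks_per_cluster blocks.  Sample numbers only.
     */
    #include <stdio.h>

    int main(void)
    {
        unsigned blocksize = 4096;   /* sb_blocksize, example value */
        unsigned cluster   = 8192;   /* XFS_INODE_CLUSTER_SIZE, example */
        unsigned inopblock = 16;     /* inodes per block, example */
        unsigned length    = 64;     /* blocks in the new chunk */
        unsigned blks_per_cluster, nbufs, ninodes;

        if (blocksize >= cluster) {
            blks_per_cluster = 1;
            nbufs = length;
            ninodes = inopblock;
        } else {
            blks_per_cluster = cluster / blocksize;
            nbufs = length / blks_per_cluster;
            ninodes = blks_per_cluster * inopblock;
        }
        /* prints: 2 blocks/cluster, 32 buffers, 32 inodes each */
        printf("%u blocks/cluster, %u buffers, %u inodes each\n",
               blks_per_cluster, nbufs, ninodes);
        return 0;
    }
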
@@ -164,24 +244,15 @@ xfs_ialloc_ag_alloc(
164{ 244{
165 xfs_agi_t *agi; /* allocation group header */ 245 xfs_agi_t *agi; /* allocation group header */
166 xfs_alloc_arg_t args; /* allocation argument structure */ 246 xfs_alloc_arg_t args; /* allocation argument structure */
167 int blks_per_cluster; /* fs blocks per inode cluster */
168 xfs_btree_cur_t *cur; /* inode btree cursor */ 247 xfs_btree_cur_t *cur; /* inode btree cursor */
169 xfs_daddr_t d; /* disk addr of buffer */
170 xfs_agnumber_t agno; 248 xfs_agnumber_t agno;
171 int error; 249 int error;
172 xfs_buf_t *fbuf; /* new free inodes' buffer */ 250 int i;
173 xfs_dinode_t *free; /* new free inode structure */
174 int i; /* inode counter */
175 int j; /* block counter */
176 int nbufs; /* num bufs of new inodes */
177 xfs_agino_t newino; /* new first inode's number */ 251 xfs_agino_t newino; /* new first inode's number */
178 xfs_agino_t newlen; /* new number of inodes */ 252 xfs_agino_t newlen; /* new number of inodes */
179 int ninodes; /* num inodes per buf */
180 xfs_agino_t thisino; /* current inode number, for loop */ 253 xfs_agino_t thisino; /* current inode number, for loop */
181 int version; /* inode version number to use */
182 int isaligned = 0; /* inode allocation at stripe unit */ 254 int isaligned = 0; /* inode allocation at stripe unit */
183 /* boundary */ 255 /* boundary */
184 unsigned int gen;
185 256
186 args.tp = tp; 257 args.tp = tp;
187 args.mp = tp->t_mountp; 258 args.mp = tp->t_mountp;
@@ -202,12 +273,12 @@ xfs_ialloc_ag_alloc(
202 */ 273 */
203 agi = XFS_BUF_TO_AGI(agbp); 274 agi = XFS_BUF_TO_AGI(agbp);
204 newino = be32_to_cpu(agi->agi_newino); 275 newino = be32_to_cpu(agi->agi_newino);
276 agno = be32_to_cpu(agi->agi_seqno);
205 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + 277 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
206 XFS_IALLOC_BLOCKS(args.mp); 278 XFS_IALLOC_BLOCKS(args.mp);
207 if (likely(newino != NULLAGINO && 279 if (likely(newino != NULLAGINO &&
208 (args.agbno < be32_to_cpu(agi->agi_length)))) { 280 (args.agbno < be32_to_cpu(agi->agi_length)))) {
209 args.fsbno = XFS_AGB_TO_FSB(args.mp, 281 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
210 be32_to_cpu(agi->agi_seqno), args.agbno);
211 args.type = XFS_ALLOCTYPE_THIS_BNO; 282 args.type = XFS_ALLOCTYPE_THIS_BNO;
212 args.mod = args.total = args.wasdel = args.isfl = 283 args.mod = args.total = args.wasdel = args.isfl =
213 args.userdata = args.minalignslop = 0; 284 args.userdata = args.minalignslop = 0;
@@ -258,8 +329,7 @@ xfs_ialloc_ag_alloc(
258 * For now, just allocate blocks up front. 329 * For now, just allocate blocks up front.
259 */ 330 */
260 args.agbno = be32_to_cpu(agi->agi_root); 331 args.agbno = be32_to_cpu(agi->agi_root);
261 args.fsbno = XFS_AGB_TO_FSB(args.mp, 332 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
262 be32_to_cpu(agi->agi_seqno), args.agbno);
263 /* 333 /*
264 * Allocate a fixed-size extent of inodes. 334 * Allocate a fixed-size extent of inodes.
265 */ 335 */
@@ -282,8 +352,7 @@ xfs_ialloc_ag_alloc(
282 if (isaligned && args.fsbno == NULLFSBLOCK) { 352 if (isaligned && args.fsbno == NULLFSBLOCK) {
283 args.type = XFS_ALLOCTYPE_NEAR_BNO; 353 args.type = XFS_ALLOCTYPE_NEAR_BNO;
284 args.agbno = be32_to_cpu(agi->agi_root); 354 args.agbno = be32_to_cpu(agi->agi_root);
285 args.fsbno = XFS_AGB_TO_FSB(args.mp, 355 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
286 be32_to_cpu(agi->agi_seqno), args.agbno);
287 args.alignment = xfs_ialloc_cluster_alignment(&args); 356 args.alignment = xfs_ialloc_cluster_alignment(&args);
288 if ((error = xfs_alloc_vextent(&args))) 357 if ((error = xfs_alloc_vextent(&args)))
289 return error; 358 return error;
@@ -294,85 +363,30 @@ xfs_ialloc_ag_alloc(
294 return 0; 363 return 0;
295 } 364 }
296 ASSERT(args.len == args.minlen); 365 ASSERT(args.len == args.minlen);
297 /*
298 * Convert the results.
299 */
300 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
301 /*
302 * Loop over the new block(s), filling in the inodes.
303 * For small block sizes, manipulate the inodes in buffers
304 * which are multiples of the blocks size.
305 */
306 if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
307 blks_per_cluster = 1;
308 nbufs = (int)args.len;
309 ninodes = args.mp->m_sb.sb_inopblock;
310 } else {
311 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
312 args.mp->m_sb.sb_blocksize;
313 nbufs = (int)args.len / blks_per_cluster;
314 ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
315 }
316 /*
317 * Figure out what version number to use in the inodes we create.
318 * If the superblock version has caught up to the one that supports
319 * the new inode format, then use the new inode version. Otherwise
320 * use the old version so that old kernels will continue to be
321 * able to use the file system.
322 */
323 if (xfs_sb_version_hasnlink(&args.mp->m_sb))
324 version = 2;
325 else
326 version = 1;
327 366
328 /* 367 /*
368 * Stamp and write the inode buffers.
369 *
329 * Seed the new inode cluster with a random generation number. This 370 * Seed the new inode cluster with a random generation number. This
330 * prevents short-term reuse of generation numbers if a chunk is 371 * prevents short-term reuse of generation numbers if a chunk is
331 * freed and then immediately reallocated. We use random numbers 372 * freed and then immediately reallocated. We use random numbers
332 * rather than a linear progression to prevent the next generation 373 * rather than a linear progression to prevent the next generation
333 * number from being easily guessable. 374 * number from being easily guessable.
334 */ 375 */
335 gen = random32(); 376 xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len,
336 for (j = 0; j < nbufs; j++) { 377 random32());
337 /*
338 * Get the block.
339 */
340 d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno),
341 args.agbno + (j * blks_per_cluster));
342 fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
343 args.mp->m_bsize * blks_per_cluster,
344 XFS_BUF_LOCK);
345 ASSERT(fbuf);
346 ASSERT(!XFS_BUF_GETERROR(fbuf));
347 378
348 /* 379 /*
349 * Initialize all inodes in this buffer and then log them. 380 * Convert the results.
350 * 381 */
351 * XXX: It would be much better if we had just one transaction to 382 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
352 * log a whole cluster of inodes instead of all the individual
353 * transactions causing a lot of log traffic.
354 */
355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
356 for (i = 0; i < ninodes; i++) {
357 int ioffset = i << args.mp->m_sb.sb_inodelog;
358 uint isize = sizeof(struct xfs_dinode);
359
360 free = xfs_make_iptr(args.mp, fbuf, i);
361 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
362 free->di_version = version;
363 free->di_gen = cpu_to_be32(gen);
364 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
365 xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
366 }
367 xfs_trans_inode_alloc_buf(tp, fbuf);
368 }
369 be32_add_cpu(&agi->agi_count, newlen); 383 be32_add_cpu(&agi->agi_count, newlen);
370 be32_add_cpu(&agi->agi_freecount, newlen); 384 be32_add_cpu(&agi->agi_freecount, newlen);
371 agno = be32_to_cpu(agi->agi_seqno);
372 down_read(&args.mp->m_peraglock); 385 down_read(&args.mp->m_peraglock);
373 args.mp->m_perag[agno].pagi_freecount += newlen; 386 args.mp->m_perag[agno].pagi_freecount += newlen;
374 up_read(&args.mp->m_peraglock); 387 up_read(&args.mp->m_peraglock);
375 agi->agi_newino = cpu_to_be32(newino); 388 agi->agi_newino = cpu_to_be32(newino);
389
376 /* 390 /*
377 * Insert records describing the new inode chunk into the btree. 391 * Insert records describing the new inode chunk into the btree.
378 */ 392 */
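
The generation-seeding comment above carries the actual reasoning: di_gen is embedded in NFS file handles, so a freed-and-reallocated chunk must come back with generations that cannot be predicted from the old ones. A toy illustration of random versus linear seeding, assuming random32() behaves like a libc PRNG (it is a kernel helper; this is not its implementation):

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    int main(void)
    {
        srandom((unsigned)time(NULL));

        unsigned old_gen = 7;                    /* gen of the freed chunk */
        unsigned linear  = old_gen + 1;          /* trivially guessable */
        unsigned seeded  = (unsigned)random();   /* not derivable from 7 */

        printf("linear next gen: %u (predictable)\n", linear);
        printf("random seed gen: %u\n", seeded);
        return 0;
    }
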
@@ -380,13 +394,17 @@ xfs_ialloc_ag_alloc(
380 for (thisino = newino; 394 for (thisino = newino;
381 thisino < newino + newlen; 395 thisino < newino + newlen;
382 thisino += XFS_INODES_PER_CHUNK) { 396 thisino += XFS_INODES_PER_CHUNK) {
383 if ((error = xfs_inobt_lookup_eq(cur, thisino, 397 cur->bc_rec.i.ir_startino = thisino;
384 XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) { 398 cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
399 cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
400 error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
401 if (error) {
385 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 402 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
386 return error; 403 return error;
387 } 404 }
388 ASSERT(i == 0); 405 ASSERT(i == 0);
389 if ((error = xfs_btree_insert(cur, &i))) { 406 error = xfs_btree_insert(cur, &i);
407 if (error) {
390 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 408 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
391 return error; 409 return error;
392 } 410 }
@@ -539,6 +557,62 @@ nextag:
539} 557}
540 558
541/* 559/*
560 * Try to retrieve the next record to the left/right from the current one.
561 */
562STATIC int
563xfs_ialloc_next_rec(
564 struct xfs_btree_cur *cur,
565 xfs_inobt_rec_incore_t *rec,
566 int *done,
567 int left)
568{
569 int error;
570 int i;
571
572 if (left)
573 error = xfs_btree_decrement(cur, 0, &i);
574 else
575 error = xfs_btree_increment(cur, 0, &i);
576
577 if (error)
578 return error;
579 *done = !i;
580 if (i) {
581 error = xfs_inobt_get_rec(cur, rec, &i);
582 if (error)
583 return error;
584 XFS_WANT_CORRUPTED_RETURN(i == 1);
585 }
586
587 return 0;
588}
589
590STATIC int
591xfs_ialloc_get_rec(
592 struct xfs_btree_cur *cur,
593 xfs_agino_t agino,
594 xfs_inobt_rec_incore_t *rec,
595 int *done,
596 int left)
597{
598 int error;
599 int i;
600
601 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
602 if (error)
603 return error;
604 *done = !i;
605 if (i) {
606 error = xfs_inobt_get_rec(cur, rec, &i);
607 if (error)
608 return error;
609 XFS_WANT_CORRUPTED_RETURN(i == 1);
610 }
611
612 return 0;
613}
614
615/*
542 * Visible inode allocation functions. 616 * Visible inode allocation functions.
543 */ 617 */
544 618
@@ -592,8 +666,8 @@ xfs_dialloc(
592 int j; /* result code */ 666 int j; /* result code */
593 xfs_mount_t *mp; /* file system mount structure */ 667 xfs_mount_t *mp; /* file system mount structure */
594 int offset; /* index of inode in chunk */ 668 int offset; /* index of inode in chunk */
595 xfs_agino_t pagino; /* parent's a.g. relative inode # */ 669 xfs_agino_t pagino; /* parent's AG relative inode # */
596 xfs_agnumber_t pagno; /* parent's allocation group number */ 670 xfs_agnumber_t pagno; /* parent's AG number */
597 xfs_inobt_rec_incore_t rec; /* inode allocation record */ 671 xfs_inobt_rec_incore_t rec; /* inode allocation record */
598 xfs_agnumber_t tagno; /* testing allocation group number */ 672 xfs_agnumber_t tagno; /* testing allocation group number */
599 xfs_btree_cur_t *tcur; /* temp cursor */ 673 xfs_btree_cur_t *tcur; /* temp cursor */
@@ -716,6 +790,8 @@ nextag:
716 */ 790 */
717 agno = tagno; 791 agno = tagno;
718 *IO_agbp = NULL; 792 *IO_agbp = NULL;
793
794 restart_pagno:
719 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); 795 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
720 /* 796 /*
721 * If pagino is 0 (this is the root inode allocation) use newino. 797 * If pagino is 0 (this is the root inode allocation) use newino.
@@ -723,220 +799,199 @@ nextag:
723 */ 799 */
724 if (!pagino) 800 if (!pagino)
725 pagino = be32_to_cpu(agi->agi_newino); 801 pagino = be32_to_cpu(agi->agi_newino);
726#ifdef DEBUG
727 if (cur->bc_nlevels == 1) {
728 int freecount = 0;
729 802
730 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 803 error = xfs_check_agi_freecount(cur, agi);
731 goto error0; 804 if (error)
732 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 805 goto error0;
733 do {
734 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
735 &rec.ir_freecount, &rec.ir_free, &i)))
736 goto error0;
737 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
738 freecount += rec.ir_freecount;
739 if ((error = xfs_btree_increment(cur, 0, &i)))
740 goto error0;
741 } while (i == 1);
742 806
743 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
744 XFS_FORCED_SHUTDOWN(mp));
745 }
746#endif
747 /* 807 /*
748 * If in the same a.g. as the parent, try to get near the parent. 808 * If in the same AG as the parent, try to get near the parent.
749 */ 809 */
750 if (pagno == agno) { 810 if (pagno == agno) {
751 if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i))) 811 xfs_perag_t *pag = &mp->m_perag[agno];
812 int doneleft; /* done, to the left */
813 int doneright; /* done, to the right */
814 int searchdistance = 10;
815
816 error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
817 if (error)
818 goto error0;
819 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
820
821 error = xfs_inobt_get_rec(cur, &rec, &j);
822 if (error)
752 goto error0; 823 goto error0;
753 if (i != 0 && 824 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
754 (error = xfs_inobt_get_rec(cur, &rec.ir_startino, 825
755 &rec.ir_freecount, &rec.ir_free, &j)) == 0 && 826 if (rec.ir_freecount > 0) {
756 j == 1 &&
757 rec.ir_freecount > 0) {
758 /* 827 /*
759 * Found a free inode in the same chunk 828 * Found a free inode in the same chunk
760 * as parent, done. 829 * as the parent, done.
761 */ 830 */
831 goto alloc_inode;
762 } 832 }
833
834
835 /*
836 * In the same AG as parent, but parent's chunk is full.
837 */
838
839 /* duplicate the cursor, search left & right simultaneously */
840 error = xfs_btree_dup_cursor(cur, &tcur);
841 if (error)
842 goto error0;
843
763 /* 844 /*
764 * In the same a.g. as parent, but parent's chunk is full. 845 * Skip to last blocks looked up if same parent inode.
765 */ 846 */
766 else { 847 if (pagino != NULLAGINO &&
767 int doneleft; /* done, to the left */ 848 pag->pagl_pagino == pagino &&
768 int doneright; /* done, to the right */ 849 pag->pagl_leftrec != NULLAGINO &&
850 pag->pagl_rightrec != NULLAGINO) {
851 error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
852 &trec, &doneleft, 1);
853 if (error)
854 goto error1;
769 855
856 error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
857 &rec, &doneright, 0);
770 if (error) 858 if (error)
771 goto error0;
772 ASSERT(i == 1);
773 ASSERT(j == 1);
774 /*
775 * Duplicate the cursor, search left & right
776 * simultaneously.
777 */
778 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
779 goto error0;
780 /*
781 * Search left with tcur, back up 1 record.
782 */
783 if ((error = xfs_btree_decrement(tcur, 0, &i)))
784 goto error1; 859 goto error1;
785 doneleft = !i; 860 } else {
786 if (!doneleft) { 861 /* search left with tcur, back up 1 record */
787 if ((error = xfs_inobt_get_rec(tcur, 862 error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
788 &trec.ir_startino, 863 if (error)
789 &trec.ir_freecount,
790 &trec.ir_free, &i)))
791 goto error1;
792 XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
793 }
794 /*
795 * Search right with cur, go forward 1 record.
796 */
797 if ((error = xfs_btree_increment(cur, 0, &i)))
798 goto error1; 864 goto error1;
799 doneright = !i;
800 if (!doneright) {
801 if ((error = xfs_inobt_get_rec(cur,
802 &rec.ir_startino,
803 &rec.ir_freecount,
804 &rec.ir_free, &i)))
805 goto error1;
806 XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
807 }
808 /*
809 * Loop until we find the closest inode chunk
810 * with a free one.
811 */
812 while (!doneleft || !doneright) {
813 int useleft; /* using left inode
814 chunk this time */
815 865
866 /* search right with cur, go forward 1 record. */
867 error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
868 if (error)
869 goto error1;
870 }
871
872 /*
873 * Loop until we find an inode chunk with a free inode.
874 */
875 while (!doneleft || !doneright) {
876 int useleft; /* using left inode chunk this time */
877
878 if (!--searchdistance) {
816 /* 879 /*
817 * Figure out which block is closer, 880 * Not in range - save last search
818 * if both are valid. 881 * location and allocate a new inode
819 */
820 if (!doneleft && !doneright)
821 useleft =
822 pagino -
823 (trec.ir_startino +
824 XFS_INODES_PER_CHUNK - 1) <
825 rec.ir_startino - pagino;
826 else
827 useleft = !doneleft;
828 /*
829 * If checking the left, does it have
830 * free inodes?
831 */
832 if (useleft && trec.ir_freecount) {
833 /*
834 * Yes, set it up as the chunk to use.
835 */
836 rec = trec;
837 xfs_btree_del_cursor(cur,
838 XFS_BTREE_NOERROR);
839 cur = tcur;
840 break;
841 }
842 /*
843 * If checking the right, does it have
844 * free inodes?
845 */
846 if (!useleft && rec.ir_freecount) {
847 /*
848 * Yes, it's already set up.
849 */
850 xfs_btree_del_cursor(tcur,
851 XFS_BTREE_NOERROR);
852 break;
853 }
854 /*
855 * If used the left, get another one
856 * further left.
857 */
858 if (useleft) {
859 if ((error = xfs_btree_decrement(tcur, 0,
860 &i)))
861 goto error1;
862 doneleft = !i;
863 if (!doneleft) {
864 if ((error = xfs_inobt_get_rec(
865 tcur,
866 &trec.ir_startino,
867 &trec.ir_freecount,
868 &trec.ir_free, &i)))
869 goto error1;
870 XFS_WANT_CORRUPTED_GOTO(i == 1,
871 error1);
872 }
873 }
874 /*
875 * If used the right, get another one
876 * further right.
877 */ 882 */
878 else { 883 pag->pagl_leftrec = trec.ir_startino;
879 if ((error = xfs_btree_increment(cur, 0, 884 pag->pagl_rightrec = rec.ir_startino;
880 &i))) 885 pag->pagl_pagino = pagino;
881 goto error1; 886 goto newino;
882 doneright = !i; 887 }
883 if (!doneright) { 888
884 if ((error = xfs_inobt_get_rec( 889 /* figure out the closer block if both are valid. */
885 cur, 890 if (!doneleft && !doneright) {
886 &rec.ir_startino, 891 useleft = pagino -
887 &rec.ir_freecount, 892 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
888 &rec.ir_free, &i))) 893 rec.ir_startino - pagino;
889 goto error1; 894 } else {
890 XFS_WANT_CORRUPTED_GOTO(i == 1, 895 useleft = !doneleft;
891 error1);
892 }
893 }
894 } 896 }
895 ASSERT(!doneleft || !doneright); 897
898 /* free inodes to the left? */
899 if (useleft && trec.ir_freecount) {
900 rec = trec;
901 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
902 cur = tcur;
903
904 pag->pagl_leftrec = trec.ir_startino;
905 pag->pagl_rightrec = rec.ir_startino;
906 pag->pagl_pagino = pagino;
907 goto alloc_inode;
908 }
909
910 /* free inodes to the right? */
911 if (!useleft && rec.ir_freecount) {
912 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
913
914 pag->pagl_leftrec = trec.ir_startino;
915 pag->pagl_rightrec = rec.ir_startino;
916 pag->pagl_pagino = pagino;
917 goto alloc_inode;
918 }
919
920 /* get next record to check */
921 if (useleft) {
922 error = xfs_ialloc_next_rec(tcur, &trec,
923 &doneleft, 1);
924 } else {
925 error = xfs_ialloc_next_rec(cur, &rec,
926 &doneright, 0);
927 }
928 if (error)
929 goto error1;
896 } 930 }
931
932 /*
933 * We've reached the end of the btree. Because
934 * we only search a small chunk of the btree on
935 * each pass, there are obviously free inodes
936 * closer to the parent inode than we are now.
937 * Restart the search.
938 */
939 pag->pagl_pagino = NULLAGINO;
940 pag->pagl_leftrec = NULLAGINO;
941 pag->pagl_rightrec = NULLAGINO;
942 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
943 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
944 goto restart_pagno;
897 } 945 }
946
898 /* 947 /*
899 * In a different a.g. from the parent. 948 * In a different AG from the parent.
900 * See if the most recently allocated block has any free. 949 * See if the most recently allocated block has any free.
901 */ 950 */
902 else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { 951newino:
903 if ((error = xfs_inobt_lookup_eq(cur, 952 if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
904 be32_to_cpu(agi->agi_newino), 0, 0, &i))) 953 error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
954 XFS_LOOKUP_EQ, &i);
955 if (error)
905 goto error0; 956 goto error0;
906 if (i == 1 && 957
907 (error = xfs_inobt_get_rec(cur, &rec.ir_startino, 958 if (i == 1) {
908 &rec.ir_freecount, &rec.ir_free, &j)) == 0 && 959 error = xfs_inobt_get_rec(cur, &rec, &j);
909 j == 1 &&
910 rec.ir_freecount > 0) {
911 /*
912 * The last chunk allocated in the group still has
913 * a free inode.
914 */
915 }
916 /*
917 * None left in the last group, search the whole a.g.
918 */
919 else {
920 if (error) 960 if (error)
921 goto error0; 961 goto error0;
922 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 962
923 goto error0; 963 if (j == 1 && rec.ir_freecount > 0) {
924 ASSERT(i == 1); 964 /*
925 for (;;) { 965 * The last chunk allocated in the group
926 if ((error = xfs_inobt_get_rec(cur, 966 * still has a free inode.
927 &rec.ir_startino, 967 */
928 &rec.ir_freecount, &rec.ir_free, 968 goto alloc_inode;
929 &i)))
930 goto error0;
931 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
932 if (rec.ir_freecount > 0)
933 break;
934 if ((error = xfs_btree_increment(cur, 0, &i)))
935 goto error0;
936 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
937 } 969 }
938 } 970 }
939 } 971 }
972
973 /*
974 * None left in the last group, search the whole AG
975 */
976 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
977 if (error)
978 goto error0;
979 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
980
981 for (;;) {
982 error = xfs_inobt_get_rec(cur, &rec, &i);
983 if (error)
984 goto error0;
985 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
986 if (rec.ir_freecount > 0)
987 break;
988 error = xfs_btree_increment(cur, 0, &i);
989 if (error)
990 goto error0;
991 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
992 }
993
994alloc_inode:
940 offset = xfs_ialloc_find_free(&rec.ir_free); 995 offset = xfs_ialloc_find_free(&rec.ir_free);
941 ASSERT(offset >= 0); 996 ASSERT(offset >= 0);
942 ASSERT(offset < XFS_INODES_PER_CHUNK); 997 ASSERT(offset < XFS_INODES_PER_CHUNK);
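
The rewritten locality search above walks two cursors outward from the parent's chunk, gives up after a fixed searchdistance, and caches pagl_leftrec/pagl_rightrec so the next allocation can resume where this one stopped. A self-contained model of that bounded two-sided scan over a sorted array of chunks, with simplified distance math (illustrative only, not the kernel data structures):

    #include <stdio.h>

    struct chunk { unsigned start; unsigned freecount; };

    /*
     * Bounded left/right scan for the free chunk nearest to parent.
     * Returns a chunk index, or -1 when the budget runs out (the
     * kernel then caches its position and falls back to agi_newino).
     */
    static int nearest_free(const struct chunk *c, int n, unsigned parent,
                            int budget)
    {
        int left, right;

        /* find the last chunk starting at or below parent (LOOKUP_LE) */
        for (left = n - 1; left >= 0 && c[left].start > parent; left--)
            ;
        right = left + 1;

        while ((left >= 0 || right < n) && budget--) {
            int useleft;

            if (left >= 0 && right < n)       /* pick the closer side */
                useleft = parent - c[left].start <
                          c[right].start - parent;
            else
                useleft = left >= 0;

            if (useleft) {
                if (c[left].freecount)
                    return left;
                left--;                        /* step further left */
            } else {
                if (c[right].freecount)
                    return right;
                right++;                       /* step further right */
            }
        }
        return -1;
    }

    int main(void)
    {
        struct chunk c[] = {
            { 0, 0 }, { 64, 0 }, { 128, 3 }, { 192, 0 }, { 256, 1 },
        };
        int i = nearest_free(c, 5, 200, 10);

        if (i >= 0)   /* prints 256: the 192 chunk is closer but full */
            printf("nearest free chunk starts at %u\n", c[i].start);
        return 0;
    }
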
@@ -945,33 +1000,19 @@ nextag:
945 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); 1000 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
946 rec.ir_free &= ~XFS_INOBT_MASK(offset); 1001 rec.ir_free &= ~XFS_INOBT_MASK(offset);
947 rec.ir_freecount--; 1002 rec.ir_freecount--;
948 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, 1003 error = xfs_inobt_update(cur, &rec);
949 rec.ir_free))) 1004 if (error)
950 goto error0; 1005 goto error0;
951 be32_add_cpu(&agi->agi_freecount, -1); 1006 be32_add_cpu(&agi->agi_freecount, -1);
952 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1007 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
953 down_read(&mp->m_peraglock); 1008 down_read(&mp->m_peraglock);
954 mp->m_perag[tagno].pagi_freecount--; 1009 mp->m_perag[tagno].pagi_freecount--;
955 up_read(&mp->m_peraglock); 1010 up_read(&mp->m_peraglock);
956#ifdef DEBUG
957 if (cur->bc_nlevels == 1) {
958 int freecount = 0;
959 1011
960 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 1012 error = xfs_check_agi_freecount(cur, agi);
961 goto error0; 1013 if (error)
962 do { 1014 goto error0;
963 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, 1015
964 &rec.ir_freecount, &rec.ir_free, &i)))
965 goto error0;
966 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
967 freecount += rec.ir_freecount;
968 if ((error = xfs_btree_increment(cur, 0, &i)))
969 goto error0;
970 } while (i == 1);
971 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
972 XFS_FORCED_SHUTDOWN(mp));
973 }
974#endif
975 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1016 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
976 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); 1017 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
977 *inop = ino; 1018 *inop = ino;
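
The allocation step at alloc_inode: is pure bit arithmetic: take the lowest set bit of ir_free, clear it, and decrement the free count. A stand-alone sketch of that step, modelling xfs_ialloc_find_free() with a compiler builtin (an assumption for illustration, not the kernel helper):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        /* ir_free: bit i set means inode i of the 64-inode chunk is free */
        uint64_t ir_free = 0xf0;      /* inodes 4..7 free, sample data */
        int ir_freecount = 4;

        /* the caller guarantees ir_free != 0, as the ASSERTs above imply */
        int offset = __builtin_ctzll(ir_free);   /* lowest free inode */

        ir_free &= ~(1ULL << offset);             /* claim it */
        ir_freecount--;

        printf("allocated offset %d, %d free left, mask %#llx\n",
               offset, ir_freecount, (unsigned long long)ir_free);
        return 0;
    }
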
@@ -1062,38 +1103,23 @@ xfs_difree(
1062 * Initialize the cursor. 1103 * Initialize the cursor.
1063 */ 1104 */
1064 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1105 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1065#ifdef DEBUG
1066 if (cur->bc_nlevels == 1) {
1067 int freecount = 0;
1068 1106
1069 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 1107 error = xfs_check_agi_freecount(cur, agi);
1070 goto error0; 1108 if (error)
1071 do { 1109 goto error0;
1072 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, 1110
1073 &rec.ir_freecount, &rec.ir_free, &i)))
1074 goto error0;
1075 if (i) {
1076 freecount += rec.ir_freecount;
1077 if ((error = xfs_btree_increment(cur, 0, &i)))
1078 goto error0;
1079 }
1080 } while (i == 1);
1081 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
1082 XFS_FORCED_SHUTDOWN(mp));
1083 }
1084#endif
1085 /* 1111 /*
1086 * Look for the entry describing this inode. 1112 * Look for the entry describing this inode.
1087 */ 1113 */
1088 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { 1114 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1089 cmn_err(CE_WARN, 1115 cmn_err(CE_WARN,
1090 "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.", 1116 "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.",
1091 error, mp->m_fsname); 1117 error, mp->m_fsname);
1092 goto error0; 1118 goto error0;
1093 } 1119 }
1094 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1120 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1095 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount, 1121 error = xfs_inobt_get_rec(cur, &rec, &i);
1096 &rec.ir_free, &i))) { 1122 if (error) {
1097 cmn_err(CE_WARN, 1123 cmn_err(CE_WARN,
1098 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", 1124 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.",
1099 error, mp->m_fsname); 1125 error, mp->m_fsname);
@@ -1148,12 +1174,14 @@ xfs_difree(
1148 } else { 1174 } else {
1149 *delete = 0; 1175 *delete = 0;
1150 1176
1151 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) { 1177 error = xfs_inobt_update(cur, &rec);
1178 if (error) {
1152 cmn_err(CE_WARN, 1179 cmn_err(CE_WARN,
1153 "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.", 1180 "xfs_difree: xfs_inobt_update returned an error %d on %s.",
1154 error, mp->m_fsname); 1181 error, mp->m_fsname);
1155 goto error0; 1182 goto error0;
1156 } 1183 }
1184
1157 /* 1185 /*
1158 * Change the inode free counts and log the ag/sb changes. 1186 * Change the inode free counts and log the ag/sb changes.
1159 */ 1187 */
@@ -1165,28 +1193,10 @@ xfs_difree(
1165 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); 1193 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1166 } 1194 }
1167 1195
1168#ifdef DEBUG 1196 error = xfs_check_agi_freecount(cur, agi);
1169 if (cur->bc_nlevels == 1) { 1197 if (error)
1170 int freecount = 0; 1198 goto error0;
1171 1199
1172 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
1173 goto error0;
1174 do {
1175 if ((error = xfs_inobt_get_rec(cur,
1176 &rec.ir_startino,
1177 &rec.ir_freecount,
1178 &rec.ir_free, &i)))
1179 goto error0;
1180 if (i) {
1181 freecount += rec.ir_freecount;
1182 if ((error = xfs_btree_increment(cur, 0, &i)))
1183 goto error0;
1184 }
1185 } while (i == 1);
1186 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
1187 XFS_FORCED_SHUTDOWN(mp));
1188 }
1189#endif
1190 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1200 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1191 return 0; 1201 return 0;
1192 1202
@@ -1297,9 +1307,7 @@ xfs_imap(
1297 chunk_agbno = agbno - offset_agbno; 1307 chunk_agbno = agbno - offset_agbno;
1298 } else { 1308 } else {
1299 xfs_btree_cur_t *cur; /* inode btree cursor */ 1309 xfs_btree_cur_t *cur; /* inode btree cursor */
1300 xfs_agino_t chunk_agino; /* first agino in inode chunk */ 1310 xfs_inobt_rec_incore_t chunk_rec;
1301 __int32_t chunk_cnt; /* count of free inodes in chunk */
1302 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1303 xfs_buf_t *agbp; /* agi buffer */ 1311 xfs_buf_t *agbp; /* agi buffer */
1304 int i; /* temp state */ 1312 int i; /* temp state */
1305 1313
@@ -1315,15 +1323,14 @@ xfs_imap(
1315 } 1323 }
1316 1324
1317 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1325 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1318 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i); 1326 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
1319 if (error) { 1327 if (error) {
1320 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1321 "xfs_inobt_lookup_le() failed"); 1329 "xfs_inobt_lookup() failed");
1322 goto error0; 1330 goto error0;
1323 } 1331 }
1324 1332
1325 error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, 1333 error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
1326 &chunk_free, &i);
1327 if (error) { 1334 if (error) {
1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1335 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1329 "xfs_inobt_get_rec() failed"); 1336 "xfs_inobt_get_rec() failed");
@@ -1341,7 +1348,7 @@ xfs_imap(
1341 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1348 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1342 if (error) 1349 if (error)
1343 return error; 1350 return error;
1344 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); 1351 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
1345 offset_agbno = agbno - chunk_agbno; 1352 offset_agbno = agbno - chunk_agbno;
1346 } 1353 }
1347 1354
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index aeee8278f92c..bb5385475e1f 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,23 +150,15 @@ xfs_ialloc_pagi_init(
150 xfs_agnumber_t agno); /* allocation group number */ 150 xfs_agnumber_t agno); /* allocation group number */
151 151
152/* 152/*
153 * Lookup the first record greater than or equal to ino 153 * Lookup a record by ino in the btree given by cur.
154 * in the btree given by cur.
155 */ 154 */
156int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino, 155int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
157 __int32_t fcnt, xfs_inofree_t free, int *stat); 156 xfs_lookup_t dir, int *stat);
158
159/*
160 * Lookup the first record less than or equal to ino
161 * in the btree given by cur.
162 */
163int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
164 __int32_t fcnt, xfs_inofree_t free, int *stat);
165 157
166/* 158/*
167 * Get the data from the pointed-to record. 159 * Get the data from the pointed-to record.
168 */ 160 */
169extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino, 161extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
170 __int32_t *fcnt, xfs_inofree_t *free, int *stat); 162 xfs_inobt_rec_incore_t *rec, int *stat);
171 163
172#endif /* __XFS_IALLOC_H__ */ 164#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index ecbf8b4d2e2e..80e526489be5 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -82,7 +82,6 @@ xfs_inode_alloc(
82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); 82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
83 ip->i_flags = 0; 83 ip->i_flags = 0;
84 ip->i_update_core = 0; 84 ip->i_update_core = 0;
85 ip->i_update_size = 0;
86 ip->i_delayed_blks = 0; 85 ip->i_delayed_blks = 0;
87 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); 86 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
88 ip->i_size = 0; 87 ip->i_size = 0;
@@ -456,32 +455,6 @@ out_error_or_again:
456 return error; 455 return error;
457} 456}
458 457
459
460/*
461 * Look for the inode corresponding to the given ino in the hash table.
462 * If it is there and its i_transp pointer matches tp, return it.
463 * Otherwise, return NULL.
464 */
465xfs_inode_t *
466xfs_inode_incore(xfs_mount_t *mp,
467 xfs_ino_t ino,
468 xfs_trans_t *tp)
469{
470 xfs_inode_t *ip;
471 xfs_perag_t *pag;
472
473 pag = xfs_get_perag(mp, ino);
474 read_lock(&pag->pag_ici_lock);
475 ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
476 read_unlock(&pag->pag_ici_lock);
477 xfs_put_perag(mp, pag);
478
479 /* the returned inode must match the transaction */
480 if (ip && (ip->i_transp != tp))
481 return NULL;
482 return ip;
483}
484
485/* 458/*
486 * Decrement reference count of an inode structure and unlock it. 459 * Decrement reference count of an inode structure and unlock it.
487 * 460 *
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index da428b3fe0f5..c1dc7ef5a1d8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -651,7 +651,7 @@ xfs_iformat_btree(
651 return 0; 651 return 0;
652} 652}
653 653
654void 654STATIC void
655xfs_dinode_from_disk( 655xfs_dinode_from_disk(
656 xfs_icdinode_t *to, 656 xfs_icdinode_t *to,
657 xfs_dinode_t *from) 657 xfs_dinode_t *from)
@@ -1247,7 +1247,7 @@ xfs_isize_check(
1247 * In that case the pages will still be in memory, but the inode size 1247 * In that case the pages will still be in memory, but the inode size
1248 * will never have been updated. 1248 * will never have been updated.
1249 */ 1249 */
1250xfs_fsize_t 1250STATIC xfs_fsize_t
1251xfs_file_last_byte( 1251xfs_file_last_byte(
1252 xfs_inode_t *ip) 1252 xfs_inode_t *ip)
1253{ 1253{
@@ -3837,7 +3837,7 @@ xfs_iext_inline_to_direct(
3837/* 3837/*
3838 * Resize an extent indirection array to new_size bytes. 3838 * Resize an extent indirection array to new_size bytes.
3839 */ 3839 */
3840void 3840STATIC void
3841xfs_iext_realloc_indirect( 3841xfs_iext_realloc_indirect(
3842 xfs_ifork_t *ifp, /* inode fork pointer */ 3842 xfs_ifork_t *ifp, /* inode fork pointer */
3843 int new_size) /* new indirection array size */ 3843 int new_size) /* new indirection array size */
@@ -3862,7 +3862,7 @@ xfs_iext_realloc_indirect(
3862/* 3862/*
3863 * Switch from indirection array to linear (direct) extent allocations. 3863 * Switch from indirection array to linear (direct) extent allocations.
3864 */ 3864 */
3865void 3865STATIC void
3866xfs_iext_indirect_to_direct( 3866xfs_iext_indirect_to_direct(
3867 xfs_ifork_t *ifp) /* inode fork pointer */ 3867 xfs_ifork_t *ifp) /* inode fork pointer */
3868{ 3868{
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 65f24a3cc992..0b38b9a869ec 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -261,7 +261,6 @@ typedef struct xfs_inode {
261 /* Miscellaneous state. */ 261 /* Miscellaneous state. */
262 unsigned short i_flags; /* see defined flags below */ 262 unsigned short i_flags; /* see defined flags below */
263 unsigned char i_update_core; /* timestamps/size is dirty */ 263 unsigned char i_update_core; /* timestamps/size is dirty */
264 unsigned char i_update_size; /* di_size field is dirty */
265 unsigned int i_delayed_blks; /* count of delay alloc blks */ 264 unsigned int i_delayed_blks; /* count of delay alloc blks */
266 265
267 xfs_icdinode_t i_d; /* most of ondisk inode */ 266 xfs_icdinode_t i_d; /* most of ondisk inode */
@@ -468,8 +467,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
468/* 467/*
469 * xfs_iget.c prototypes. 468 * xfs_iget.c prototypes.
470 */ 469 */
471xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
472 struct xfs_trans *);
473int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, 470int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
474 uint, uint, xfs_inode_t **, xfs_daddr_t); 471 uint, uint, xfs_inode_t **, xfs_daddr_t);
475void xfs_iput(xfs_inode_t *, uint); 472void xfs_iput(xfs_inode_t *, uint);
@@ -504,7 +501,6 @@ void xfs_ipin(xfs_inode_t *);
504void xfs_iunpin(xfs_inode_t *); 501void xfs_iunpin(xfs_inode_t *);
505int xfs_iflush(xfs_inode_t *, uint); 502int xfs_iflush(xfs_inode_t *, uint);
506void xfs_ichgtime(xfs_inode_t *, int); 503void xfs_ichgtime(xfs_inode_t *, int);
507xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
508void xfs_lock_inodes(xfs_inode_t **, int, uint); 504void xfs_lock_inodes(xfs_inode_t **, int, uint);
509void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 505void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
510 506
@@ -572,8 +568,6 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
572 struct xfs_buf **, uint); 568 struct xfs_buf **, uint);
573int xfs_iread(struct xfs_mount *, struct xfs_trans *, 569int xfs_iread(struct xfs_mount *, struct xfs_trans *,
574 struct xfs_inode *, xfs_daddr_t, uint); 570 struct xfs_inode *, xfs_daddr_t, uint);
575void xfs_dinode_from_disk(struct xfs_icdinode *,
576 struct xfs_dinode *);
577void xfs_dinode_to_disk(struct xfs_dinode *, 571void xfs_dinode_to_disk(struct xfs_dinode *,
578 struct xfs_icdinode *); 572 struct xfs_icdinode *);
579void xfs_idestroy_fork(struct xfs_inode *, int); 573void xfs_idestroy_fork(struct xfs_inode *, int);
@@ -592,8 +586,6 @@ void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
592void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); 586void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
593void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); 587void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
594void xfs_iext_realloc_direct(xfs_ifork_t *, int); 588void xfs_iext_realloc_direct(xfs_ifork_t *, int);
595void xfs_iext_realloc_indirect(xfs_ifork_t *, int);
596void xfs_iext_indirect_to_direct(xfs_ifork_t *);
597void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); 589void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
598void xfs_iext_inline_to_direct(xfs_ifork_t *, int); 590void xfs_iext_inline_to_direct(xfs_ifork_t *, int);
599void xfs_iext_destroy(xfs_ifork_t *); 591void xfs_iext_destroy(xfs_ifork_t *);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 977c4aec587e..47d5b663c37e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -263,14 +263,6 @@ xfs_inode_item_format(
263 } 263 }
264 264
265 /* 265 /*
266 * We don't have to worry about re-ordering here because
267 * the update_size field is protected by the inode lock
268 * and we have that held in exclusive mode.
269 */
270 if (ip->i_update_size)
271 ip->i_update_size = 0;
272
273 /*
274 * Make sure to get the latest atime from the Linux inode. 266 * Make sure to get the latest atime from the Linux inode.
275 */ 267 */
276 xfs_synchronize_atime(ip); 268 xfs_synchronize_atime(ip);
@@ -712,8 +704,6 @@ xfs_inode_item_unlock(
712 * Clear out the fields of the inode log item particular 704 * Clear out the fields of the inode log item particular
713 * to the current transaction. 705 * to the current transaction.
714 */ 706 */
715 iip->ili_ilock_recur = 0;
716 iip->ili_iolock_recur = 0;
717 iip->ili_flags = 0; 707 iip->ili_flags = 0;
718 708
719 /* 709 /*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index a52ac125f055..65bae4c9b8bf 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -137,8 +137,6 @@ typedef struct xfs_inode_log_item {
137 struct xfs_inode *ili_inode; /* inode ptr */ 137 struct xfs_inode *ili_inode; /* inode ptr */
138 xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ 138 xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
139 xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ 139 xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
140 unsigned short ili_ilock_recur; /* lock recursion count */
141 unsigned short ili_iolock_recur; /* lock recursion count */
142 unsigned short ili_flags; /* misc flags */ 140 unsigned short ili_flags; /* misc flags */
143 unsigned short ili_logged; /* flushed logged data */ 141 unsigned short ili_logged; /* flushed logged data */
144 unsigned int ili_last_fields; /* fields when flushed */ 142 unsigned int ili_last_fields; /* fields when flushed */
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
index 7a28191cb0de..b8e4ee4e89a4 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/xfs_inum.h
@@ -72,7 +72,6 @@ struct xfs_mount;
72 72
73#if XFS_BIG_INUMS 73#if XFS_BIG_INUMS
74#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) 74#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
75#define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32))
76#else 75#else
77#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL)) 76#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL))
78#endif 77#endif
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index aeb2d2221c7d..b68f9107e26c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -39,7 +39,7 @@
39#include "xfs_error.h" 39#include "xfs_error.h"
40#include "xfs_btree.h" 40#include "xfs_btree.h"
41 41
42int 42STATIC int
43xfs_internal_inum( 43xfs_internal_inum(
44 xfs_mount_t *mp, 44 xfs_mount_t *mp,
45 xfs_ino_t ino) 45 xfs_ino_t ino)
@@ -353,9 +353,6 @@ xfs_bulkstat(
353 int end_of_ag; /* set if we've seen the ag end */ 353 int end_of_ag; /* set if we've seen the ag end */
354 int error; /* error code */ 354 int error; /* error code */
355 int fmterror;/* bulkstat formatter result */ 355 int fmterror;/* bulkstat formatter result */
356 __int32_t gcnt; /* current btree rec's count */
357 xfs_inofree_t gfree; /* current btree rec's free mask */
358 xfs_agino_t gino; /* current btree rec's start inode */
359 int i; /* loop index */ 356 int i; /* loop index */
360 int icount; /* count of inodes good in irbuf */ 357 int icount; /* count of inodes good in irbuf */
361 size_t irbsize; /* size of irec buffer in bytes */ 358 size_t irbsize; /* size of irec buffer in bytes */
@@ -442,40 +439,43 @@ xfs_bulkstat(
442 * we need to get the remainder of the chunk we're in. 439 * we need to get the remainder of the chunk we're in.
443 */ 440 */
444 if (agino > 0) { 441 if (agino > 0) {
442 xfs_inobt_rec_incore_t r;
443
445 /* 444 /*
446 * Lookup the inode chunk that this inode lives in. 445 * Lookup the inode chunk that this inode lives in.
447 */ 446 */
448 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp); 447 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE,
448 &tmp);
449 if (!error && /* no I/O error */ 449 if (!error && /* no I/O error */
450 tmp && /* lookup succeeded */ 450 tmp && /* lookup succeeded */
451 /* got the record, should always work */ 451 /* got the record, should always work */
452 !(error = xfs_inobt_get_rec(cur, &gino, &gcnt, 452 !(error = xfs_inobt_get_rec(cur, &r, &i)) &&
453 &gfree, &i)) &&
454 i == 1 && 453 i == 1 &&
455 /* this is the right chunk */ 454 /* this is the right chunk */
456 agino < gino + XFS_INODES_PER_CHUNK && 455 agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
457 /* lastino was not last in chunk */ 456 /* lastino was not last in chunk */
458 (chunkidx = agino - gino + 1) < 457 (chunkidx = agino - r.ir_startino + 1) <
459 XFS_INODES_PER_CHUNK && 458 XFS_INODES_PER_CHUNK &&
460 /* there are some left allocated */ 459 /* there are some left allocated */
461 xfs_inobt_maskn(chunkidx, 460 xfs_inobt_maskn(chunkidx,
462 XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) { 461 XFS_INODES_PER_CHUNK - chunkidx) &
462 ~r.ir_free) {
463 /* 463 /*
464 * Grab the chunk record. Mark all the 464 * Grab the chunk record. Mark all the
465 * uninteresting inodes (because they're 465 * uninteresting inodes (because they're
466 * before our start point) free. 466 * before our start point) free.
467 */ 467 */
468 for (i = 0; i < chunkidx; i++) { 468 for (i = 0; i < chunkidx; i++) {
469 if (XFS_INOBT_MASK(i) & ~gfree) 469 if (XFS_INOBT_MASK(i) & ~r.ir_free)
470 gcnt++; 470 r.ir_freecount++;
471 } 471 }
472 gfree |= xfs_inobt_maskn(0, chunkidx); 472 r.ir_free |= xfs_inobt_maskn(0, chunkidx);
473 irbp->ir_startino = gino; 473 irbp->ir_startino = r.ir_startino;
474 irbp->ir_freecount = gcnt; 474 irbp->ir_freecount = r.ir_freecount;
475 irbp->ir_free = gfree; 475 irbp->ir_free = r.ir_free;
476 irbp++; 476 irbp++;
477 agino = gino + XFS_INODES_PER_CHUNK; 477 agino = r.ir_startino + XFS_INODES_PER_CHUNK;
478 icount = XFS_INODES_PER_CHUNK - gcnt; 478 icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
479 } else { 479 } else {
480 /* 480 /*
481 * If any of those tests failed, bump the 481 * If any of those tests failed, bump the
@@ -493,7 +493,7 @@ xfs_bulkstat(
493 /* 493 /*
494 * Start of ag. Lookup the first inode chunk. 494 * Start of ag. Lookup the first inode chunk.
495 */ 495 */
496 error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp); 496 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
497 icount = 0; 497 icount = 0;
498 } 498 }
499 /* 499 /*
@@ -501,6 +501,8 @@ xfs_bulkstat(
501 * until we run out of inodes or space in the buffer. 501 * until we run out of inodes or space in the buffer.
502 */ 502 */
503 while (irbp < irbufend && icount < ubcount) { 503 while (irbp < irbufend && icount < ubcount) {
504 xfs_inobt_rec_incore_t r;
505
504 /* 506 /*
505 * Loop as long as we're unable to read the 507 * Loop as long as we're unable to read the
506 * inode btree. 508 * inode btree.
@@ -510,51 +512,55 @@ xfs_bulkstat(
 			if (XFS_AGINO_TO_AGBNO(mp, agino) >=
 					be32_to_cpu(agi->agi_length))
 				break;
-			error = xfs_inobt_lookup_ge(cur, agino, 0, 0,
-						    &tmp);
+			error = xfs_inobt_lookup(cur, agino,
+						 XFS_LOOKUP_GE, &tmp);
 			cond_resched();
 		}
 		/*
 		 * If ran off the end of the ag either with an error,
 		 * or the normal way, set end and stop collecting.
 		 */
-		if (error ||
-		    (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
-			    &gfree, &i)) ||
-		    i == 0) {
+		if (error) {
 			end_of_ag = 1;
 			break;
 		}
+
+		error = xfs_inobt_get_rec(cur, &r, &i);
+		if (error || i == 0) {
+			end_of_ag = 1;
+			break;
+		}
+
 		/*
 		 * If this chunk has any allocated inodes, save it.
 		 * Also start read-ahead now for this chunk.
 		 */
-		if (gcnt < XFS_INODES_PER_CHUNK) {
+		if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
 			/*
 			 * Loop over all clusters in the next chunk.
 			 * Do a readahead if there are any allocated
 			 * inodes in that cluster.
 			 */
-			for (agbno = XFS_AGINO_TO_AGBNO(mp, gino),
-			     chunkidx = 0;
+			agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
+			for (chunkidx = 0;
 			     chunkidx < XFS_INODES_PER_CHUNK;
 			     chunkidx += nicluster,
 			     agbno += nbcluster) {
-				if (xfs_inobt_maskn(chunkidx,
-					nicluster) & ~gfree)
+				if (xfs_inobt_maskn(chunkidx, nicluster)
+						& ~r.ir_free)
 					xfs_btree_reada_bufs(mp, agno,
 						agbno, nbcluster);
 			}
-			irbp->ir_startino = gino;
-			irbp->ir_freecount = gcnt;
-			irbp->ir_free = gfree;
+			irbp->ir_startino = r.ir_startino;
+			irbp->ir_freecount = r.ir_freecount;
+			irbp->ir_free = r.ir_free;
 			irbp++;
-			icount += XFS_INODES_PER_CHUNK - gcnt;
+			icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
 		}
 		/*
 		 * Set agino to after this chunk and bump the cursor.
 		 */
-		agino = gino + XFS_INODES_PER_CHUNK;
+		agino = r.ir_startino + XFS_INODES_PER_CHUNK;
 		error = xfs_btree_increment(cur, 0, &tmp);
 		cond_resched();
 	}
@@ -820,9 +826,7 @@ xfs_inumbers(
 	int		bufidx;
 	xfs_btree_cur_t	*cur;
 	int		error;
-	__int32_t	gcnt;
-	xfs_inofree_t	gfree;
-	xfs_agino_t	gino;
+	xfs_inobt_rec_incore_t r;
 	int		i;
 	xfs_ino_t	ino;
 	int		left;
@@ -855,7 +859,8 @@ xfs_inumbers(
 			continue;
 		}
 		cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
-		error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
+		error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
+					 &tmp);
 		if (error) {
 			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 			cur = NULL;
@@ -870,9 +875,8 @@ xfs_inumbers(
 				continue;
 			}
 		}
-		if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree,
-			&i)) ||
-		    i == 0) {
+		error = xfs_inobt_get_rec(cur, &r, &i);
+		if (error || i == 0) {
 			xfs_buf_relse(agbp);
 			agbp = NULL;
 			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -881,10 +885,12 @@ xfs_inumbers(
 			agino = 0;
 			continue;
 		}
-		agino = gino + XFS_INODES_PER_CHUNK - 1;
-		buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino);
-		buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt;
-		buffer[bufidx].xi_allocmask = ~gfree;
+		agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
+		buffer[bufidx].xi_startino =
+			XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
+		buffer[bufidx].xi_alloccount =
+			XFS_INODES_PER_CHUNK - r.ir_freecount;
+		buffer[bufidx].xi_allocmask = ~r.ir_free;
 		bufidx++;
 		left--;
 		if (bufidx == bcount) {
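
[The hunks above replace the three discrete out-parameters of xfs_inobt_get_rec() (gino/gcnt/gfree) with a single xfs_inobt_rec_incore_t record, and fold xfs_inobt_lookup_ge() into xfs_inobt_lookup() with an explicit XFS_LOOKUP_GE direction. A minimal sketch of the caller pattern the patch converges on, assuming only the kernel-internal types and signatures visible in this diff; the process_chunk() consumer is hypothetical:]

	xfs_inobt_rec_incore_t	rec;
	int			stat;	/* 1 if the cursor points at a record */
	int			error;

	/* Position the cursor at the first inode chunk at or after agino. */
	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, &stat);
	if (error || !stat)
		return error;		/* error, or no chunk found */

	/* Pull the whole record out in one call instead of three out-args. */
	error = xfs_inobt_get_rec(cur, &rec, &stat);
	if (error || !stat)
		return error;

	/* rec.ir_startino, rec.ir_freecount and rec.ir_free travel together. */
	process_chunk(rec.ir_startino, rec.ir_freecount, rec.ir_free);
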
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 1fb04e7deb61..20792bf45946 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -99,11 +99,6 @@ xfs_bulkstat_one(
 	void		*dibuff,
 	int		*stat);
 
-int
-xfs_internal_inum(
-	xfs_mount_t	*mp,
-	xfs_ino_t	ino);
-
 typedef int (*inumbers_fmt_pf)(
 	void __user *ubuffer, /* buffer to write to */
 	const xfs_inogrp_t *buffer, /* buffer to read from */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index bcad5f4c1fd1..679c7c4926a2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -451,8 +451,6 @@ extern int xlog_find_tail(xlog_t *log,
 extern int xlog_recover(xlog_t *log);
 extern int xlog_recover_finish(xlog_t *log);
 extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
-extern void xlog_recover_process_iunlinks(xlog_t *log);
-
 extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
 extern void xlog_put_bp(struct xfs_buf *);
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 47da2fb45377..1099395d7d6c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3263,7 +3263,7 @@ xlog_recover_process_one_iunlink(
 	 * freeing of the inode and its removal from the list must be
 	 * atomic.
 	 */
-void
+STATIC void
 xlog_recover_process_iunlinks(
 	xlog_t		*log)
 {
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5c6f092659c1..8b6c9e807efb 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1568,7 +1568,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
  *
  * The m_sb_lock must be held when this routine is called.
  */
-int
+STATIC int
 xfs_mod_incore_sb_unlocked(
 	xfs_mount_t	*mp,
 	xfs_sb_field_t	field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a5122382afde..a6c023bc0fb2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -414,13 +414,10 @@ typedef struct xfs_mod_sb {
 
 extern int xfs_log_sbcount(xfs_mount_t *, uint);
 extern int xfs_mountfs(xfs_mount_t *mp);
-extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
 
 extern void xfs_unmountfs(xfs_mount_t *);
 extern int xfs_unmountfs_writesb(xfs_mount_t *);
 extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
-extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
-			int64_t, int);
 extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
 			uint, int);
 extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index afee7eb24323..4b0613d99faa 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -564,35 +564,6 @@ xfs_mru_cache_lookup(
 }
 
 /*
- * To look up an element using its key, but leave its location in the internal
- * lists alone, call xfs_mru_cache_peek().  If the element isn't found, this
- * function returns NULL.
- *
- * See the comments above the declaration of the xfs_mru_cache_lookup() function
- * for important locking information pertaining to this call.
- */
-void *
-xfs_mru_cache_peek(
-	xfs_mru_cache_t	*mru,
-	unsigned long	key)
-{
-	xfs_mru_cache_elem_t *elem;
-
-	ASSERT(mru && mru->lists);
-	if (!mru || !mru->lists)
-		return NULL;
-
-	spin_lock(&mru->lock);
-	elem = radix_tree_lookup(&mru->store, key);
-	if (!elem)
-		spin_unlock(&mru->lock);
-	else
-		__release(mru_lock); /* help sparse not be stupid */
-
-	return elem ? elem->value : NULL;
-}
-
-/*
  * To release the internal data structure spinlock after having performed an
  * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
  * with the data store pointer.
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index dd58ea1bbebe..5d439f34b0c9 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -49,7 +49,6 @@ int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
 void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
 void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
 void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
-void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key);
 void xfs_mru_cache_done(struct xfs_mru_cache *mru);
 
 #endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index fea68615ed23..3f816ad7ff19 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -88,90 +88,6 @@ xfs_write_clear_setuid(
 }
 
 /*
- * Handle logging requirements of various synchronous types of write.
- */
-int
-xfs_write_sync_logforce(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip)
-{
-	int		error = 0;
-
-	/*
-	 * If we're treating this as O_DSYNC and we have not updated the
-	 * size, force the log.
-	 */
-	if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
-	    !(ip->i_update_size)) {
-		xfs_inode_log_item_t	*iip = ip->i_itemp;
-
-		/*
-		 * If an allocation transaction occurred
-		 * without extending the size, then we have to force
-		 * the log up the proper point to ensure that the
-		 * allocation is permanent.  We can't count on
-		 * the fact that buffered writes lock out direct I/O
-		 * writes - the direct I/O write could have extended
-		 * the size nontransactionally, then finished before
-		 * we started.  xfs_write_file will think that the file
-		 * didn't grow but the update isn't safe unless the
-		 * size change is logged.
-		 *
-		 * Force the log if we've committed a transaction
-		 * against the inode or if someone else has and
-		 * the commit record hasn't gone to disk (e.g.
-		 * the inode is pinned).  This guarantees that
-		 * all changes affecting the inode are permanent
-		 * when we return.
-		 */
-		if (iip && iip->ili_last_lsn) {
-			error = _xfs_log_force(mp, iip->ili_last_lsn,
-					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
-		} else if (xfs_ipincount(ip) > 0) {
-			error = _xfs_log_force(mp, (xfs_lsn_t)0,
-					XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
-		}
-
-	} else {
-		xfs_trans_t	*tp;
-
-		/*
-		 * O_SYNC or O_DSYNC _with_ a size update are handled
-		 * the same way.
-		 *
-		 * If the write was synchronous then we need to make
-		 * sure that the inode modification time is permanent.
-		 * We'll have updated the timestamp above, so here
-		 * we use a synchronous transaction to log the inode.
-		 * It's not fast, but it's necessary.
-		 *
-		 * If this a dsync write and the size got changed
-		 * non-transactionally, then we need to ensure that
-		 * the size change gets logged in a synchronous
-		 * transaction.
-		 */
-		tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
-		if ((error = xfs_trans_reserve(tp, 0,
-				XFS_SWRITE_LOG_RES(mp),
-				0, 0, 0))) {
-			/* Transaction reserve failed */
-			xfs_trans_cancel(tp, 0);
-		} else {
-			/* Transaction reserve successful */
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-			xfs_trans_ihold(tp, ip);
-			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-			xfs_trans_set_sync(tp);
-			error = xfs_trans_commit(tp, 0);
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		}
-	}
-
-	return error;
-}
-
-/*
  * Force a shutdown of the filesystem instantly while keeping
  * the filesystem consistent. We don't do an unmount here; just shutdown
  * the shop, make sure that absolutely nothing persistent happens to
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f76c003ec55d..f5e4874c37d8 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -68,7 +68,6 @@ xfs_get_extsz_hint(
  * Prototypes for functions in xfs_rw.c.
  */
 extern int xfs_write_clear_setuid(struct xfs_inode *ip);
-extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
 extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
 extern int xfs_bioerror(struct xfs_buf *bp);
 extern int xfs_bioerror_relse(struct xfs_buf *bp);
@@ -78,10 +77,4 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
 extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
 			xfs_buf_t *bp, xfs_daddr_t blkno);
 
-/*
- * Prototypes for functions in xfs_vnodeops.c.
- */
-extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
-			int flags);
-
 #endif	/* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 775249a54f6f..ed47fc77759c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -68,7 +68,7 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_GROWFS		14
 #define XFS_TRANS_STRAT_WRITE		15
 #define XFS_TRANS_DIOSTRAT		16
-#define XFS_TRANS_WRITE_SYNC		17
+/* 17 was XFS_TRANS_WRITE_SYNC */
 #define XFS_TRANS_WRITEID		18
 #define XFS_TRANS_ADDAFORK		19
 #define XFS_TRANS_ATTRINVAL		20
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8ee2f8c8b0a6..218829e6a152 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -307,7 +307,7 @@ xfs_trans_read_buf(
 		return (flags & XFS_BUF_TRYLOCK) ?
 					EAGAIN : XFS_ERROR(ENOMEM);
 
-	if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
+	if (XFS_BUF_GETERROR(bp) != 0) {
 		xfs_ioerror_alert("xfs_trans_read_buf", mp,
 				  bp, blkno);
 		error = XFS_BUF_GETERROR(bp);
@@ -315,7 +315,7 @@ xfs_trans_read_buf(
 		return error;
 	}
 #ifdef DEBUG
-	if (xfs_do_error && (bp != NULL)) {
+	if (xfs_do_error) {
 		if (xfs_error_target == target) {
 			if (((xfs_req_num++) % xfs_error_mod) == 0) {
 				xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 23d276af2e0c..785ff101da0a 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -49,30 +49,7 @@ xfs_trans_inode_broot_debug(
 
 
 /*
- * Get and lock the inode for the caller if it is not already
- * locked within the given transaction.  If it is already locked
- * within the transaction, just increment its lock recursion count
- * and return a pointer to it.
- *
- * For an inode to be locked in a transaction, the inode lock, as
- * opposed to the io lock, must be taken exclusively.  This ensures
- * that the inode can be involved in only 1 transaction at a time.
- * Lock recursion is handled on the io lock, but only for lock modes
- * of equal or lesser strength.  That is, you can recur on the io lock
- * held EXCL with a SHARED request but not vice versa.  Also, if
- * the inode is already a part of the transaction then you cannot
- * go from not holding the io lock to having it EXCL or SHARED.
- *
- * Use the inode cache routine xfs_inode_incore() to find the inode
- * if it is already owned by this transaction.
- *
- * If we don't already own the inode, use xfs_iget() to get it.
- * Since the inode log item structure is embedded in the incore
- * inode structure and is initialized when the inode is brought
- * into memory, there is nothing to do with it here.
- *
- * If the given transaction pointer is NULL, just call xfs_iget().
- * This simplifies code which must handle both cases.
+ * Get an inode and join it to the transaction.
  */
 int
 xfs_trans_iget(
@@ -84,62 +61,11 @@ xfs_trans_iget(
 	xfs_inode_t	**ipp)
 {
 	int		error;
-	xfs_inode_t	*ip;
-
-	/*
-	 * If the transaction pointer is NULL, just call the normal
-	 * xfs_iget().
-	 */
-	if (tp == NULL)
-		return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0);
-
-	/*
-	 * If we find the inode in core with this transaction
-	 * pointer in its i_transp field, then we know we already
-	 * have it locked.  In this case we just increment the lock
-	 * recursion count and return the inode to the caller.
-	 * Assert that the inode is already locked in the mode requested
-	 * by the caller.  We cannot do lock promotions yet, so
-	 * die if someone gets this wrong.
-	 */
-	if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) {
-		/*
-		 * Make sure that the inode lock is held EXCL and
-		 * that the io lock is never upgraded when the inode
-		 * is already a part of the transaction.
-		 */
-		ASSERT(ip->i_itemp != NULL);
-		ASSERT(lock_flags & XFS_ILOCK_EXCL);
-		ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
-		       xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
-		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL));
-		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
-		       xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));
-		ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
-		       (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY));
-
-		if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
-			ip->i_itemp->ili_iolock_recur++;
-		}
-		if (lock_flags & XFS_ILOCK_EXCL) {
-			ip->i_itemp->ili_ilock_recur++;
-		}
-		*ipp = ip;
-		return 0;
-	}
-
-	ASSERT(lock_flags & XFS_ILOCK_EXCL);
-	error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0);
-	if (error) {
-		return error;
-	}
-	ASSERT(ip != NULL);
 
-	xfs_trans_ijoin(tp, ip, lock_flags);
-	*ipp = ip;
-	return 0;
+	error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0);
+	if (!error && tp)
+		xfs_trans_ijoin(tp, *ipp, lock_flags);
+	return error;
 }
 
 /*
@@ -163,8 +89,6 @@ xfs_trans_ijoin(
 		xfs_inode_item_init(ip, ip->i_mount);
 	iip = ip->i_itemp;
 	ASSERT(iip->ili_flags == 0);
-	ASSERT(iip->ili_ilock_recur == 0);
-	ASSERT(iip->ili_iolock_recur == 0);
 
 	/*
 	 * Get a log_item_desc to point at the new item.
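
[With the lock-recursion bookkeeping deleted above, xfs_trans_iget() reduces to xfs_iget() plus an optional xfs_trans_ijoin(), and a NULL transaction pointer degenerates to a bare xfs_iget(). A sketch of a call site under that contract, assuming the kernel-internal types seen in this diff; the flag values are illustrative, not taken from the patch:]

	xfs_inode_t	*ip;
	int		error;

	/* Grab the inode locked; it is joined to tp when tp is non-NULL. */
	error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;
	/* ip is held ILOCK_EXCL here; with tp == NULL no transaction is involved. */
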
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 492d75bae2bf..a434f287962d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -611,7 +611,7 @@ xfs_fsync(
 	xfs_inode_t	*ip)
 {
 	xfs_trans_t	*tp;
-	int		error;
+	int		error = 0;
 	int		log_flushed = 0, changed = 1;
 
 	xfs_itrace_entry(ip);
@@ -619,14 +619,9 @@ xfs_fsync(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return XFS_ERROR(EIO);
 
-	/* capture size updates in I/O completion before writing the inode. */
-	error = xfs_wait_on_pages(ip, 0, -1);
-	if (error)
-		return XFS_ERROR(error);
-
 	/*
 	 * We always need to make sure that the required inode state is safe on
-	 * disk.  The vnode might be clean but we still might need to force the
+	 * disk.  The inode might be clean but we still might need to force the
 	 * log because of committed transactions that haven't hit the disk yet.
 	 * Likewise, there could be unflushed non-transactional changes to the
 	 * inode core that have to go to disk and this requires us to issue
@@ -638,7 +633,7 @@ xfs_fsync(
 	 */
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 
-	if (!(ip->i_update_size || ip->i_update_core)) {
+	if (!ip->i_update_core) {
 		/*
 		 * Timestamps/size haven't changed since last inode flush or
 		 * inode transaction commit.  That means either nothing got
@@ -718,7 +713,7 @@ xfs_fsync(
  * when the link count isn't zero and by xfs_dm_punch_hole() when
  * punching a hole to EOF.
  */
-int
+STATIC int
 xfs_free_eofblocks(
 	xfs_mount_t	*mp,
 	xfs_inode_t	*ip,
@@ -1476,8 +1471,8 @@ xfs_create(
 	if (error == ENOSPC) {
 		/* flush outstanding delalloc blocks and retry */
 		xfs_flush_inodes(dp);
-		error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
-			XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
+		error = xfs_trans_reserve(tp, resblks, log_res, 0,
+			XFS_TRANS_PERM_LOG_RES, log_count);
 	}
 	if (error == ENOSPC) {
 		/* No space at all so try a "no-allocation" reservation */
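
[This last hunk makes the ENOSPC retry reuse the log_res/log_count values chosen earlier in xfs_create() rather than re-expanding the XFS_CREATE_* macros, so the retry matches the original reservation by construction. Condensed, the retry ladder reads roughly as follows; the resblks setup, the second reservation's exact arguments, and the error label are assumed from surrounding context, not shown in this diff:]

	error = xfs_trans_reserve(tp, resblks, log_res, 0,
			XFS_TRANS_PERM_LOG_RES, log_count);
	if (error == ENOSPC) {
		/* flush outstanding delalloc blocks and retry */
		xfs_flush_inodes(dp);
		error = xfs_trans_reserve(tp, resblks, log_res, 0,
				XFS_TRANS_PERM_LOG_RES, log_count);
	}
	if (error == ENOSPC) {
		/* no space at all: retry without a block reservation */
		resblks = 0;
		error = xfs_trans_reserve(tp, 0, log_res, 0,
				XFS_TRANS_PERM_LOG_RES, log_count);
	}
	if (error)
		goto out_trans_cancel;	/* label assumed, not from this diff */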