author     Jiri Kosina <jkosina@suse.cz>    2010-08-10 07:22:08 -0400
committer  Jiri Kosina <jkosina@suse.cz>    2010-08-10 07:22:08 -0400
commit     fb8231a8b139035476f2a8aaac837d0099b66dad (patch)
tree       2875806beb96ea0cdab292146767a5085721dc6a /fs
parent     426d31071ac476ea62c62656b242930c17b58c00 (diff)
parent     f6cec0ae58c17522a7bc4e2f39dae19f199ab534 (diff)

Merge branch 'master' into for-next

Conflicts:
        arch/arm/mach-omap1/board-nokia770.c
Diffstat (limited to 'fs')
-rw-r--r--  fs/afs/Kconfig | 1
-rw-r--r--  fs/afs/cell.c | 40
-rw-r--r--  fs/afs/main.c | 9
-rw-r--r--  fs/aio.c | 21
-rw-r--r--  fs/cachefiles/namei.c | 13
-rw-r--r--  fs/cachefiles/rdwr.c | 4
-rw-r--r--  fs/char_dev.c | 1
-rw-r--r--  fs/cifs/Kconfig | 18
-rw-r--r--  fs/cifs/README | 5
-rw-r--r--  fs/cifs/cifs_debug.c | 25
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 5
-rw-r--r--  fs/cifs/cifs_spnego.c | 4
-rw-r--r--  fs/cifs/cifsfs.c | 18
-rw-r--r--  fs/cifs/cifsglob.h | 8
-rw-r--r--  fs/cifs/cifsproto.h | 4
-rw-r--r--  fs/cifs/connect.c | 1
-rw-r--r--  fs/cifs/dir.c | 2
-rw-r--r--  fs/cifs/dns_resolve.c | 231
-rw-r--r--  fs/cifs/dns_resolve.h | 2
-rw-r--r--  fs/cifs/file.c | 30
-rw-r--r--  fs/cifs/inode.c | 43
-rw-r--r--  fs/cifs/misc.c | 20
-rw-r--r--  fs/cifs/netmisc.c | 45
-rw-r--r--  fs/compat.c | 10
-rw-r--r--  fs/dlm/lowcomms.c | 2
-rw-r--r--  fs/dlm/netlink.c | 15
-rw-r--r--  fs/exec.c | 8
-rw-r--r--  fs/ext3/Kconfig | 1
-rw-r--r--  fs/ext3/inode.c | 80
-rw-r--r--  fs/ext3/namei.c | 3
-rw-r--r--  fs/ext3/resize.c | 2
-rw-r--r--  fs/ext3/super.c | 17
-rw-r--r--  fs/ext4/acl.c | 1
-rw-r--r--  fs/ext4/balloc.c | 6
-rw-r--r--  fs/ext4/block_validity.c | 8
-rw-r--r--  fs/ext4/dir.c | 23
-rw-r--r--  fs/ext4/ext4.h | 152
-rw-r--r--  fs/ext4/ext4_jbd2.c | 71
-rw-r--r--  fs/ext4/ext4_jbd2.h | 56
-rw-r--r--  fs/ext4/extents.c | 18
-rw-r--r--  fs/ext4/file.c | 5
-rw-r--r--  fs/ext4/ialloc.c | 4
-rw-r--r--  fs/ext4/inode.c | 205
-rw-r--r--  fs/ext4/mballoc.c | 153
-rw-r--r--  fs/ext4/migrate.c | 2
-rw-r--r--  fs/ext4/move_extent.c | 10
-rw-r--r--  fs/ext4/namei.c | 40
-rw-r--r--  fs/ext4/resize.c | 8
-rw-r--r--  fs/ext4/super.c | 334
-rw-r--r--  fs/ext4/xattr.c | 3
-rw-r--r--  fs/file.c | 3
-rw-r--r--  fs/fs-writeback.c | 5
-rw-r--r--  fs/fscache/Kconfig | 1
-rw-r--r--  fs/fscache/internal.h | 8
-rw-r--r--  fs/fscache/main.c | 106
-rw-r--r--  fs/fscache/object-list.c | 11
-rw-r--r--  fs/fscache/object.c | 106
-rw-r--r--  fs/fscache/operation.c | 67
-rw-r--r--  fs/fscache/page.c | 36
-rw-r--r--  fs/fuse/dev.c | 229
-rw-r--r--  fs/fuse/file.c | 2
-rw-r--r--  fs/fuse/fuse_i.h | 3
-rw-r--r--  fs/gfs2/Kconfig | 1
-rw-r--r--  fs/gfs2/incore.h | 3
-rw-r--r--  fs/gfs2/main.c | 14
-rw-r--r--  fs/gfs2/ops_fstype.c | 8
-rw-r--r--  fs/gfs2/quota.c | 10
-rw-r--r--  fs/gfs2/recovery.c | 54
-rw-r--r--  fs/gfs2/recovery.h | 6
-rw-r--r--  fs/gfs2/sys.c | 3
-rw-r--r--  fs/jbd/journal.c | 7
-rw-r--r--  fs/jbd/recovery.c | 11
-rw-r--r--  fs/jbd2/checkpoint.c | 18
-rw-r--r--  fs/jbd2/commit.c | 50
-rw-r--r--  fs/jbd2/journal.c | 121
-rw-r--r--  fs/jbd2/recovery.c | 10
-rw-r--r--  fs/jbd2/transaction.c | 233
-rw-r--r--  fs/nfs/Kconfig | 10
-rw-r--r--  fs/nfs/callback_proc.c | 19
-rw-r--r--  fs/nfs/client.c | 21
-rw-r--r--  fs/nfs/delegation.c | 16
-rw-r--r--  fs/nfs/delegation.h | 4
-rw-r--r--  fs/nfs/dir.c | 9
-rw-r--r--  fs/nfs/direct.c | 29
-rw-r--r--  fs/nfs/file.c | 51
-rw-r--r--  fs/nfs/inode.c | 74
-rw-r--r--  fs/nfs/internal.h | 7
-rw-r--r--  fs/nfs/nfs2xdr.c | 7
-rw-r--r--  fs/nfs/nfs3xdr.c | 8
-rw-r--r--  fs/nfs/nfs4_fs.h | 57
-rw-r--r--  fs/nfs/nfs4proc.c | 474
-rw-r--r--  fs/nfs/nfs4renewd.c | 4
-rw-r--r--  fs/nfs/nfs4state.c | 82
-rw-r--r--  fs/nfs/nfs4xdr.c | 107
-rw-r--r--  fs/nfs/pagelist.c | 8
-rw-r--r--  fs/nfs/read.c | 3
-rw-r--r--  fs/nfs/super.c | 4
-rw-r--r--  fs/nfs/unlink.c | 2
-rw-r--r--  fs/nfs/write.c | 9
-rw-r--r--  fs/nfsd/nfs3proc.c | 8
-rw-r--r--  fs/nfsd/nfs4callback.c | 57
-rw-r--r--  fs/nfsd/nfs4state.c | 381
-rw-r--r--  fs/nfsd/nfs4xdr.c | 3
-rw-r--r--  fs/nfsd/nfsctl.c | 24
-rw-r--r--  fs/nfsd/nfsd.h | 1
-rw-r--r--  fs/nfsd/nfsproc.c | 4
-rw-r--r--  fs/nfsd/nfssvc.c | 151
-rw-r--r--  fs/nfsd/state.h | 40
-rw-r--r--  fs/nfsd/vfs.c | 79
-rw-r--r--  fs/nfsd/vfs.h | 4
-rw-r--r--  fs/nilfs2/bmap.c | 6
-rw-r--r--  fs/nilfs2/bmap.h | 16
-rw-r--r--  fs/nilfs2/bmap_union.h | 42
-rw-r--r--  fs/nilfs2/btnode.c | 23
-rw-r--r--  fs/nilfs2/btnode.h | 4
-rw-r--r--  fs/nilfs2/btree.c | 914
-rw-r--r--  fs/nilfs2/btree.h | 12
-rw-r--r--  fs/nilfs2/dir.c | 33
-rw-r--r--  fs/nilfs2/direct.c | 96
-rw-r--r--  fs/nilfs2/direct.h | 11
-rw-r--r--  fs/nilfs2/gcinode.c | 17
-rw-r--r--  fs/nilfs2/mdt.c | 1
-rw-r--r--  fs/nilfs2/nilfs.h | 22
-rw-r--r--  fs/nilfs2/page.c | 5
-rw-r--r--  fs/nilfs2/page.h | 2
-rw-r--r--  fs/nilfs2/recovery.c | 348
-rw-r--r--  fs/nilfs2/segbuf.h | 24
-rw-r--r--  fs/nilfs2/segment.c | 19
-rw-r--r--  fs/nilfs2/segment.h | 10
-rw-r--r--  fs/nilfs2/super.c | 333
-rw-r--r--  fs/nilfs2/the_nilfs.c | 161
-rw-r--r--  fs/nilfs2/the_nilfs.h | 23
-rw-r--r--  fs/ocfs2/journal.c | 4
-rw-r--r--  fs/partitions/check.c | 1
-rw-r--r--  fs/proc/base.c | 104
-rw-r--r--  fs/quota/dquot.c | 79
-rw-r--r--  fs/quota/quota_tree.c | 85
-rw-r--r--  fs/quota/quota_tree.h | 6
-rw-r--r--  fs/quota/quota_v1.c | 3
-rw-r--r--  fs/quota/quota_v2.c | 11
-rw-r--r--  fs/readdir.c | 8
-rw-r--r--  fs/sysfs/file.c | 3
-rw-r--r--  fs/udf/file.c | 1
-rw-r--r--  fs/udf/super.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 10
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 32
146 files changed, 4203 insertions(+), 2938 deletions(-)
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index 5c4e61d3c772..8f975f25b486 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -2,6 +2,7 @@ config AFS_FS
 	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
 	depends on INET && EXPERIMENTAL
 	select AF_RXRPC
+	select DNS_RESOLVER
 	help
 	  If you say Y here, you will get an experimental Andrew File System
 	  driver. It currently only supports unsecured read-only AFS access.
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index e19c13f059ed..ffea35c63879 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/key.h>
 #include <linux/ctype.h>
+#include <linux/dns_resolver.h>
 #include <linux/sched.h>
 #include <keys/rxrpc-type.h>
 #include "internal.h"
@@ -36,6 +37,8 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
 	struct key *key;
 	size_t namelen;
 	char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
+	char *dvllist = NULL, *_vllist = NULL;
+	char delimiter = ':';
 	int ret;
 
 	_enter("%s,%s", name, vllist);
@@ -43,8 +46,10 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
 	BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
 
 	namelen = strlen(name);
-	if (namelen > AFS_MAXCELLNAME)
+	if (namelen > AFS_MAXCELLNAME) {
+		_leave(" = -ENAMETOOLONG");
 		return ERR_PTR(-ENAMETOOLONG);
+	}
 
 	/* allocate and initialise a cell record */
 	cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL);
@@ -64,15 +69,31 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
 	INIT_LIST_HEAD(&cell->vl_list);
 	spin_lock_init(&cell->vl_lock);
 
+	/* if the ip address is invalid, try dns query */
+	if (!vllist || strlen(vllist) < 7) {
+		ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL);
+		if (ret < 0) {
+			_leave(" = %d", ret);
+			return ERR_PTR(ret);
+		}
+		_vllist = dvllist;
+
+		/* change the delimiter for user-space reply */
+		delimiter = ',';
+
+	} else {
+		_vllist = vllist;
+	}
+
 	/* fill in the VL server list from the rest of the string */
 	do {
 		unsigned a, b, c, d;
 
-		next = strchr(vllist, ':');
+		next = strchr(_vllist, delimiter);
 		if (next)
 			*next++ = 0;
 
-		if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
+		if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
 			goto bad_address;
 
 		if (a > 255 || b > 255 || c > 255 || d > 255)
@@ -81,7 +102,7 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
 		cell->vl_addrs[cell->vl_naddrs++].s_addr =
 			htonl((a << 24) | (b << 16) | (c << 8) | d);
 
-	} while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (vllist = next));
+	} while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next));
 
 	/* create a key to represent an anonymous user */
 	memcpy(keyname, "afs@", 4);
@@ -110,6 +131,7 @@ bad_address:
 	ret = -EINVAL;
 error:
 	key_put(cell->anonymous_key);
+	kfree(dvllist);
 	kfree(cell);
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
@@ -201,14 +223,12 @@ int afs_cell_init(char *rootcell)
 	}
 
 	cp = strchr(rootcell, ':');
-	if (!cp) {
-		printk(KERN_ERR "kAFS: no VL server IP addresses specified\n");
-		_leave(" = -EINVAL");
-		return -EINVAL;
-	}
+	if (!cp)
+		_debug("kAFS: no VL server IP addresses specified");
+	else
+		*cp++ = 0;
 
 	/* allocate a cell record for the root cell */
-	*cp++ = 0;
 	new_root = afs_cell_create(rootcell, cp);
 	if (IS_ERR(new_root)) {
 		_leave(" = %ld", PTR_ERR(new_root));
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 66d54d348c55..cfd1cbe25b22 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -111,6 +111,8 @@ static int __init afs_init(void)
 
 	/* initialise the callback update process */
 	ret = afs_callback_update_init();
+	if (ret < 0)
+		goto error_callback_update_init;
 
 	/* create the RxRPC transport */
 	ret = afs_open_socket();
@@ -127,15 +129,16 @@ static int __init afs_init(void)
 error_fs:
 	afs_close_socket();
 error_open_socket:
+	afs_callback_update_kill();
+error_callback_update_init:
+	afs_vlocation_purge();
 error_vl_update_init:
+	afs_cell_purge();
 error_cell_init:
 #ifdef CONFIG_AFS_FSCACHE
 	fscache_unregister_netfs(&afs_cache_netfs);
 error_cache:
 #endif
-	afs_callback_update_kill();
-	afs_vlocation_purge();
-	afs_cell_purge();
 	afs_proc_cleanup();
 	rcu_barrier();
 	printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
diff --git a/fs/aio.c b/fs/aio.c
index 1ccf25cef1f0..3006b5bc33d6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1277,7 +1277,7 @@ out:
 /* sys_io_destroy:
  *	Destroy the aio_context specified.  May cancel any outstanding
  *	AIOs and block on completion.  Will fail with -ENOSYS if not
- *	implemented.  May fail with -EFAULT if the context pointed to
+ *	implemented.  May fail with -EINVAL if the context pointed to
  *	is invalid.
  */
 SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
@@ -1795,15 +1795,16 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 
 /* io_getevents:
  *	Attempts to read at least min_nr events and up to nr events from
- *	the completion queue for the aio_context specified by ctx_id. May
- *	fail with -EINVAL if ctx_id is invalid, if min_nr is out of range,
- *	if nr is out of range, if when is out of range. May fail with
- *	-EFAULT if any of the memory specified to is invalid. May return
- *	0 or < min_nr if no events are available and the timeout specified
- *	by when has elapsed, where when == NULL specifies an infinite
- *	timeout. Note that the timeout pointed to by when is relative and
- *	will be updated if not NULL and the operation blocks. Will fail
- *	with -ENOSYS if not implemented.
+ *	the completion queue for the aio_context specified by ctx_id. If
+ *	it succeeds, the number of read events is returned. May fail with
+ *	-EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
+ *	out of range, if timeout is out of range. May fail with -EFAULT
+ *	if any of the memory specified is invalid. May return 0 or
+ *	< min_nr if the timeout specified by timeout has elapsed
+ *	before sufficient events are available, where timeout == NULL
+ *	specifies an infinite timeout. Note that the timeout pointed to by
+ *	timeout is relative and will be updated if not NULL and the
+ *	operation blocks. Will fail with -ENOSYS if not implemented.
  */
 SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 		long, min_nr,
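
As an aside on the comment rewrite above: a minimal userspace sketch (not part of the patch) of the documented io_getevents() behaviour — a positive return is the number of events read, and a NULL timeout means wait forever. Raw syscalls are used because glibc has no wrapper for the native AIO interface; the zero-timeout probe is purely illustrative.

#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        aio_context_t ctx = 0;
        struct io_event events[8];
        struct timespec ts = { 0, 0 };  /* zero (not NULL) timeout: poll */
        long n;

        /* create an AIO context able to hold up to 128 in-flight requests */
        if (syscall(__NR_io_setup, 128, &ctx) < 0)
                return 1;

        /* nothing submitted yet, so this returns 0 rather than blocking */
        n = syscall(__NR_io_getevents, ctx, 0, 8, events, &ts);
        printf("io_getevents returned %ld\n", n);

        syscall(__NR_io_destroy, ctx);
        return 0;
}
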
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index f4a7840bf42c..42c7fafc8bfe 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -37,9 +37,9 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
 
 	printk(KERN_ERR "%sobject: OBJ%x\n",
 	       prefix, object->fscache.debug_id);
-	printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n",
+	printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
 	       prefix, fscache_object_states[object->fscache.state],
-	       object->fscache.flags, object->fscache.work.flags,
+	       object->fscache.flags, work_busy(&object->fscache.work),
 	       object->fscache.events,
 	       object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
 	printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
@@ -212,7 +212,7 @@ wait_for_old_object:
 
 	/* if the object we're waiting for is queued for processing,
 	 * then just put ourselves on the queue behind it */
-	if (slow_work_is_queued(&xobject->fscache.work)) {
+	if (work_pending(&xobject->fscache.work)) {
 		_debug("queue OBJ%x behind OBJ%x immediately",
 		       object->fscache.debug_id,
 		       xobject->fscache.debug_id);
@@ -220,8 +220,7 @@ wait_for_old_object:
 	}
 
 	/* otherwise we sleep until either the object we're waiting for
-	 * is done, or the slow-work facility wants the thread back to
-	 * do other work */
+	 * is done, or the fscache_object is congested */
 	wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
 	init_wait(&wait);
 	requeue = false;
@@ -229,8 +228,8 @@ wait_for_old_object:
 		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
 		if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
 			break;
-		requeue = slow_work_sleep_till_thread_needed(
-			&object->fscache.work, &timeout);
+
+		requeue = fscache_object_sleep_till_congested(&timeout);
 	} while (timeout > 0 && !requeue);
 	finish_wait(wq, &wait);
 
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 0f0d41fbb03f..0e3c0924cc3a 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -422,7 +422,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 	shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
 
 	op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
-	op->op.flags |= FSCACHE_OP_FAST;
+	op->op.flags |= FSCACHE_OP_ASYNC;
 	op->op.processor = cachefiles_read_copier;
 
 	pagevec_init(&pagevec, 0);
@@ -729,7 +729,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
 	pagevec_init(&pagevec, 0);
 
 	op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
-	op->op.flags |= FSCACHE_OP_FAST;
+	op->op.flags |= FSCACHE_OP_ASYNC;
 	op->op.processor = cachefiles_read_copier;
 
 	INIT_LIST_HEAD(&backpages);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index d6db933df2b2..f80a4f25123c 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -20,6 +20,7 @@
 #include <linux/cdev.h>
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
+#include <linux/tty.h>
 
 #include "internal.h"
 
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 5739fd7f88b4..917b7d449bb2 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,7 +2,6 @@ config CIFS
 	tristate "CIFS support (advanced network filesystem, SMBFS successor)"
 	depends on INET
 	select NLS
-	select SLOW_WORK
 	help
 	  This is the client VFS module for the Common Internet File System
 	  (CIFS) protocol which is the successor to the Server Message Block
@@ -71,14 +70,14 @@ config CIFS_WEAK_PW_HASH
 	  If unsure, say N.
 
 config CIFS_UPCALL
 	bool "Kerberos/SPNEGO advanced session setup"
 	depends on CIFS && KEYS
-	help
-	  Enables an upcall mechanism for CIFS which accesses
-	  userspace helper utilities to provide SPNEGO packaged (RFC 4178)
-	  Kerberos tickets which are needed to mount to certain secure servers
-	  (for which more secure Kerberos authentication is required). If
-	  unsure, say N.
+	select DNS_RESOLVER
+	help
+	  Enables an upcall mechanism for CIFS which accesses userspace helper
+	  utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets
+	  which are needed to mount to certain secure servers (for which more
+	  secure Kerberos authentication is required). If unsure, say N.
 
 config CIFS_XATTR
 	bool "CIFS extended attributes"
@@ -122,6 +121,7 @@ config CIFS_DEBUG2
 config CIFS_DFS_UPCALL
 	bool "DFS feature support"
 	depends on CIFS && KEYS
+	select DNS_RESOLVER
 	help
 	  Distributed File System (DFS) support is used to access shares
 	  transparently in an enterprise name space, even if the share
diff --git a/fs/cifs/README b/fs/cifs/README
index a727b7cb075f..a7081eeeb85d 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -568,8 +568,9 @@ module can be displayed via modinfo.
 Misc /proc/fs/cifs Flags and Debug Info
 =======================================
 Informational pseudo-files:
-DebugData		Displays information about active CIFS sessions
-			and shares, as well as the cifs.ko version.
+DebugData		Displays information about active CIFS sessions and
+			shares, features enabled as well as the cifs.ko
+			version.
 Stats			Lists summary resource usage information as well as per
 			share statistics, if CONFIG_CIFS_STATS in enabled
 			in the kernel configuration.
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 4fce6e61b34e..eb1ba493489f 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -119,6 +119,31 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 		   "Display Internal CIFS Data Structures for Debugging\n"
 		   "---------------------------------------------------\n");
 	seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
+	seq_printf(m, "Features: ");
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	seq_printf(m, "dfs");
+	seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_FSCACHE
+	seq_printf(m, "fscache");
+	seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_WEAK_PW_HASH
+	seq_printf(m, "lanman");
+	seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_POSIX
+	seq_printf(m, "posix");
+	seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_UPCALL
+	seq_printf(m, "spnego");
+	seq_putc(m, ' ');
+#endif
+#ifdef CONFIG_CIFS_XATTR
+	seq_printf(m, "xattr");
+#endif
+	seq_putc(m, '\n');
 	seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
 	seq_printf(m, "Servers:");
 
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index dc1ed50ea06e..d6ced7aa23cf 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -141,7 +141,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 	}
 
 	rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
-	if (rc != 0) {
+	if (rc < 0) {
 		cERROR(1, "%s: Failed to resolve server part of %s to IP: %d",
 			  __func__, *devname, rc);
 		goto compose_mount_options_err;
@@ -150,8 +150,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 	 * assuming that we have 'unc=' and 'ip=' in
 	 * the original sb_mountdata
 	 */
-	md_len = strlen(sb_mountdata) + strlen(srvIP) +
-		strlen(ref->node_name) + 12;
+	md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12;
 	mountdata = kzalloc(md_len+1, GFP_KERNEL);
 	if (mountdata == NULL) {
 		rc = -ENOMEM;
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 6effccff85a5..87044906cd1f 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -84,6 +84,9 @@ struct key_type cifs_spnego_key_type = {
 /* strlen of ";uid=0x" */
 #define UID_KEY_LEN		7
 
+/* strlen of ";creduid=0x" */
+#define CREDUID_KEY_LEN		11
+
 /* strlen of ";user=" */
 #define USER_KEY_LEN		6
 
@@ -107,6 +110,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 		   IP_KEY_LEN + INET6_ADDRSTRLEN +
 		   MAX_MECH_STR_LEN +
 		   UID_KEY_LEN + (sizeof(uid_t) * 2) +
+		   CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
 		   USER_KEY_LEN + strlen(sesInfo->userName) +
 		   PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8a2cf129e535..a5ed10c9afef 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -45,7 +45,6 @@
 #include "cifs_fs_sb.h"
 #include <linux/mm.h>
 #include <linux/key-type.h>
-#include "dns_resolve.h"
 #include "cifs_spnego.h"
 #include "fscache.h"
 #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
@@ -934,27 +933,13 @@ init_cifs(void)
 	if (rc)
 		goto out_unregister_filesystem;
 #endif
-#ifdef CONFIG_CIFS_DFS_UPCALL
-	rc = cifs_init_dns_resolver();
-	if (rc)
-		goto out_unregister_key_type;
-#endif
-	rc = slow_work_register_user(THIS_MODULE);
-	if (rc)
-		goto out_unregister_resolver_key;
 
 	return 0;
 
- out_unregister_resolver_key:
-#ifdef CONFIG_CIFS_DFS_UPCALL
-	cifs_exit_dns_resolver();
- out_unregister_key_type:
-#endif
 #ifdef CONFIG_CIFS_UPCALL
-	unregister_key_type(&cifs_spnego_key_type);
  out_unregister_filesystem:
-#endif
 	unregister_filesystem(&cifs_fs_type);
+#endif
  out_destroy_request_bufs:
 	cifs_destroy_request_bufs();
  out_destroy_mids:
@@ -976,7 +961,6 @@ exit_cifs(void)
 	cifs_fscache_unregister();
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	cifs_dfs_release_automount_timer();
-	cifs_exit_dns_resolver();
 #endif
 #ifdef CONFIG_CIFS_UPCALL
 	unregister_key_type(&cifs_spnego_key_type);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 59906146ad36..0cdfb8c32ac6 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -22,7 +22,7 @@
 #include <linux/in.h>
 #include <linux/in6.h>
 #include <linux/slab.h>
-#include <linux/slow-work.h>
+#include <linux/workqueue.h>
 #include "cifs_fs_sb.h"
 #include "cifsacl.h"
 /*
@@ -356,7 +356,7 @@ struct cifsFileInfo {
 	atomic_t count;		/* reference count */
 	struct mutex fh_mutex; /* prevents reopen race after dead ses*/
 	struct cifs_search_info srch_inf;
-	struct slow_work oplock_break; /* slow_work job for oplock breaks */
+	struct work_struct oplock_break; /* work for oplock breaks */
 };
 
 /* Take a reference on the file private data */
@@ -728,6 +728,10 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
 GLOBAL_EXTERN unsigned int cifs_min_small;  /* min size of small buf pool */
 GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
 
+void cifs_oplock_break(struct work_struct *work);
+void cifs_oplock_break_get(struct cifsFileInfo *cfile);
+void cifs_oplock_break_put(struct cifsFileInfo *cfile);
+
 extern const struct slow_work_ops cifs_oplock_break_ops;
 
 #endif	/* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 2eaebbd31132..1f5450814087 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -86,8 +86,8 @@ extern unsigned int smbCalcSize(struct smb_hdr *ptr);
 extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
 extern int decode_negTokenInit(unsigned char *security_blob, int length,
 			       struct TCP_Server_Info *server);
-extern int cifs_convert_address(struct sockaddr *dst, char *src);
-extern int cifs_fill_sockaddr(struct sockaddr *dst, char *src,
+extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
+extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
 			      unsigned short int port);
 extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr);
 extern void header_assemble(struct smb_hdr *, char /* command */ ,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2a43a0aca965..95c2ea67edfb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1543,6 +1543,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	if (volume_info->UNCip && volume_info->UNC) {
 		rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
 					volume_info->UNCip,
+					strlen(volume_info->UNCip),
 					volume_info->port);
 		if (!rc) {
 			/* we failed translating address */
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index a7de5e9fff11..578d88c5b46e 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -157,7 +157,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
 	mutex_init(&pCifsFile->lock_mutex);
 	INIT_LIST_HEAD(&pCifsFile->llist);
 	atomic_set(&pCifsFile->count, 1);
-	slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops);
+	INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break);
 
 	write_lock(&GlobalSMBSeslock);
 	list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 3ad7f4300c45..0eb87026cad3 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -4,6 +4,8 @@
  *   Copyright (c) 2007 Igor Mammedov
  *   Author(s): Igor Mammedov (niallain@gmail.com)
  *              Steve French (sfrench@us.ibm.com)
+ *              Wang Lei (wang840925@gmail.com)
+ *              David Howells (dhowells@redhat.com)
  *
  *   Contains the CIFS DFS upcall routines used for hostname to
  *   IP address translation.
@@ -24,214 +26,73 @@
  */
 
 #include <linux/slab.h>
-#include <linux/keyctl.h>
-#include <linux/key-type.h>
-#include <keys/user-type.h>
+#include <linux/dns_resolver.h>
 #include "dns_resolve.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
 #include "cifs_debug.h"
 
-static const struct cred *dns_resolver_cache;
-
-/* Checks if supplied name is IP address
- * returns:
- * 	1 - name is IP
- * 	0 - name is not IP
- */
-static int
-is_ip(char *name)
-{
-	struct sockaddr_storage ss;
-
-	return cifs_convert_address((struct sockaddr *)&ss, name);
-}
-
-static int
-dns_resolver_instantiate(struct key *key, const void *data,
-		size_t datalen)
-{
-	int rc = 0;
-	char *ip;
-
-	ip = kmalloc(datalen + 1, GFP_KERNEL);
-	if (!ip)
-		return -ENOMEM;
-
-	memcpy(ip, data, datalen);
-	ip[datalen] = '\0';
-
-	/* make sure this looks like an address */
-	if (!is_ip(ip)) {
-		kfree(ip);
-		return -EINVAL;
-	}
-
-	key->type_data.x[0] = datalen;
-	key->payload.data = ip;
-
-	return rc;
-}
-
-static void
-dns_resolver_destroy(struct key *key)
-{
-	kfree(key->payload.data);
-}
-
-struct key_type key_type_dns_resolver = {
-	.name        = "dns_resolver",
-	.def_datalen = sizeof(struct in_addr),
-	.describe    = user_describe,
-	.instantiate = dns_resolver_instantiate,
-	.destroy     = dns_resolver_destroy,
-	.match       = user_match,
-};
-
-/* Resolves server name to ip address.
- * input:
- * 	unc - server UNC
- * output:
- * 	*ip_addr - pointer to server ip, caller responcible for freeing it.
- * return 0 on success
+/**
+ * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address.
+ * @unc: UNC path specifying the server
+ * @ip_addr: Where to return the IP address.
+ *
+ * The IP address will be returned in string form, and the caller is
+ * responsible for freeing it.
+ *
+ * Returns length of result on success, -ve on error.
  */
 int
 dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
 {
-	const struct cred *saved_cred;
-	int rc = -EAGAIN;
-	struct key *rkey = ERR_PTR(-EAGAIN);
+	struct sockaddr_storage ss;
+	const char *hostname, *sep;
 	char *name;
-	char *data = NULL;
-	int len;
+	int len, rc;
 
 	if (!ip_addr || !unc)
 		return -EINVAL;
 
-	/* search for server name delimiter */
 	len = strlen(unc);
 	if (len < 3) {
 		cFYI(1, "%s: unc is too short: %s", __func__, unc);
 		return -EINVAL;
 	}
-	len -= 2;
-	name = memchr(unc+2, '\\', len);
-	if (!name) {
-		cFYI(1, "%s: probably server name is whole unc: %s",
-					__func__, unc);
-	} else {
-		len = (name - unc) - 2/* leading // */;
-	}
-
-	name = kmalloc(len+1, GFP_KERNEL);
-	if (!name) {
-		rc = -ENOMEM;
-		return rc;
-	}
-	memcpy(name, unc+2, len);
-	name[len] = 0;
-
-	if (is_ip(name)) {
-		cFYI(1, "%s: it is IP, skipping dns upcall: %s",
-					__func__, name);
-		data = name;
-		goto skip_upcall;
-	}
 
-	saved_cred = override_creds(dns_resolver_cache);
-	rkey = request_key(&key_type_dns_resolver, name, "");
-	revert_creds(saved_cred);
-	if (!IS_ERR(rkey)) {
-		if (!(rkey->perm & KEY_USR_VIEW)) {
-			down_read(&rkey->sem);
-			rkey->perm |= KEY_USR_VIEW;
-			up_read(&rkey->sem);
-		}
-		len = rkey->type_data.x[0];
-		data = rkey->payload.data;
-	} else {
-		cERROR(1, "%s: unable to resolve: %s", __func__, name);
-		goto out;
-	}
-
-skip_upcall:
-	if (data) {
-		*ip_addr = kmalloc(len + 1, GFP_KERNEL);
-		if (*ip_addr) {
-			memcpy(*ip_addr, data, len + 1);
-			if (!IS_ERR(rkey))
-				cFYI(1, "%s: resolved: %s to %s", __func__,
-							name,
-							*ip_addr
-					);
-			rc = 0;
-		} else {
-			rc = -ENOMEM;
-		}
-		if (!IS_ERR(rkey))
-			key_put(rkey);
-	}
+	/* Discount leading slashes for cifs */
+	len -= 2;
+	hostname = unc + 2;
 
-out:
-	kfree(name);
+	/* Search for server name delimiter */
+	sep = memchr(hostname, '\\', len);
+	if (sep)
+		len = sep - unc;
+	else
+		cFYI(1, "%s: probably server name is whole unc: %s",
+		     __func__, unc);
+
+	/* Try to interpret hostname as an IPv4 or IPv6 address */
+	rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len);
+	if (rc > 0)
+		goto name_is_IP_address;
+
+	/* Perform the upcall */
+	rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
+	if (rc < 0)
+		cERROR(1, "%s: unable to resolve: %*.*s",
+		       __func__, len, len, hostname);
+	else
+		cFYI(1, "%s: resolved: %*.*s to %s",
+		     __func__, len, len, hostname, *ip_addr);
 	return rc;
-}
 
-int __init cifs_init_dns_resolver(void)
-{
-	struct cred *cred;
-	struct key *keyring;
-	int ret;
-
-	printk(KERN_NOTICE "Registering the %s key type\n",
-	       key_type_dns_resolver.name);
-
-	/* create an override credential set with a special thread keyring in
-	 * which DNS requests are cached
-	 *
-	 * this is used to prevent malicious redirections from being installed
-	 * with add_key().
-	 */
-	cred = prepare_kernel_cred(NULL);
-	if (!cred)
+name_is_IP_address:
+	name = kmalloc(len + 1, GFP_KERNEL);
+	if (!name)
 		return -ENOMEM;
-
-	keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred,
-			    (KEY_POS_ALL & ~KEY_POS_SETATTR) |
-			    KEY_USR_VIEW | KEY_USR_READ,
-			    KEY_ALLOC_NOT_IN_QUOTA);
-	if (IS_ERR(keyring)) {
-		ret = PTR_ERR(keyring);
-		goto failed_put_cred;
-	}
-
-	ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
-	if (ret < 0)
-		goto failed_put_key;
-
-	ret = register_key_type(&key_type_dns_resolver);
-	if (ret < 0)
-		goto failed_put_key;
-
-	/* instruct request_key() to use this special keyring as a cache for
-	 * the results it looks up */
-	cred->thread_keyring = keyring;
-	cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
-	dns_resolver_cache = cred;
+	memcpy(name, hostname, len);
+	name[len] = 0;
+	cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name);
+	*ip_addr = name;
 	return 0;
-
-failed_put_key:
-	key_put(keyring);
-failed_put_cred:
-	put_cred(cred);
-	return ret;
-}
-
-void cifs_exit_dns_resolver(void)
-{
-	key_revoke(dns_resolver_cache->thread_keyring);
-	unregister_key_type(&key_type_dns_resolver);
-	put_cred(dns_resolver_cache);
-	printk(KERN_NOTICE "Unregistered %s key type\n",
-	       key_type_dns_resolver.name);
 }
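
For orientation, a hedged userspace analogue (not part of the patch) of what the rewritten function does: carve the server name out of a UNC path and resolve it to an address string the caller must free, returning the string length on success — the same convention the reworked dns_resolve_server_name_to_ip() now uses. getaddrinfo() stands in for the kernel's dns_query() upcall; all names below are illustrative.

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* returns strlen(*ip_addr) on success, -1 on error */
static int resolve_unc_server(const char *unc, char **ip_addr)
{
        char host[256], buf[INET6_ADDRSTRLEN];
        struct addrinfo *res;
        const char *sep;
        size_t len;

        if (strncmp(unc, "\\\\", 2) != 0)        /* discount leading slashes */
                return -1;
        unc += 2;
        sep = strchr(unc, '\\');                 /* server name delimiter */
        len = sep ? (size_t)(sep - unc) : strlen(unc);
        if (len == 0 || len >= sizeof(host))
                return -1;
        memcpy(host, unc, len);
        host[len] = '\0';

        if (getaddrinfo(host, NULL, NULL, &res) != 0)
                return -1;
        if (res->ai_family == AF_INET6)
                inet_ntop(AF_INET6,
                          &((struct sockaddr_in6 *)res->ai_addr)->sin6_addr,
                          buf, sizeof(buf));
        else
                inet_ntop(AF_INET,
                          &((struct sockaddr_in *)res->ai_addr)->sin_addr,
                          buf, sizeof(buf));
        freeaddrinfo(res);

        *ip_addr = strdup(buf);                  /* caller frees, as in cifs */
        return *ip_addr ? (int)strlen(*ip_addr) : -1;
}

int main(void)
{
        char *ip = NULL;
        int rc = resolve_unc_server("\\\\localhost\\share", &ip);

        if (rc > 0)
                printf("resolved to %s (len %d)\n", ip, rc);
        free(ip);
        return rc > 0 ? 0 : 1;
}
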
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index 5d7f291df162..d3f5d27f4d06 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -24,8 +24,6 @@
 #define _DNS_RESOLVE_H
 
 #ifdef __KERNEL__
-extern int __init cifs_init_dns_resolver(void);
-extern void cifs_exit_dns_resolver(void);
 extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr);
 #endif /* KERNEL */
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index fa04a00d126d..db11fdef0e92 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2307,8 +2307,7 @@ static void cifs_invalidate_page(struct page *page, unsigned long offset)
 	cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
 }
 
-static void
-cifs_oplock_break(struct slow_work *work)
+void cifs_oplock_break(struct work_struct *work)
 {
 	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
 						  oplock_break);
@@ -2345,33 +2344,30 @@ cifs_oplock_break(struct slow_work *work)
 				 LOCKING_ANDX_OPLOCK_RELEASE, false);
 		cFYI(1, "Oplock release rc = %d", rc);
 	}
+
+	/*
+	 * We might have kicked in before is_valid_oplock_break()
+	 * finished grabbing reference for us.  Make sure it's done by
+	 * waiting for GlobalSMSSeslock.
+	 */
+	write_lock(&GlobalSMBSeslock);
+	write_unlock(&GlobalSMBSeslock);
+
+	cifs_oplock_break_put(cfile);
 }
 
-static int
-cifs_oplock_break_get(struct slow_work *work)
+void cifs_oplock_break_get(struct cifsFileInfo *cfile)
 {
-	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
-						  oplock_break);
 	mntget(cfile->mnt);
 	cifsFileInfo_get(cfile);
-	return 0;
 }
 
-static void
-cifs_oplock_break_put(struct slow_work *work)
+void cifs_oplock_break_put(struct cifsFileInfo *cfile)
 {
-	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
-						  oplock_break);
 	mntput(cfile->mnt);
 	cifsFileInfo_put(cfile);
 }
 
-const struct slow_work_ops cifs_oplock_break_ops = {
-	.get_ref	= cifs_oplock_break_get,
-	.put_ref	= cifs_oplock_break_put,
-	.execute	= cifs_oplock_break,
-};
-
 const struct address_space_operations cifs_addr_ops = {
 	.readpage = cifs_readpage,
 	.readpages = cifs_readpages,
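
The conversion above replaces the three slow_work_ops callbacks with a plain work_struct whose handler recovers its container via container_of(). A minimal self-contained module sketch of that pattern follows (the struct and function names here are hypothetical, not cifs code):

#include <linux/module.h>
#include <linux/workqueue.h>

/* hypothetical container, mirroring the work item cifsFileInfo embeds */
struct demo_file {
        int id;
        struct work_struct oplock_break;
};

static struct demo_file demo;

static void demo_oplock_break(struct work_struct *work)
{
        /* recover the enclosing object, as cifs_oplock_break() now does */
        struct demo_file *df = container_of(work, struct demo_file,
                                            oplock_break);

        pr_info("oplock break for file %d\n", df->id);
}

static int __init demo_init(void)
{
        demo.id = 42;
        INIT_WORK(&demo.oplock_break, demo_oplock_break);
        schedule_work(&demo.oplock_break);
        return 0;
}

static void __exit demo_exit(void)
{
        flush_work(&demo.oplock_break);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
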
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a15b3a9bbff4..dc4c47ab9588 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -732,15 +732,9 @@ cifs_find_inode(struct inode *inode, void *opaque)
 	if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT))
 		return 0;
 
-	/*
-	 * uh oh -- it's a directory. We can't use it since hardlinked dirs are
-	 * verboten. Disable serverino and return it as if it were found, the
-	 * caller can discard it, generate a uniqueid and retry the find
-	 */
-	if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) {
+	/* if it's not a directory or has no dentries, then flag it */
+	if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry))
 		fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
-		cifs_autodisable_serverino(CIFS_SB(inode->i_sb));
-	}
 
 	return 1;
 }
@@ -754,6 +748,27 @@ cifs_init_inode(struct inode *inode, void *opaque)
 	return 0;
 }
 
+/*
+ * walk dentry list for an inode and report whether it has aliases that
+ * are hashed. We use this to determine if a directory inode can actually
+ * be used.
+ */
+static bool
+inode_has_hashed_dentries(struct inode *inode)
+{
+	struct dentry *dentry;
+
+	spin_lock(&dcache_lock);
+	list_for_each_entry(dentry, &inode->i_dentry, d_alias) {
+		if (!d_unhashed(dentry) || IS_ROOT(dentry)) {
+			spin_unlock(&dcache_lock);
+			return true;
+		}
+	}
+	spin_unlock(&dcache_lock);
+	return false;
+}
+
 /* Given fattrs, get a corresponding inode */
 struct inode *
 cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
@@ -769,12 +784,16 @@ retry_iget5_locked:
 
 	inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
 	if (inode) {
-		/* was there a problematic inode number collision? */
+		/* was there a potentially problematic inode collision? */
 		if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
-			iput(inode);
-			fattr->cf_uniqueid = iunique(sb, ROOT_I);
 			fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
-			goto retry_iget5_locked;
+
+			if (inode_has_hashed_dentries(inode)) {
+				cifs_autodisable_serverino(CIFS_SB(sb));
+				iput(inode);
+				fattr->cf_uniqueid = iunique(sb, ROOT_I);
+				goto retry_iget5_locked;
+			}
 		}
 
 		cifs_fattr_to_inode(inode, fattr);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1394aa37f26c..3ccadc1326d6 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -498,7 +498,6 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 	struct cifsTconInfo *tcon;
 	struct cifsInodeInfo *pCifsInode;
 	struct cifsFileInfo *netfile;
-	int rc;
 
 	cFYI(1, "Checking for oplock break or dnotify response");
 	if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
@@ -583,13 +582,18 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 			pCifsInode->clientCanCacheAll = false;
 			if (pSMB->OplockLevel == 0)
 				pCifsInode->clientCanCacheRead = false;
-			rc = slow_work_enqueue(&netfile->oplock_break);
-			if (rc) {
-				cERROR(1, "failed to enqueue oplock "
-					   "break: %d\n", rc);
-			} else {
-				netfile->oplock_break_cancelled = false;
-			}
+
+			/*
+			 * cifs_oplock_break_put() can't be called
+			 * from here.  Get reference after queueing
+			 * succeeded.  cifs_oplock_break() will
+			 * synchronize using GlobalSMSSeslock.
+			 */
+			if (queue_work(system_nrt_wq,
+				       &netfile->oplock_break))
+				cifs_oplock_break_get(netfile);
+			netfile->oplock_break_cancelled = false;
+
 			read_unlock(&GlobalSMBSeslock);
 			read_unlock(&cifs_tcp_ses_lock);
 			return true;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index c6721ee26dbc..f97851119e6c 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -140,17 +140,18 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
  * Returns 0 on failure.
  */
 static int
-cifs_inet_pton(const int address_family, const char *cp, void *dst)
+cifs_inet_pton(const int address_family, const char *cp, int len, void *dst)
 {
 	int ret = 0;
 
 	/* calculate length by finding first slash or NULL */
 	if (address_family == AF_INET)
-		ret = in4_pton(cp, -1 /* len */, dst, '\\', NULL);
+		ret = in4_pton(cp, len, dst, '\\', NULL);
 	else if (address_family == AF_INET6)
-		ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
+		ret = in6_pton(cp, len, dst , '\\', NULL);
 
-	cFYI(DBG2, "address conversion returned %d for %s", ret, cp);
+	cFYI(DBG2, "address conversion returned %d for %*.*s",
+	     ret, len, len, cp);
 	if (ret > 0)
 		ret = 1;
 	return ret;
@@ -165,37 +166,39 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
  * Returns 0 on failure.
  */
 int
-cifs_convert_address(struct sockaddr *dst, char *src)
+cifs_convert_address(struct sockaddr *dst, const char *src, int len)
 {
-	int rc;
-	char *pct, *endp;
+	int rc, alen, slen;
+	const char *pct;
+	char *endp, scope_id[13];
 	struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
 	struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
 
 	/* IPv4 address */
-	if (cifs_inet_pton(AF_INET, src, &s4->sin_addr.s_addr)) {
+	if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) {
 		s4->sin_family = AF_INET;
 		return 1;
 	}
 
-	/* temporarily terminate string */
-	pct = strchr(src, '%');
-	if (pct)
-		*pct = '\0';
-
-	rc = cifs_inet_pton(AF_INET6, src, &s6->sin6_addr.s6_addr);
-
-	/* repair temp termination (if any) and make pct point to scopeid */
-	if (pct)
-		*pct++ = '%';
+	/* attempt to exclude the scope ID from the address part */
+	pct = memchr(src, '%', len);
+	alen = pct ? pct - src : len;
 
+	rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr);
 	if (!rc)
 		return rc;
 
 	s6->sin6_family = AF_INET6;
 	if (pct) {
+		/* grab the scope ID */
+		slen = len - (alen + 1);
+		if (slen <= 0 || slen > 12)
+			return 0;
+		memcpy(scope_id, pct + 1, slen);
+		scope_id[slen] = '\0';
+
 		s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0);
-		if (!*pct || *endp)
+		if (endp != scope_id + slen)
 			return 0;
 	}
 
@@ -203,10 +206,10 @@ cifs_convert_address(struct sockaddr *dst, char *src)
 }
 
 int
-cifs_fill_sockaddr(struct sockaddr *dst, char *src,
+cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
 		   const unsigned short int port)
 {
-	if (!cifs_convert_address(dst, src))
+	if (!cifs_convert_address(dst, src, len))
 		return 0;
 
 	switch (dst->sa_family) {
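
The point of the new length parameter is that the source string no longer needs to be NUL-terminated at the end of the address — it may be followed by a scope ID or further mount options, which the old code handled by temporarily writing a NUL into the caller's buffer. A hedged userspace analogue (illustrative only): inet_pton() has no length argument, so it needs the bounce buffer that the kernel's length-aware in6_pton() avoids.

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

/* parse an IPv6 address that is a length-bounded slice of a larger
 * string; returns 1 on success, 0 on failure */
static int convert_address6(const char *src, int len)
{
        char buf[INET6_ADDRSTRLEN];
        struct in6_addr a6;
        const char *pct;
        int alen;

        /* exclude a "%scopeid" suffix from the address part, as the
         * patched cifs_convert_address() does with memchr() */
        pct = memchr(src, '%', len);
        alen = pct ? (int)(pct - src) : len;
        if (alen <= 0 || alen >= (int)sizeof(buf))
                return 0;
        memcpy(buf, src, alen);
        buf[alen] = '\0';
        return inet_pton(AF_INET6, buf, &a6) == 1;
}

int main(void)
{
        /* address embedded in a longer, non-terminated option string */
        const char *opts = "fe80::1%2,rest-of-options";

        printf("parsed: %d\n", convert_address6(opts, 9)); /* "fe80::1%2" */
        return 0;
}
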
diff --git a/fs/compat.c b/fs/compat.c
index c6fda9aeb864..5976bad85f65 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -15,6 +15,7 @@
  *  published by the Free Software Foundation.
  */
 
+#include <linux/stddef.h>
 #include <linux/kernel.h>
 #include <linux/linkage.h>
 #include <linux/compat.h>
@@ -891,8 +892,6 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
 	return retval;
 }
 
-#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
-
 struct compat_old_linux_dirent {
 	compat_ulong_t	d_ino;
 	compat_ulong_t	d_offset;
@@ -981,7 +980,8 @@ static int compat_filldir(void *__buf, const char *name, int namlen,
 	struct compat_linux_dirent __user * dirent;
 	struct compat_getdents_callback *buf = __buf;
 	compat_ulong_t d_ino;
-	int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(compat_long_t));
+	int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
+		namlen + 2, sizeof(compat_long_t));
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
@@ -1068,8 +1068,8 @@ static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t
 {
 	struct linux_dirent64 __user *dirent;
 	struct compat_getdents_callback64 *buf = __buf;
-	int jj = NAME_OFFSET(dirent);
-	int reclen = ALIGN(jj + namlen + 1, sizeof(u64));
+	int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
+		sizeof(u64));
 	u64 off;
 
 	buf->error = -EINVAL;	/* only used if we fail.. */
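
The removed NAME_OFFSET() macro computed the offset of d_name by doing pointer arithmetic on an uninitialised __user pointer, which is undefined behaviour; offsetof() expresses the same thing legitimately. A small userspace sketch of the record sizing — the struct shape here is illustrative, modelled on linux_dirent64:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* illustrative record layout, modelled on struct linux_dirent64 */
struct dirent_like {
        uint64_t       d_ino;
        int64_t        d_off;
        unsigned short d_reclen;
        unsigned char  d_type;
        char           d_name[];        /* name is copied in place */
};

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        size_t namlen = strlen("example");

        /* offsetof() is well-defined; the old pointer arithmetic on an
         * uninitialised dirent pointer was not */
        size_t reclen = ALIGN_UP(offsetof(struct dirent_like, d_name) +
                                 namlen + 1, sizeof(uint64_t));

        printf("reclen for \"example\" = %zu\n", reclen);
        return 0;
}
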
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index c0d35c620526..37a34c2c622a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -248,7 +248,7 @@ static struct connection *assoc2con(int assoc_id)
 
 	for (i = 0 ; i < CONN_HASH_SIZE; i++) {
 		hlist_for_each_entry(con, h, &connection_hash[i], list) {
-			if (con && con->sctp_assoc == assoc_id) {
+			if (con->sctp_assoc == assoc_id) {
 				mutex_unlock(&connections_lock);
 				return con;
 			}
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 2c6ad518100d..ef17e0169da1 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -81,24 +81,11 @@ static struct genl_ops dlm_nl_ops = {
 
 int __init dlm_netlink_init(void)
 {
-	int rv;
-
-	rv = genl_register_family(&family);
-	if (rv)
-		return rv;
-
-	rv = genl_register_ops(&family, &dlm_nl_ops);
-	if (rv < 0)
-		goto err;
-	return 0;
- err:
-	genl_unregister_family(&family);
-	return rv;
+	return genl_register_family_with_ops(&family, &dlm_nl_ops, 1);
 }
 
 void dlm_netlink_exit(void)
 {
-	genl_unregister_ops(&family, &dlm_nl_ops);
 	genl_unregister_family(&family);
 }
 
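
genl_register_family_with_ops() registers the family and its ops table in one call and unwinds internally on failure, which is what lets dlm_netlink_init() collapse to a single return. A hedged module sketch of the same idiom — family, op and handler names are hypothetical, and the exact signature should be checked against include/net/genetlink.h of this kernel era:

#include <linux/module.h>
#include <net/genetlink.h>

static struct genl_family demo_family = {
        .id      = GENL_ID_GENERATE,    /* let genetlink pick an id */
        .name    = "demo",
        .version = 1,
        .maxattr = 0,
};

static int demo_doit(struct sk_buff *skb, struct genl_info *info)
{
        return 0;
}

static struct genl_ops demo_ops[] = {
        {
                .cmd  = 1,
                .doit = demo_doit,
        },
};

static int __init demo_init(void)
{
        /* one call registers family and ops; nothing to unwind on error */
        return genl_register_family_with_ops(&demo_family, demo_ops,
                                             ARRAY_SIZE(demo_ops));
}

static void __exit demo_exit(void)
{
        genl_unregister_family(&demo_family);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
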
diff --git a/fs/exec.c b/fs/exec.c
index e19de6a80339..dab85ecad686 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -28,7 +28,6 @@
 #include <linux/mm.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
-#include <linux/smp_lock.h>
 #include <linux/swap.h>
 #include <linux/string.h>
 #include <linux/init.h>
@@ -653,6 +652,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	else
 		stack_base = vma->vm_start - stack_expand;
 #endif
+	current->mm->start_stack = bprm->p;
 	ret = expand_stack(vma, stack_base);
 	if (ret)
 		ret = -EFAULT;
@@ -1891,13 +1891,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 	 */
 	clear_thread_flag(TIF_SIGPENDING);
 
-	/*
-	 * lock_kernel() because format_corename() is controlled by sysctl, which
-	 * uses lock_kernel()
-	 */
-	lock_kernel();
 	ispipe = format_corename(corename, signr);
-	unlock_kernel();
 
 	if (ispipe) {
 		int dump_count;
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 522b15498f45..e8c6ba0e4a3e 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -31,6 +31,7 @@ config EXT3_FS
 config EXT3_DEFAULTS_TO_ORDERED
 	bool "Default to 'data=ordered' in ext3"
 	depends on EXT3_FS
+	default y
 	help
 	  The journal mode options for ext3 have different tradeoffs
 	  between when data is guaranteed to be on disk and
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 735f0190ec2a..001eb0e2d48e 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1149,9 +1149,25 @@ static int walk_page_buffers( handle_t *handle,
 static int do_journal_get_write_access(handle_t *handle,
 					struct buffer_head *bh)
 {
+	int dirty = buffer_dirty(bh);
+	int ret;
+
 	if (!buffer_mapped(bh) || buffer_freed(bh))
 		return 0;
-	return ext3_journal_get_write_access(handle, bh);
+	/*
+	 * __block_prepare_write() could have dirtied some buffers. Clean
+	 * the dirty bit as jbd2_journal_get_write_access() could complain
+	 * otherwise about fs integrity issues. Setting of the dirty bit
+	 * by __block_prepare_write() isn't a real problem here as we clear
+	 * the bit before releasing a page lock and thus writeback cannot
+	 * ever write the buffer.
+	 */
+	if (dirty)
+		clear_buffer_dirty(bh);
+	ret = ext3_journal_get_write_access(handle, bh);
+	if (!ret && dirty)
+		ret = ext3_journal_dirty_metadata(handle, bh);
+	return ret;
 }
 
 /*
@@ -1625,10 +1641,7 @@ static int ext3_writeback_writepage(struct page *page,
1625 goto out_fail; 1641 goto out_fail;
1626 } 1642 }
1627 1643
1628 if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) 1644 ret = block_write_full_page(page, ext3_get_block, wbc);
1629 ret = nobh_writepage(page, ext3_get_block, wbc);
1630 else
1631 ret = block_write_full_page(page, ext3_get_block, wbc);
1632 1645
1633 err = ext3_journal_stop(handle); 1646 err = ext3_journal_stop(handle);
1634 if (!ret) 1647 if (!ret)
@@ -1922,17 +1935,6 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1922 length = blocksize - (offset & (blocksize - 1)); 1935 length = blocksize - (offset & (blocksize - 1));
1923 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 1936 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1924 1937
1925 /*
1926 * For "nobh" option, we can only work if we don't need to
1927 * read-in the page - otherwise we create buffers to do the IO.
1928 */
1929 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1930 ext3_should_writeback_data(inode) && PageUptodate(page)) {
1931 zero_user(page, offset, length);
1932 set_page_dirty(page);
1933 goto unlock;
1934 }
1935
1936 if (!page_has_buffers(page)) 1938 if (!page_has_buffers(page))
1937 create_empty_buffers(page, blocksize, 0); 1939 create_empty_buffers(page, blocksize, 0);
1938 1940
@@ -2284,27 +2286,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2284 depth); 2286 depth);
2285 2287
2286 /* 2288 /*
2287 * We've probably journalled the indirect block several
2288 * times during the truncate. But it's no longer
2289 * needed and we now drop it from the transaction via
2290 * journal_revoke().
2291 *
2292 * That's easy if it's exclusively part of this
2293 * transaction. But if it's part of the committing
2294 * transaction then journal_forget() will simply
2295 * brelse() it. That means that if the underlying
2296 * block is reallocated in ext3_get_block(),
2297 * unmap_underlying_metadata() will find this block
2298 * and will try to get rid of it. damn, damn.
2299 *
2300 * If this block has already been committed to the
2301 * journal, a revoke record will be written. And
2302 * revoke records must be emitted *before* clearing
2303 * this block's bit in the bitmaps.
2304 */
2305 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2306
2307 /*
 2308 * Everything below this pointer has been 2289 * Everything below this pointer has been
 2309 * released. Now let this top-of-subtree go. 2290 * released. Now let this top-of-subtree go.
2310 * 2291 *
@@ -2327,6 +2308,31 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2327 truncate_restart_transaction(handle, inode); 2308 truncate_restart_transaction(handle, inode);
2328 } 2309 }
2329 2310
2311 /*
2312 * We've probably journalled the indirect block several
2313 * times during the truncate. But it's no longer
2314 * needed and we now drop it from the transaction via
2315 * journal_revoke().
2316 *
2317 * That's easy if it's exclusively part of this
2318 * transaction. But if it's part of the committing
2319 * transaction then journal_forget() will simply
2320 * brelse() it. That means that if the underlying
2321 * block is reallocated in ext3_get_block(),
2322 * unmap_underlying_metadata() will find this block
2323 * and will try to get rid of it. damn, damn. Thus
2324 * we don't allow a block to be reallocated until
2325 * a transaction freeing it has fully committed.
2326 *
2327 * We also have to make sure journal replay after a
2328 * crash does not overwrite non-journaled data blocks
2329 * with old metadata when the block got reallocated for
2330 * data. Thus we have to store a revoke record for a
2331 * block in the same transaction in which we free the
2332 * block.
2333 */
2334 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2335
2330 ext3_free_blocks(handle, inode, nr, 1); 2336 ext3_free_blocks(handle, inode, nr, 1);
2331 2337
2332 if (parent_bh) { 2338 if (parent_bh) {
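
The do_journal_get_write_access() hunk above (and its ext4 twin later in this merge) guards against buffers that __block_prepare_write() dirtied before the journal ever saw them. A small userspace model of the clear-then-redirty handshake, with toy journal functions standing in for journal_get_write_access()/journal_dirty_metadata():

#include <assert.h>
#include <stdio.h>

struct buf { int dirty; int journaled_dirty; };

/* Model of the journal's complaint: write access to an already-dirty
 * buffer would let writeback race with the journal. */
static int journal_get_write_access(struct buf *b)
{
	if (b->dirty) {
		fprintf(stderr, "jbd: buffer dirtied outside journal!\n");
		return -1;
	}
	return 0;
}

static int journal_dirty_metadata(struct buf *b)
{
	b->journaled_dirty = 1;	/* the journal now owns the dirtiness */
	return 0;
}

/* The pattern from the patch: remember the dirty bit, clear it, take
 * write access, then re-dirty the buffer through the journal. */
static int get_write_access_fixed(struct buf *b)
{
	int dirty = b->dirty;
	int ret;

	if (dirty)
		b->dirty = 0;
	ret = journal_get_write_access(b);
	if (!ret && dirty)
		ret = journal_dirty_metadata(b);
	return ret;
}

int main(void)
{
	struct buf b = { .dirty = 1 };	/* as left by __block_prepare_write() */
	assert(get_write_access_fixed(&b) == 0);
	assert(b.journaled_dirty);
	puts("dirty bit handed over to the journal");
	return 0;
}

This is safe, as the comment in the hunk notes, only because the dirty bit is cleared while the page is still locked, so writeback cannot slip in between the clear and the re-dirty.
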
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index ee184084ca42..2b35ddb70d65 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1447,7 +1447,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1447 struct inode *inode) 1447 struct inode *inode)
1448{ 1448{
1449 struct inode *dir = dentry->d_parent->d_inode; 1449 struct inode *dir = dentry->d_parent->d_inode;
1450 unsigned long offset;
1451 struct buffer_head * bh; 1450 struct buffer_head * bh;
1452 struct ext3_dir_entry_2 *de; 1451 struct ext3_dir_entry_2 *de;
1453 struct super_block * sb; 1452 struct super_block * sb;
@@ -1469,7 +1468,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1469 ext3_mark_inode_dirty(handle, dir); 1468 ext3_mark_inode_dirty(handle, dir);
1470 } 1469 }
1471 blocks = dir->i_size >> sb->s_blocksize_bits; 1470 blocks = dir->i_size >> sb->s_blocksize_bits;
1472 for (block = 0, offset = 0; block < blocks; block++) { 1471 for (block = 0; block < blocks; block++) {
1473 bh = ext3_bread(handle, dir, block, 0, &retval); 1472 bh = ext3_bread(handle, dir, block, 0, &retval);
1474 if(!bh) 1473 if(!bh)
1475 return retval; 1474 return retval;
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 54351ac7cef9..0ccd7b12b73c 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -964,7 +964,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
964 ext3_fsblk_t n_blocks_count) 964 ext3_fsblk_t n_blocks_count)
965{ 965{
966 ext3_fsblk_t o_blocks_count; 966 ext3_fsblk_t o_blocks_count;
967 unsigned long o_groups_count;
968 ext3_grpblk_t last; 967 ext3_grpblk_t last;
969 ext3_grpblk_t add; 968 ext3_grpblk_t add;
970 struct buffer_head * bh; 969 struct buffer_head * bh;
@@ -976,7 +975,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
976 * yet: we're going to revalidate es->s_blocks_count after 975 * yet: we're going to revalidate es->s_blocks_count after
977 * taking the s_resize_lock below. */ 976 * taking the s_resize_lock below. */
978 o_blocks_count = le32_to_cpu(es->s_blocks_count); 977 o_blocks_count = le32_to_cpu(es->s_blocks_count);
979 o_groups_count = EXT3_SB(sb)->s_groups_count;
980 978
981 if (test_opt(sb, DEBUG)) 979 if (test_opt(sb, DEBUG))
 982 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" upto "E3FSBLK" blocks\n", 980 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" upto "E3FSBLK" blocks\n",
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6c953bb255e7..9650a956fd0e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -661,9 +661,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
661 */ 661 */
662 seq_puts(seq, ",barrier="); 662 seq_puts(seq, ",barrier=");
663 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); 663 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
664 if (test_opt(sb, NOBH))
665 seq_puts(seq, ",nobh");
666
667 seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); 664 seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS)));
668 if (test_opt(sb, DATA_ERR_ABORT)) 665 if (test_opt(sb, DATA_ERR_ABORT))
669 seq_puts(seq, ",data_err=abort"); 666 seq_puts(seq, ",data_err=abort");
@@ -1255,10 +1252,12 @@ set_qf_format:
1255 *n_blocks_count = option; 1252 *n_blocks_count = option;
1256 break; 1253 break;
1257 case Opt_nobh: 1254 case Opt_nobh:
1258 set_opt(sbi->s_mount_opt, NOBH); 1255 ext3_msg(sb, KERN_WARNING,
1256 "warning: ignoring deprecated nobh option");
1259 break; 1257 break;
1260 case Opt_bh: 1258 case Opt_bh:
1261 clear_opt(sbi->s_mount_opt, NOBH); 1259 ext3_msg(sb, KERN_WARNING,
1260 "warning: ignoring deprecated bh option");
1262 break; 1261 break;
1263 default: 1262 default:
1264 ext3_msg(sb, KERN_ERR, 1263 ext3_msg(sb, KERN_ERR,
@@ -2001,14 +2000,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2001 break; 2000 break;
2002 } 2001 }
2003 2002
2004 if (test_opt(sb, NOBH)) {
2005 if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) {
2006 ext3_msg(sb, KERN_WARNING,
2007 "warning: ignoring nobh option - "
2008 "it is supported only with writeback mode");
2009 clear_opt(sbi->s_mount_opt, NOBH);
2010 }
2011 }
2012 /* 2003 /*
2013 * The journal_load will have done any necessary log recovery, 2004 * The journal_load will have done any necessary log recovery,
2014 * so we can safely mount the rest of the filesystem now. 2005 * so we can safely mount the rest of the filesystem now.
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index feaf498feaa6..5e2ed4504ead 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -204,6 +204,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
204 return error; 204 return error;
205 else { 205 else {
206 inode->i_mode = mode; 206 inode->i_mode = mode;
207 inode->i_ctime = ext4_current_time(inode);
207 ext4_mark_inode_dirty(handle, inode); 208 ext4_mark_inode_dirty(handle, inode);
208 if (error == 0) 209 if (error == 0)
209 acl = NULL; 210 acl = NULL;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 95b7594c76f9..bd30799a43ed 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -377,14 +377,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
377 ext4_grpblk_t bit; 377 ext4_grpblk_t bit;
378 unsigned int i; 378 unsigned int i;
379 struct ext4_group_desc *desc; 379 struct ext4_group_desc *desc;
380 struct ext4_super_block *es; 380 struct ext4_sb_info *sbi = EXT4_SB(sb);
381 struct ext4_sb_info *sbi;
382 int err = 0, ret, blk_free_count; 381 int err = 0, ret, blk_free_count;
383 ext4_grpblk_t blocks_freed; 382 ext4_grpblk_t blocks_freed;
384 struct ext4_group_info *grp; 383 struct ext4_group_info *grp;
385 384
386 sbi = EXT4_SB(sb);
387 es = sbi->s_es;
388 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); 385 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
389 386
390 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 387 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -477,7 +474,6 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
477 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 474 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
478 if (!err) 475 if (!err)
479 err = ret; 476 err = ret;
480 sb->s_dirt = 1;
481 477
482error_return: 478error_return:
483 brelse(bitmap_bh); 479 brelse(bitmap_bh);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 5b6973fbf1bd..3db5084db9bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -229,16 +229,20 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
229 229
230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || 230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
231 (start_blk + count < start_blk) || 231 (start_blk + count < start_blk) ||
232 (start_blk + count > ext4_blocks_count(sbi->s_es))) 232 (start_blk + count > ext4_blocks_count(sbi->s_es))) {
233 sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
233 return 0; 234 return 0;
235 }
234 while (n) { 236 while (n) {
235 entry = rb_entry(n, struct ext4_system_zone, node); 237 entry = rb_entry(n, struct ext4_system_zone, node);
236 if (start_blk + count - 1 < entry->start_blk) 238 if (start_blk + count - 1 < entry->start_blk)
237 n = n->rb_left; 239 n = n->rb_left;
238 else if (start_blk >= (entry->start_blk + entry->count)) 240 else if (start_blk >= (entry->start_blk + entry->count))
239 n = n->rb_right; 241 n = n->rb_right;
240 else 242 else {
243 sbi->s_es->s_last_error_block = cpu_to_le64(start_blk);
241 return 0; 244 return 0;
245 }
242 } 246 }
243 return 1; 247 return 1;
244} 248}
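
ext4_data_block_valid() above rejects block ranges that wrap, fall outside the filesystem, or intersect a reserved system zone kept in an rb-tree, and now records the offending block in s_last_error_block on every failure path. A self-contained sketch of the same interval test over a sorted array (a flat stand-in for the kernel's rb-tree; the zone values are made up for illustration):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t fsblk_t;
struct zone { fsblk_t start, count; };

/* Reserved "system zones" (superblock, descriptors, bitmaps, ...),
 * disjoint and sorted by start block. */
static const struct zone zones[] = { {0, 64}, {8192, 32}, {16384, 32} };

/* Returns 1 if [start, start+count) is a valid *data* range: inside the
 * filesystem, non-wrapping, and not overlapping any system zone. */
static int data_block_valid(fsblk_t blocks_count, fsblk_t start, fsblk_t count)
{
	size_t lo = 0, hi = sizeof(zones) / sizeof(zones[0]);

	/* start == 0 is a crude stand-in for the s_first_data_block check */
	if (start == 0 || start + count < start || start + count > blocks_count)
		return 0;
	while (lo < hi) {			/* same shape as the rb walk */
		size_t mid = lo + (hi - lo) / 2;
		if (start + count - 1 < zones[mid].start)
			hi = mid;		/* entirely left of this zone */
		else if (start >= zones[mid].start + zones[mid].count)
			lo = mid + 1;		/* entirely right of it */
		else
			return 0;		/* overlap: corrupt reference */
	}
	return 1;
}

int main(void)
{
	printf("%d %d\n", data_block_valid(1 << 20, 1000, 8),	/* 1: fine */
			  data_block_valid(1 << 20, 8200, 8));	/* 0: zone hit */
	return 0;
}
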
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ea5e6cb7e2a5..374510f72baa 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -61,10 +61,11 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
61} 61}
62 62
63 63
64int ext4_check_dir_entry(const char *function, struct inode *dir, 64int __ext4_check_dir_entry(const char *function, unsigned int line,
65 struct ext4_dir_entry_2 *de, 65 struct inode *dir,
66 struct buffer_head *bh, 66 struct ext4_dir_entry_2 *de,
67 unsigned int offset) 67 struct buffer_head *bh,
68 unsigned int offset)
68{ 69{
69 const char *error_msg = NULL; 70 const char *error_msg = NULL;
70 const int rlen = ext4_rec_len_from_disk(de->rec_len, 71 const int rlen = ext4_rec_len_from_disk(de->rec_len,
@@ -83,11 +84,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 84 error_msg = "inode out of bounds";
84 85
85 if (error_msg != NULL) 86 if (error_msg != NULL)
86 ext4_error_inode(function, dir, 87 ext4_error_inode(dir, function, line, bh->b_blocknr,
87 "bad entry in directory: %s - block=%llu" 88 "bad entry in directory: %s - "
88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 89 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 error_msg, (unsigned long long) bh->b_blocknr, 90 error_msg, (unsigned) (offset%bh->b_size), offset,
90 (unsigned) (offset%bh->b_size), offset,
91 le32_to_cpu(de->inode), 91 le32_to_cpu(de->inode),
92 rlen, de->name_len); 92 rlen, de->name_len);
93 return error_msg == NULL ? 1 : 0; 93 return error_msg == NULL ? 1 : 0;
@@ -121,7 +121,8 @@ static int ext4_readdir(struct file *filp,
121 * We don't set the inode dirty flag since it's not 121 * We don't set the inode dirty flag since it's not
122 * critical that it get flushed back to the disk. 122 * critical that it get flushed back to the disk.
123 */ 123 */
124 ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX); 124 ext4_clear_inode_flag(filp->f_path.dentry->d_inode,
125 EXT4_INODE_INDEX);
125 } 126 }
126 stored = 0; 127 stored = 0;
127 offset = filp->f_pos & (sb->s_blocksize - 1); 128 offset = filp->f_pos & (sb->s_blocksize - 1);
@@ -193,7 +194,7 @@ revalidate:
193 while (!error && filp->f_pos < inode->i_size 194 while (!error && filp->f_pos < inode->i_size
194 && offset < sb->s_blocksize) { 195 && offset < sb->s_blocksize) {
195 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); 196 de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
196 if (!ext4_check_dir_entry("ext4_readdir", inode, de, 197 if (!ext4_check_dir_entry(inode, de,
197 bh, offset)) { 198 bh, offset)) {
198 /* 199 /*
199 * On error, skip the f_pos to the next block 200 * On error, skip the f_pos to the next block
@@ -343,7 +344,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
343 struct dir_private_info *info; 344 struct dir_private_info *info;
344 int len; 345 int len;
345 346
346 info = (struct dir_private_info *) dir_file->private_data; 347 info = dir_file->private_data;
347 p = &info->root.rb_node; 348 p = &info->root.rb_node;
348 349
349 /* Create and allocate the fname structure */ 350 /* Create and allocate the fname structure */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 19a4de57128a..e03841d9f30b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -57,10 +57,13 @@
57#endif 57#endif
58 58
59#define EXT4_ERROR_INODE(inode, fmt, a...) \ 59#define EXT4_ERROR_INODE(inode, fmt, a...) \
60 ext4_error_inode(__func__, (inode), (fmt), ## a) 60 ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
61
62#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \
63 ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a)
61 64
62#define EXT4_ERROR_FILE(file, fmt, a...) \ 65#define EXT4_ERROR_FILE(file, fmt, a...) \
63 ext4_error_file(__func__, (file), (fmt), ## a) 66 ext4_error_file(__func__, __LINE__, (file), (fmt), ## a)
64 67
65/* data type for block offset of block group */ 68/* data type for block offset of block group */
66typedef int ext4_grpblk_t; 69typedef int ext4_grpblk_t;
@@ -167,13 +170,15 @@ struct mpage_da_data {
167}; 170};
168#define EXT4_IO_UNWRITTEN 0x1 171#define EXT4_IO_UNWRITTEN 0x1
169typedef struct ext4_io_end { 172typedef struct ext4_io_end {
170 struct list_head list; /* per-file finished AIO list */ 173 struct list_head list; /* per-file finished IO list */
171 struct inode *inode; /* file being written to */ 174 struct inode *inode; /* file being written to */
172 unsigned int flag; /* unwritten or not */ 175 unsigned int flag; /* unwritten or not */
173 struct page *page; /* page struct for buffer write */ 176 struct page *page; /* page struct for buffer write */
174 loff_t offset; /* offset in the file */ 177 loff_t offset; /* offset in the file */
175 ssize_t size; /* size of the extent */ 178 ssize_t size; /* size of the extent */
176 struct work_struct work; /* data work queue */ 179 struct work_struct work; /* data work queue */
180 struct kiocb *iocb; /* iocb struct for AIO */
181 int result; /* error value for AIO */
177} ext4_io_end_t; 182} ext4_io_end_t;
178 183
179/* 184/*
@@ -460,7 +465,7 @@ struct ext4_new_group_data {
460}; 465};
461 466
462/* 467/*
463 * Flags used by ext4_get_blocks() 468 * Flags used by ext4_map_blocks()
464 */ 469 */
 465 /* Allocate any needed blocks and/or convert an uninitialized 470 /* Allocate any needed blocks and/or convert an uninitialized
466 extent to be an initialized ext4 */ 471 extent to be an initialized ext4 */
@@ -873,7 +878,6 @@ struct ext4_inode_info {
873#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ 878#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
874#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ 879#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */
875#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ 880#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */
876#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */
877#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 881#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
878#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 882#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
879#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 883#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
@@ -982,7 +986,7 @@ struct ext4_super_block {
982 __le32 s_last_orphan; /* start of list of inodes to delete */ 986 __le32 s_last_orphan; /* start of list of inodes to delete */
983 __le32 s_hash_seed[4]; /* HTREE hash seed */ 987 __le32 s_hash_seed[4]; /* HTREE hash seed */
984 __u8 s_def_hash_version; /* Default hash version to use */ 988 __u8 s_def_hash_version; /* Default hash version to use */
985 __u8 s_reserved_char_pad; 989 __u8 s_jnl_backup_type;
986 __le16 s_desc_size; /* size of group descriptor */ 990 __le16 s_desc_size; /* size of group descriptor */
987/*100*/ __le32 s_default_mount_opts; 991/*100*/ __le32 s_default_mount_opts;
988 __le32 s_first_meta_bg; /* First metablock block group */ 992 __le32 s_first_meta_bg; /* First metablock block group */
@@ -1000,12 +1004,34 @@ struct ext4_super_block {
1000 __le64 s_mmp_block; /* Block for multi-mount protection */ 1004 __le64 s_mmp_block; /* Block for multi-mount protection */
1001 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ 1005 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
1002 __u8 s_log_groups_per_flex; /* FLEX_BG group size */ 1006 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
1003 __u8 s_reserved_char_pad2; 1007 __u8 s_reserved_char_pad;
1004 __le16 s_reserved_pad; 1008 __le16 s_reserved_pad;
1005 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ 1009 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
1006 __u32 s_reserved[160]; /* Padding to the end of the block */ 1010 __le32 s_snapshot_inum; /* Inode number of active snapshot */
1011 __le32 s_snapshot_id; /* sequential ID of active snapshot */
1012 __le64 s_snapshot_r_blocks_count; /* reserved blocks for active
1013 snapshot's future use */
1014 __le32 s_snapshot_list; /* inode number of the head of the
1015 on-disk snapshot list */
1016#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
1017 __le32 s_error_count; /* number of fs errors */
1018 __le32 s_first_error_time; /* first time an error happened */
1019 __le32 s_first_error_ino; /* inode involved in first error */
1020 __le64 s_first_error_block; /* block involved of first error */
1021 __u8 s_first_error_func[32]; /* function where the error happened */
1022 __le32 s_first_error_line; /* line number where error happened */
1023 __le32 s_last_error_time; /* most recent time of an error */
1024 __le32 s_last_error_ino; /* inode involved in last error */
1025 __le32 s_last_error_line; /* line number where error happened */
1026 __le64 s_last_error_block; /* block involved of last error */
1027 __u8 s_last_error_func[32]; /* function where the error happened */
1028#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
1029 __u8 s_mount_opts[64];
1030 __le32 s_reserved[112]; /* Padding to the end of the block */
1007}; 1031};
1008 1032
1033#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
1034
1009#ifdef __KERNEL__ 1035#ifdef __KERNEL__
1010 1036
1011/* 1037/*
@@ -1143,6 +1169,9 @@ struct ext4_sb_info {
1143 1169
1144 /* workqueue for dio unwritten */ 1170 /* workqueue for dio unwritten */
1145 struct workqueue_struct *dio_unwritten_wq; 1171 struct workqueue_struct *dio_unwritten_wq;
1172
1173 /* timer for periodic error stats printing */
1174 struct timer_list s_err_report;
1146}; 1175};
1147 1176
1148static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1177static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1313,6 +1342,10 @@ EXT4_INODE_BIT_FNS(state, state_flags)
1313#define EXT4_DEFM_JMODE_DATA 0x0020 1342#define EXT4_DEFM_JMODE_DATA 0x0020
1314#define EXT4_DEFM_JMODE_ORDERED 0x0040 1343#define EXT4_DEFM_JMODE_ORDERED 0x0040
1315#define EXT4_DEFM_JMODE_WBACK 0x0060 1344#define EXT4_DEFM_JMODE_WBACK 0x0060
1345#define EXT4_DEFM_NOBARRIER 0x0100
1346#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
1347#define EXT4_DEFM_DISCARD 0x0400
1348#define EXT4_DEFM_NODELALLOC 0x0800
1316 1349
1317/* 1350/*
1318 * Default journal batch times 1351 * Default journal batch times
@@ -1379,6 +1412,43 @@ struct ext4_dir_entry_2 {
1379#define EXT4_MAX_REC_LEN ((1<<16)-1) 1412#define EXT4_MAX_REC_LEN ((1<<16)-1)
1380 1413
1381/* 1414/*
1415 * If we ever get support for fs block sizes > page_size, we'll need
1416 * to remove the #if statements in the next two functions...
1417 */
1418static inline unsigned int
1419ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
1420{
1421 unsigned len = le16_to_cpu(dlen);
1422
1423#if (PAGE_CACHE_SIZE >= 65536)
1424 if (len == EXT4_MAX_REC_LEN || len == 0)
1425 return blocksize;
1426 return (len & 65532) | ((len & 3) << 16);
1427#else
1428 return len;
1429#endif
1430}
1431
1432static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
1433{
1434 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
1435 BUG();
1436#if (PAGE_CACHE_SIZE >= 65536)
1437 if (len < 65536)
1438 return cpu_to_le16(len);
1439 if (len == blocksize) {
1440 if (blocksize == 65536)
1441 return cpu_to_le16(EXT4_MAX_REC_LEN);
1442 else
1443 return cpu_to_le16(0);
1444 }
1445 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
1446#else
1447 return cpu_to_le16(len);
1448#endif
1449}
1450
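
The new inline ext4_rec_len_from_disk()/ext4_rec_len_to_disk() above pack directory record lengths that no longer fit in 16 bits (block sizes above 64KiB, up to 256KiB) by carrying bits 16-17 of the length in the two low bits of the on-disk value, with 0 and EXT4_MAX_REC_LEN reserved to mean "whole block". A userspace round-trip of that encoding; endianness and the PAGE_CACHE_SIZE #if are elided, so this is a model rather than the kernel functions verbatim:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_REC_LEN ((1 << 16) - 1)

static unsigned rec_len_from_disk(uint16_t dlen, unsigned blocksize)
{
	unsigned len = dlen;

	if (len == MAX_REC_LEN || len == 0)
		return blocksize;
	return (len & 65532) | ((len & 3) << 16);
}

static uint16_t rec_len_to_disk(unsigned len, unsigned blocksize)
{
	/* mirrors the BUG() guard: aligned, within a <=256KiB block */
	assert(len <= blocksize && blocksize <= (1 << 18) && !(len & 3));
	if (len < 65536)
		return (uint16_t)len;
	if (len == blocksize)
		return blocksize == 65536 ? MAX_REC_LEN : 0;
	/* bits 16..17 of the length ride in the two low bits on disk */
	return (uint16_t)((len & 65532) | ((len >> 16) & 3));
}

int main(void)
{
	unsigned blocksize = 1 << 17;		/* 128KiB blocks */
	unsigned len = 65536 + 1024;		/* doesn't fit in 16 bits */
	uint16_t disk = rec_len_to_disk(len, blocksize);

	printf("len=%u -> disk=%u -> back=%u\n",
	       len, (unsigned)disk, rec_len_from_disk(disk, blocksize));
	assert(rec_len_from_disk(disk, blocksize) == len);
	return 0;
}

The low two bits are free to reuse because record lengths are always 4-byte aligned, which is exactly what the alignment check in the guard enforces.
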
1451/*
1382 * Hash Tree Directory indexing 1452 * Hash Tree Directory indexing
1383 * (c) Daniel Phillips, 2001 1453 * (c) Daniel Phillips, 2001
1384 */ 1454 */
@@ -1510,9 +1580,11 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1510 ext4_init_block_bitmap(sb, NULL, group, desc) 1580 ext4_init_block_bitmap(sb, NULL, group, desc)
1511 1581
1512/* dir.c */ 1582/* dir.c */
1513extern int ext4_check_dir_entry(const char *, struct inode *, 1583extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
1514 struct ext4_dir_entry_2 *, 1584 struct ext4_dir_entry_2 *,
1515 struct buffer_head *, unsigned int); 1585 struct buffer_head *, unsigned int);
1586#define ext4_check_dir_entry(dir, de, bh, offset) \
1587 __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset))
1516extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 1588extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1517 __u32 minor_hash, 1589 __u32 minor_hash,
1518 struct ext4_dir_entry_2 *dirent); 1590 struct ext4_dir_entry_2 *dirent);
@@ -1601,8 +1673,6 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
1601extern int ext4_ext_migrate(struct inode *); 1673extern int ext4_ext_migrate(struct inode *);
1602 1674
1603/* namei.c */ 1675/* namei.c */
1604extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
1605extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
1606extern int ext4_orphan_add(handle_t *, struct inode *); 1676extern int ext4_orphan_add(handle_t *, struct inode *);
1607extern int ext4_orphan_del(handle_t *, struct inode *); 1677extern int ext4_orphan_del(handle_t *, struct inode *);
1608extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, 1678extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
@@ -1616,25 +1686,38 @@ extern int ext4_group_extend(struct super_block *sb,
1616 ext4_fsblk_t n_blocks_count); 1686 ext4_fsblk_t n_blocks_count);
1617 1687
1618/* super.c */ 1688/* super.c */
1619extern void __ext4_error(struct super_block *, const char *, const char *, ...) 1689extern void __ext4_error(struct super_block *, const char *, unsigned int,
1620 __attribute__ ((format (printf, 3, 4))); 1690 const char *, ...)
1621#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message) 1691 __attribute__ ((format (printf, 4, 5)));
1622extern void ext4_error_inode(const char *, struct inode *, const char *, ...) 1692#define ext4_error(sb, message...) __ext4_error(sb, __func__, \
1623 __attribute__ ((format (printf, 3, 4))); 1693 __LINE__, ## message)
1624extern void ext4_error_file(const char *, struct file *, const char *, ...) 1694extern void ext4_error_inode(struct inode *, const char *, unsigned int,
1625 __attribute__ ((format (printf, 3, 4))); 1695 ext4_fsblk_t, const char *, ...)
1626extern void __ext4_std_error(struct super_block *, const char *, int); 1696 __attribute__ ((format (printf, 5, 6)));
1627extern void ext4_abort(struct super_block *, const char *, const char *, ...) 1697extern void ext4_error_file(struct file *, const char *, unsigned int,
1628 __attribute__ ((format (printf, 3, 4))); 1698 const char *, ...)
1629extern void __ext4_warning(struct super_block *, const char *, 1699 __attribute__ ((format (printf, 4, 5)));
1700extern void __ext4_std_error(struct super_block *, const char *,
1701 unsigned int, int);
1702extern void __ext4_abort(struct super_block *, const char *, unsigned int,
1703 const char *, ...)
1704 __attribute__ ((format (printf, 4, 5)));
1705#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \
1706 __LINE__, ## message)
1707extern void __ext4_warning(struct super_block *, const char *, unsigned int,
1630 const char *, ...) 1708 const char *, ...)
1631 __attribute__ ((format (printf, 3, 4))); 1709 __attribute__ ((format (printf, 4, 5)));
1632#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message) 1710#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \
1711 __LINE__, ## message)
1633extern void ext4_msg(struct super_block *, const char *, const char *, ...) 1712extern void ext4_msg(struct super_block *, const char *, const char *, ...)
1634 __attribute__ ((format (printf, 3, 4))); 1713 __attribute__ ((format (printf, 3, 4)));
1635extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, 1714extern void __ext4_grp_locked_error(const char *, unsigned int, \
1636 const char *, const char *, ...) 1715 struct super_block *, ext4_group_t, \
1637 __attribute__ ((format (printf, 4, 5))); 1716 unsigned long, ext4_fsblk_t, \
1717 const char *, ...)
1718 __attribute__ ((format (printf, 7, 8)));
1719#define ext4_grp_locked_error(sb, grp, message...) \
1720 __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
1638extern void ext4_update_dynamic_rev(struct super_block *sb); 1721extern void ext4_update_dynamic_rev(struct super_block *sb);
1639extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, 1722extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
1640 __u32 compat); 1723 __u32 compat);
@@ -1768,7 +1851,7 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
1768#define ext4_std_error(sb, errno) \ 1851#define ext4_std_error(sb, errno) \
1769do { \ 1852do { \
1770 if ((errno)) \ 1853 if ((errno)) \
1771 __ext4_std_error((sb), __func__, (errno)); \ 1854 __ext4_std_error((sb), __func__, __LINE__, (errno)); \
1772} while (0) 1855} while (0)
1773 1856
1774#ifdef CONFIG_SMP 1857#ifdef CONFIG_SMP
@@ -1860,6 +1943,12 @@ static inline void ext4_unlock_group(struct super_block *sb,
1860 spin_unlock(ext4_group_lock_ptr(sb, group)); 1943 spin_unlock(ext4_group_lock_ptr(sb, group));
1861} 1944}
1862 1945
1946static inline void ext4_mark_super_dirty(struct super_block *sb)
1947{
1948 if (EXT4_SB(sb)->s_journal == NULL)
 1949 sb->s_dirt = 1;
1950}
1951
1863/* 1952/*
1864 * Inodes and files operations 1953 * Inodes and files operations
1865 */ 1954 */
@@ -1905,9 +1994,6 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1905 ssize_t len); 1994 ssize_t len);
1906extern int ext4_map_blocks(handle_t *handle, struct inode *inode, 1995extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
1907 struct ext4_map_blocks *map, int flags); 1996 struct ext4_map_blocks *map, int flags);
1908extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1909 sector_t block, unsigned int max_blocks,
1910 struct buffer_head *bh, int flags);
1911extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1997extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1912 __u64 start, __u64 len); 1998 __u64 start, __u64 len);
1913/* move_extent.c */ 1999/* move_extent.c */
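
Among the ext4.h changes above, EXT4_S_ERR_START and EXT4_S_ERR_END bracket the new superblock error-tracking fields with offsetof() so the whole window can be copied or cleared as one byte span of EXT4_S_ERR_LEN. A compact illustration of that offsetof-window idiom on a hypothetical struct (the field names here are invented):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical on-disk record with a contiguous "error window". */
struct record {
	uint32_t magic;
	uint32_t err_count;		/* window start */
	uint32_t first_err_line;
	uint32_t last_err_line;
	char	 label[16];		/* first field after the window */
};

#define ERR_START offsetof(struct record, err_count)
#define ERR_END   offsetof(struct record, label)
#define ERR_LEN   (ERR_END - ERR_START)

int main(void)
{
	struct record on_disk = { .err_count = 3, .first_err_line = 42 };
	struct record in_mem = { 0 };

	/* Preserve error stats across a rewrite of the record, the way
	 * ext4 carries the superblock error fields across updates. */
	memcpy((char *)&in_mem + ERR_START,
	       (const char *)&on_disk + ERR_START, ERR_LEN);
	printf("copied %zu bytes, err_count=%u\n",
	       (size_t)ERR_LEN, in_mem.err_count);
	return 0;
}

Because both markers are offsetof() expressions, inserting a field inside the window automatically grows EXT4_S_ERR_LEN without touching any copy site.
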
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 53d2764d71ca..6e272ef6ba96 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,29 +6,29 @@
6 6
7#include <trace/events/ext4.h> 7#include <trace/events/ext4.h>
8 8
9int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 9int __ext4_journal_get_undo_access(const char *where, unsigned int line,
10 struct buffer_head *bh) 10 handle_t *handle, struct buffer_head *bh)
11{ 11{
12 int err = 0; 12 int err = 0;
13 13
14 if (ext4_handle_valid(handle)) { 14 if (ext4_handle_valid(handle)) {
15 err = jbd2_journal_get_undo_access(handle, bh); 15 err = jbd2_journal_get_undo_access(handle, bh);
16 if (err) 16 if (err)
17 ext4_journal_abort_handle(where, __func__, bh, 17 ext4_journal_abort_handle(where, line, __func__, bh,
18 handle, err); 18 handle, err);
19 } 19 }
20 return err; 20 return err;
21} 21}
22 22
23int __ext4_journal_get_write_access(const char *where, handle_t *handle, 23int __ext4_journal_get_write_access(const char *where, unsigned int line,
24 struct buffer_head *bh) 24 handle_t *handle, struct buffer_head *bh)
25{ 25{
26 int err = 0; 26 int err = 0;
27 27
28 if (ext4_handle_valid(handle)) { 28 if (ext4_handle_valid(handle)) {
29 err = jbd2_journal_get_write_access(handle, bh); 29 err = jbd2_journal_get_write_access(handle, bh);
30 if (err) 30 if (err)
31 ext4_journal_abort_handle(where, __func__, bh, 31 ext4_journal_abort_handle(where, line, __func__, bh,
32 handle, err); 32 handle, err);
33 } 33 }
34 return err; 34 return err;
@@ -46,9 +46,9 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle,
46 * If the handle isn't valid we're not journaling, but we still need to 46 * If the handle isn't valid we're not journaling, but we still need to
47 * call into ext4_journal_revoke() to put the buffer head. 47 * call into ext4_journal_revoke() to put the buffer head.
48 */ 48 */
49int __ext4_forget(const char *where, handle_t *handle, int is_metadata, 49int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
50 struct inode *inode, struct buffer_head *bh, 50 int is_metadata, struct inode *inode,
51 ext4_fsblk_t blocknr) 51 struct buffer_head *bh, ext4_fsblk_t blocknr)
52{ 52{
53 int err; 53 int err;
54 54
@@ -79,8 +79,8 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
79 BUFFER_TRACE(bh, "call jbd2_journal_forget"); 79 BUFFER_TRACE(bh, "call jbd2_journal_forget");
80 err = jbd2_journal_forget(handle, bh); 80 err = jbd2_journal_forget(handle, bh);
81 if (err) 81 if (err)
82 ext4_journal_abort_handle(where, __func__, bh, 82 ext4_journal_abort_handle(where, line, __func__,
83 handle, err); 83 bh, handle, err);
84 return err; 84 return err;
85 } 85 }
86 return 0; 86 return 0;
@@ -92,15 +92,16 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
92 BUFFER_TRACE(bh, "call jbd2_journal_revoke"); 92 BUFFER_TRACE(bh, "call jbd2_journal_revoke");
93 err = jbd2_journal_revoke(handle, blocknr, bh); 93 err = jbd2_journal_revoke(handle, blocknr, bh);
94 if (err) { 94 if (err) {
95 ext4_journal_abort_handle(where, __func__, bh, handle, err); 95 ext4_journal_abort_handle(where, line, __func__,
96 ext4_abort(inode->i_sb, __func__, 96 bh, handle, err);
97 __ext4_abort(inode->i_sb, where, line,
97 "error %d when attempting revoke", err); 98 "error %d when attempting revoke", err);
98 } 99 }
99 BUFFER_TRACE(bh, "exit"); 100 BUFFER_TRACE(bh, "exit");
100 return err; 101 return err;
101} 102}
102 103
103int __ext4_journal_get_create_access(const char *where, 104int __ext4_journal_get_create_access(const char *where, unsigned int line,
104 handle_t *handle, struct buffer_head *bh) 105 handle_t *handle, struct buffer_head *bh)
105{ 106{
106 int err = 0; 107 int err = 0;
@@ -108,22 +109,23 @@ int __ext4_journal_get_create_access(const char *where,
108 if (ext4_handle_valid(handle)) { 109 if (ext4_handle_valid(handle)) {
109 err = jbd2_journal_get_create_access(handle, bh); 110 err = jbd2_journal_get_create_access(handle, bh);
110 if (err) 111 if (err)
111 ext4_journal_abort_handle(where, __func__, bh, 112 ext4_journal_abort_handle(where, line, __func__,
112 handle, err); 113 bh, handle, err);
113 } 114 }
114 return err; 115 return err;
115} 116}
116 117
117int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, 118int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
118 struct inode *inode, struct buffer_head *bh) 119 handle_t *handle, struct inode *inode,
120 struct buffer_head *bh)
119{ 121{
120 int err = 0; 122 int err = 0;
121 123
122 if (ext4_handle_valid(handle)) { 124 if (ext4_handle_valid(handle)) {
123 err = jbd2_journal_dirty_metadata(handle, bh); 125 err = jbd2_journal_dirty_metadata(handle, bh);
124 if (err) 126 if (err)
125 ext4_journal_abort_handle(where, __func__, bh, 127 ext4_journal_abort_handle(where, line, __func__,
126 handle, err); 128 bh, handle, err);
127 } else { 129 } else {
128 if (inode) 130 if (inode)
129 mark_buffer_dirty_inode(bh, inode); 131 mark_buffer_dirty_inode(bh, inode);
@@ -132,14 +134,33 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
132 if (inode && inode_needs_sync(inode)) { 134 if (inode && inode_needs_sync(inode)) {
133 sync_dirty_buffer(bh); 135 sync_dirty_buffer(bh);
134 if (buffer_req(bh) && !buffer_uptodate(bh)) { 136 if (buffer_req(bh) && !buffer_uptodate(bh)) {
135 ext4_error(inode->i_sb, 137 struct ext4_super_block *es;
136 "IO error syncing inode, " 138
137 "inode=%lu, block=%llu", 139 es = EXT4_SB(inode->i_sb)->s_es;
138 inode->i_ino, 140 es->s_last_error_block =
139 (unsigned long long) bh->b_blocknr); 141 cpu_to_le64(bh->b_blocknr);
142 ext4_error_inode(inode, where, line,
143 bh->b_blocknr,
144 "IO error syncing itable block");
140 err = -EIO; 145 err = -EIO;
141 } 146 }
142 } 147 }
143 } 148 }
144 return err; 149 return err;
145} 150}
151
152int __ext4_handle_dirty_super(const char *where, unsigned int line,
153 handle_t *handle, struct super_block *sb)
154{
155 struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
156 int err = 0;
157
158 if (ext4_handle_valid(handle)) {
159 err = jbd2_journal_dirty_metadata(handle, bh);
160 if (err)
161 ext4_journal_abort_handle(where, line, __func__,
162 bh, handle, err);
163 } else
164 sb->s_dirt = 1;
165 return err;
166}
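
Every wrapper in ext4_jbd2.c now threads an extra `unsigned int line` through to the error reporters, and, as the header diff below shows, the public names become macros that capture __func__ and __LINE__ at each call site. The pattern in miniature, with a made-up __journal_op() rather than the real ext4 symbols:

#include <stdio.h>

/* The double-underscore helper takes the caller's location explicitly... */
static int __journal_op(const char *where, unsigned int line, int err)
{
	if (err)
		fprintf(stderr, "error %d in %s:%u\n", err, where, line);
	return err;
}

/* ...and the public name is a macro that records it at every call site,
 * mirroring the ext4_journal_* wrappers above. */
#define journal_op(err) __journal_op(__func__, __LINE__, (err))

int main(void)
{
	journal_op(0);		/* silent */
	journal_op(-5);		/* reports this exact function and line */
	return 0;
}

The payoff is in the error messages: instead of only naming the wrapper that failed, reports can point at the filesystem code that invoked it.
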
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index dade0c024797..b0bd792c58c5 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -122,39 +122,47 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
122/* 122/*
123 * Wrapper functions with which ext4 calls into JBD. 123 * Wrapper functions with which ext4 calls into JBD.
124 */ 124 */
125void ext4_journal_abort_handle(const char *caller, const char *err_fn, 125void ext4_journal_abort_handle(const char *caller, unsigned int line,
126 const char *err_fn,
126 struct buffer_head *bh, handle_t *handle, int err); 127 struct buffer_head *bh, handle_t *handle, int err);
127 128
128int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 129int __ext4_journal_get_undo_access(const char *where, unsigned int line,
129 struct buffer_head *bh); 130 handle_t *handle, struct buffer_head *bh);
130 131
131int __ext4_journal_get_write_access(const char *where, handle_t *handle, 132int __ext4_journal_get_write_access(const char *where, unsigned int line,
132 struct buffer_head *bh); 133 handle_t *handle, struct buffer_head *bh);
133 134
134int __ext4_forget(const char *where, handle_t *handle, int is_metadata, 135int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
135 struct inode *inode, struct buffer_head *bh, 136 int is_metadata, struct inode *inode,
136 ext4_fsblk_t blocknr); 137 struct buffer_head *bh, ext4_fsblk_t blocknr);
137 138
138int __ext4_journal_get_create_access(const char *where, 139int __ext4_journal_get_create_access(const char *where, unsigned int line,
139 handle_t *handle, struct buffer_head *bh); 140 handle_t *handle, struct buffer_head *bh);
140 141
141int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, 142int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
142 struct inode *inode, struct buffer_head *bh); 143 handle_t *handle, struct inode *inode,
144 struct buffer_head *bh);
145
146int __ext4_handle_dirty_super(const char *where, unsigned int line,
147 handle_t *handle, struct super_block *sb);
143 148
144#define ext4_journal_get_undo_access(handle, bh) \ 149#define ext4_journal_get_undo_access(handle, bh) \
145 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 150 __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
146#define ext4_journal_get_write_access(handle, bh) \ 151#define ext4_journal_get_write_access(handle, bh) \
147 __ext4_journal_get_write_access(__func__, (handle), (bh)) 152 __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
148#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ 153#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
149 __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\ 154 __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
150 (block_nr)) 155 (bh), (block_nr))
151#define ext4_journal_get_create_access(handle, bh) \ 156#define ext4_journal_get_create_access(handle, bh) \
152 __ext4_journal_get_create_access(__func__, (handle), (bh)) 157 __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
153#define ext4_handle_dirty_metadata(handle, inode, bh) \ 158#define ext4_handle_dirty_metadata(handle, inode, bh) \
154 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) 159 __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
160 (bh))
161#define ext4_handle_dirty_super(handle, sb) \
162 __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
155 163
156handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 164handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
157int __ext4_journal_stop(const char *where, handle_t *handle); 165int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
158 166
159#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) 167#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
160 168
@@ -207,7 +215,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
207} 215}
208 216
209#define ext4_journal_stop(handle) \ 217#define ext4_journal_stop(handle) \
210 __ext4_journal_stop(__func__, (handle)) 218 __ext4_journal_stop(__func__, __LINE__, (handle))
211 219
212static inline handle_t *ext4_journal_current_handle(void) 220static inline handle_t *ext4_journal_current_handle(void)
213{ 221{
@@ -308,17 +316,15 @@ static inline int ext4_should_writeback_data(struct inode *inode)
308 * This function controls whether or not we should try to go down the 316 * This function controls whether or not we should try to go down the
309 * dioread_nolock code paths, which makes it safe to avoid taking 317 * dioread_nolock code paths, which makes it safe to avoid taking
310 * i_mutex for direct I/O reads. This only works for extent-based 318 * i_mutex for direct I/O reads. This only works for extent-based
311 * files, and it doesn't work for nobh or if data journaling is 319 * files, and it doesn't work if data journaling is enabled, since the
312 * enabled, since the dioread_nolock code uses b_private to pass 320 * dioread_nolock code uses b_private to pass information back to the
313 * information back to the I/O completion handler, and this conflicts 321 * I/O completion handler, and this conflicts with the jbd's use of
314 * with the jbd's use of b_private. 322 * b_private.
315 */ 323 */
316static inline int ext4_should_dioread_nolock(struct inode *inode) 324static inline int ext4_should_dioread_nolock(struct inode *inode)
317{ 325{
318 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) 326 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
319 return 0; 327 return 0;
320 if (test_opt(inode->i_sb, NOBH))
321 return 0;
322 if (!S_ISREG(inode->i_mode)) 328 if (!S_ISREG(inode->i_mode))
323 return 0; 329 return 0;
324 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 330 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bf029c7d5518..06328d3e5717 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -401,9 +401,9 @@ static int ext4_valid_extent_entries(struct inode *inode,
401 return 1; 401 return 1;
402} 402}
403 403
404static int __ext4_ext_check(const char *function, struct inode *inode, 404static int __ext4_ext_check(const char *function, unsigned int line,
405 struct ext4_extent_header *eh, 405 struct inode *inode, struct ext4_extent_header *eh,
406 int depth) 406 int depth)
407{ 407{
408 const char *error_msg; 408 const char *error_msg;
409 int max = 0; 409 int max = 0;
@@ -436,7 +436,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
436 return 0; 436 return 0;
437 437
438corrupted: 438corrupted:
439 ext4_error_inode(function, inode, 439 ext4_error_inode(inode, function, line, 0,
440 "bad header/extent: %s - magic %x, " 440 "bad header/extent: %s - magic %x, "
441 "entries %u, max %u(%u), depth %u(%u)", 441 "entries %u, max %u(%u), depth %u(%u)",
442 error_msg, le16_to_cpu(eh->eh_magic), 442 error_msg, le16_to_cpu(eh->eh_magic),
@@ -447,7 +447,7 @@ corrupted:
447} 447}
448 448
449#define ext4_ext_check(inode, eh, depth) \ 449#define ext4_ext_check(inode, eh, depth) \
450 __ext4_ext_check(__func__, inode, eh, depth) 450 __ext4_ext_check(__func__, __LINE__, inode, eh, depth)
451 451
452int ext4_ext_check_inode(struct inode *inode) 452int ext4_ext_check_inode(struct inode *inode)
453{ 453{
@@ -1083,7 +1083,6 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1083{ 1083{
1084 struct ext4_ext_path *curp = path; 1084 struct ext4_ext_path *curp = path;
1085 struct ext4_extent_header *neh; 1085 struct ext4_extent_header *neh;
1086 struct ext4_extent_idx *fidx;
1087 struct buffer_head *bh; 1086 struct buffer_head *bh;
1088 ext4_fsblk_t newblock; 1087 ext4_fsblk_t newblock;
1089 int err = 0; 1088 int err = 0;
@@ -1144,10 +1143,10 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1144 ext4_idx_store_pblock(curp->p_idx, newblock); 1143 ext4_idx_store_pblock(curp->p_idx, newblock);
1145 1144
1146 neh = ext_inode_hdr(inode); 1145 neh = ext_inode_hdr(inode);
1147 fidx = EXT_FIRST_INDEX(neh);
1148 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", 1146 ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n",
1149 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), 1147 le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max),
1150 le32_to_cpu(fidx->ei_block), idx_pblock(fidx)); 1148 le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block),
1149 idx_pblock(EXT_FIRST_INDEX(neh)));
1151 1150
1152 neh->eh_depth = cpu_to_le16(path->p_depth + 1); 1151 neh->eh_depth = cpu_to_le16(path->p_depth + 1);
1153 err = ext4_ext_dirty(handle, inode, curp); 1152 err = ext4_ext_dirty(handle, inode, curp);
@@ -2954,7 +2953,6 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2954 struct ext4_extent *ex1 = NULL; 2953 struct ext4_extent *ex1 = NULL;
2955 struct ext4_extent *ex2 = NULL; 2954 struct ext4_extent *ex2 = NULL;
2956 struct ext4_extent *ex3 = NULL; 2955 struct ext4_extent *ex3 = NULL;
2957 struct ext4_extent_header *eh;
2958 ext4_lblk_t ee_block, eof_block; 2956 ext4_lblk_t ee_block, eof_block;
2959 unsigned int allocated, ee_len, depth; 2957 unsigned int allocated, ee_len, depth;
2960 ext4_fsblk_t newblock; 2958 ext4_fsblk_t newblock;
@@ -2971,7 +2969,6 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2971 eof_block = map->m_lblk + map->m_len; 2969 eof_block = map->m_lblk + map->m_len;
2972 2970
2973 depth = ext_depth(inode); 2971 depth = ext_depth(inode);
2974 eh = path[depth].p_hdr;
2975 ex = path[depth].p_ext; 2972 ex = path[depth].p_ext;
2976 ee_block = le32_to_cpu(ex->ee_block); 2973 ee_block = le32_to_cpu(ex->ee_block);
2977 ee_len = ext4_ext_get_actual_len(ex); 2974 ee_len = ext4_ext_get_actual_len(ex);
@@ -3058,7 +3055,6 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3058 err = PTR_ERR(path); 3055 err = PTR_ERR(path);
3059 goto out; 3056 goto out;
3060 } 3057 }
3061 eh = path[depth].p_hdr;
3062 ex = path[depth].p_ext; 3058 ex = path[depth].p_ext;
3063 if (ex2 != &newex) 3059 if (ex2 != &newex)
3064 ex2 = ex; 3060 ex2 = ex;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5313ae4cda2d..ee92b66d4558 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -70,7 +70,8 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
71 size_t length = iov_length(iov, nr_segs); 71 size_t length = iov_length(iov, nr_segs);
72 72
73 if (pos > sbi->s_bitmap_maxbytes) 73 if ((pos > sbi->s_bitmap_maxbytes ||
74 (pos == sbi->s_bitmap_maxbytes && length > 0)))
74 return -EFBIG; 75 return -EFBIG;
75 76
76 if (pos + length > sbi->s_bitmap_maxbytes) { 77 if (pos + length > sbi->s_bitmap_maxbytes) {
@@ -123,7 +124,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
123 if (!IS_ERR(cp)) { 124 if (!IS_ERR(cp)) {
124 memcpy(sbi->s_es->s_last_mounted, cp, 125 memcpy(sbi->s_es->s_last_mounted, cp,
125 sizeof(sbi->s_es->s_last_mounted)); 126 sizeof(sbi->s_es->s_last_mounted));
126 sb->s_dirt = 1; 127 ext4_mark_super_dirty(sb);
127 } 128 }
128 } 129 }
129 return dquot_file_open(inode, filp); 130 return dquot_file_open(inode, filp);
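
The ext4_file_write() hunk above tightens the bitmap-file size check: previously only a pos strictly beyond s_bitmap_maxbytes was rejected, so a write starting exactly at the limit slipped through to the truncation path below; now any non-empty write at or past the boundary fails with -EFBIG. A tiny check of the corrected predicate (MAXBYTES is a made-up stand-in for sbi->s_bitmap_maxbytes):

#include <assert.h>
#include <stdio.h>

#define MAXBYTES 1024L	/* hypothetical stand-in for sbi->s_bitmap_maxbytes */

static int write_too_big(long pos, long length)
{
	/* the corrected test from ext4_file_write() */
	return pos > MAXBYTES || (pos == MAXBYTES && length > 0);
}

int main(void)
{
	assert(!write_too_big(1000, 24));	/* ends exactly at the limit */
	assert(!write_too_big(MAXBYTES, 0));	/* empty write at the limit */
	assert(write_too_big(MAXBYTES, 1));	/* used to reach truncation */
	assert(write_too_big(MAXBYTES + 1, 0));	/* starts past the limit */
	puts("boundary cases match the patched check");
	return 0;
}
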
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 25c4b3173fd9..ac377505ed57 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -279,7 +279,7 @@ out:
279 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 279 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
280 if (!fatal) 280 if (!fatal)
281 fatal = err; 281 fatal = err;
282 sb->s_dirt = 1; 282 ext4_mark_super_dirty(sb);
283 } else 283 } else
284 ext4_error(sb, "bit already cleared for inode %lu", ino); 284 ext4_error(sb, "bit already cleared for inode %lu", ino);
285 285
@@ -965,7 +965,7 @@ got:
965 percpu_counter_dec(&sbi->s_freeinodes_counter); 965 percpu_counter_dec(&sbi->s_freeinodes_counter);
966 if (S_ISDIR(mode)) 966 if (S_ISDIR(mode))
967 percpu_counter_inc(&sbi->s_dirs_counter); 967 percpu_counter_inc(&sbi->s_dirs_counter);
968 sb->s_dirt = 1; 968 ext4_mark_super_dirty(sb);
969 969
970 if (sbi->s_log_groups_per_flex) { 970 if (sbi->s_log_groups_per_flex) {
971 flex_group = ext4_flex_group(sbi, group); 971 flex_group = ext4_flex_group(sbi, group);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0afc8c1d8cf3..a0ab3754d0d6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -221,6 +221,7 @@ void ext4_delete_inode(struct inode *inode)
221 "couldn't extend journal (err %d)", err); 221 "couldn't extend journal (err %d)", err);
222 stop_handle: 222 stop_handle:
223 ext4_journal_stop(handle); 223 ext4_journal_stop(handle);
224 ext4_orphan_del(NULL, inode);
224 goto no_delete; 225 goto no_delete;
225 } 226 }
226 } 227 }
@@ -337,9 +338,11 @@ static int ext4_block_to_path(struct inode *inode,
337 return n; 338 return n;
338} 339}
339 340
340static int __ext4_check_blockref(const char *function, struct inode *inode, 341static int __ext4_check_blockref(const char *function, unsigned int line,
342 struct inode *inode,
341 __le32 *p, unsigned int max) 343 __le32 *p, unsigned int max)
342{ 344{
345 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
343 __le32 *bref = p; 346 __le32 *bref = p;
344 unsigned int blk; 347 unsigned int blk;
345 348
@@ -348,8 +351,9 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
348 if (blk && 351 if (blk &&
349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 352 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
350 blk, 1))) { 353 blk, 1))) {
351 ext4_error_inode(function, inode, 354 es->s_last_error_block = cpu_to_le64(blk);
352 "invalid block reference %u", blk); 355 ext4_error_inode(inode, function, line, blk,
356 "invalid block");
353 return -EIO; 357 return -EIO;
354 } 358 }
355 } 359 }
@@ -358,11 +362,13 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
358 362
359 363
360#define ext4_check_indirect_blockref(inode, bh) \ 364#define ext4_check_indirect_blockref(inode, bh) \
361 __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ 365 __ext4_check_blockref(__func__, __LINE__, inode, \
366 (__le32 *)(bh)->b_data, \
362 EXT4_ADDR_PER_BLOCK((inode)->i_sb)) 367 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
363 368
364#define ext4_check_inode_blockref(inode) \ 369#define ext4_check_inode_blockref(inode) \
365 __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \ 370 __ext4_check_blockref(__func__, __LINE__, inode, \
371 EXT4_I(inode)->i_data, \
366 EXT4_NDIR_BLOCKS) 372 EXT4_NDIR_BLOCKS)
367 373
368/** 374/**
@@ -1128,20 +1134,24 @@ void ext4_da_update_reserve_space(struct inode *inode,
1128 ext4_discard_preallocations(inode); 1134 ext4_discard_preallocations(inode);
1129} 1135}
1130 1136
1131static int check_block_validity(struct inode *inode, const char *func, 1137static int __check_block_validity(struct inode *inode, const char *func,
1138 unsigned int line,
1132 struct ext4_map_blocks *map) 1139 struct ext4_map_blocks *map)
1133{ 1140{
1134 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, 1141 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1135 map->m_len)) { 1142 map->m_len)) {
1136 ext4_error_inode(func, inode, 1143 ext4_error_inode(inode, func, line, map->m_pblk,
1137 "lblock %lu mapped to illegal pblock %llu " 1144 "lblock %lu mapped to illegal pblock "
1138 "(length %d)", (unsigned long) map->m_lblk, 1145 "(length %d)", (unsigned long) map->m_lblk,
1139 map->m_pblk, map->m_len); 1146 map->m_len);
1140 return -EIO; 1147 return -EIO;
1141 } 1148 }
1142 return 0; 1149 return 0;
1143} 1150}
1144 1151
1152#define check_block_validity(inode, map) \
1153 __check_block_validity((inode), __func__, __LINE__, (map))
1154
1145/* 1155/*
1146 * Return the number of contiguous dirty pages in a given inode 1156 * Return the number of contiguous dirty pages in a given inode
1147 * starting at page frame idx. 1157 * starting at page frame idx.
@@ -1244,7 +1254,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1244 up_read((&EXT4_I(inode)->i_data_sem)); 1254 up_read((&EXT4_I(inode)->i_data_sem));
1245 1255
1246 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1256 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1247 int ret = check_block_validity(inode, __func__, map); 1257 int ret = check_block_validity(inode, map);
1248 if (ret != 0) 1258 if (ret != 0)
1249 return ret; 1259 return ret;
1250 } 1260 }
@@ -1324,9 +1334,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
1324 1334
1325 up_write((&EXT4_I(inode)->i_data_sem)); 1335 up_write((&EXT4_I(inode)->i_data_sem));
1326 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 1336 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1327 int ret = check_block_validity(inode, 1337 int ret = check_block_validity(inode, map);
1328 "ext4_map_blocks_after_alloc",
1329 map);
1330 if (ret != 0) 1338 if (ret != 0)
1331 return ret; 1339 return ret;
1332 } 1340 }
@@ -1519,9 +1527,25 @@ static int walk_page_buffers(handle_t *handle,
1519static int do_journal_get_write_access(handle_t *handle, 1527static int do_journal_get_write_access(handle_t *handle,
1520 struct buffer_head *bh) 1528 struct buffer_head *bh)
1521{ 1529{
1530 int dirty = buffer_dirty(bh);
1531 int ret;
1532
1522 if (!buffer_mapped(bh) || buffer_freed(bh)) 1533 if (!buffer_mapped(bh) || buffer_freed(bh))
1523 return 0; 1534 return 0;
1524 return ext4_journal_get_write_access(handle, bh); 1535 /*
1536 * __block_prepare_write() could have dirtied some buffers. Clean
1537 * the dirty bit as jbd2_journal_get_write_access() could complain
1538 * otherwise about fs integrity issues. Setting of the dirty bit
1539 * by __block_prepare_write() isn't a real problem here as we clear
1540 * the bit before releasing a page lock and thus writeback cannot
1541 * ever write the buffer.
1542 */
1543 if (dirty)
1544 clear_buffer_dirty(bh);
1545 ret = ext4_journal_get_write_access(handle, bh);
1546 if (!ret && dirty)
1547 ret = ext4_handle_dirty_metadata(handle, NULL, bh);
1548 return ret;
1525} 1549}
1526 1550
1527/* 1551/*
@@ -2194,7 +2218,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2194 BUG_ON(!handle); 2218 BUG_ON(!handle);
2195 2219
2196 /* 2220 /*
2197 * Call ext4_get_blocks() to allocate any delayed allocation 2221 * Call ext4_map_blocks() to allocate any delayed allocation
2198 * blocks, or to convert an uninitialized extent to be 2222 * blocks, or to convert an uninitialized extent to be
2199 * initialized (in the case where we have written into 2223 * initialized (in the case where we have written into
2200 * one or more preallocated blocks). 2224 * one or more preallocated blocks).
@@ -2203,7 +2227,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2203 * indicate that we are on the delayed allocation path. This 2227 * indicate that we are on the delayed allocation path. This
2204 * affects functions in many different parts of the allocation 2228 * affects functions in many different parts of the allocation
2205 * call path. This flag exists primarily because we don't 2229 * call path. This flag exists primarily because we don't
2206 * want to change *many* call functions, so ext4_get_blocks() 2230 * want to change *many* call functions, so ext4_map_blocks()
2207 * will set the magic i_delalloc_reserved_flag once the 2231 * will set the magic i_delalloc_reserved_flag once the
2208 * inode's allocation semaphore is taken. 2232 * inode's allocation semaphore is taken.
2209 * 2233 *
@@ -2221,6 +2245,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2221 2245
2222 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); 2246 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2223 if (blks < 0) { 2247 if (blks < 0) {
2248 struct super_block *sb = mpd->inode->i_sb;
2249
2224 err = blks; 2250 err = blks;
2225 /* 2251 /*
2226 * If get block returns with error we simply 2252 * If get block returns with error we simply
@@ -2231,7 +2257,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2231 return 0; 2257 return 0;
2232 2258
2233 if (err == -ENOSPC && 2259 if (err == -ENOSPC &&
2234 ext4_count_free_blocks(mpd->inode->i_sb)) { 2260 ext4_count_free_blocks(sb)) {
2235 mpd->retval = err; 2261 mpd->retval = err;
2236 return 0; 2262 return 0;
2237 } 2263 }
@@ -2243,16 +2269,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2243 * writepage and writepages will again try to write 2269 * writepage and writepages will again try to write
2244 * the same. 2270 * the same.
2245 */ 2271 */
2246 ext4_msg(mpd->inode->i_sb, KERN_CRIT, 2272 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
2247 "delayed block allocation failed for inode %lu at " 2273 ext4_msg(sb, KERN_CRIT,
2248 "logical offset %llu with max blocks %zd with " 2274 "delayed block allocation failed for inode %lu "
2249 "error %d", mpd->inode->i_ino, 2275 "at logical offset %llu with max blocks %zd "
2250 (unsigned long long) next, 2276 "with error %d", mpd->inode->i_ino,
2251 mpd->b_size >> mpd->inode->i_blkbits, err); 2277 (unsigned long long) next,
2252 printk(KERN_CRIT "This should not happen!! " 2278 mpd->b_size >> mpd->inode->i_blkbits, err);
2253 "Data will be lost\n"); 2279 ext4_msg(sb, KERN_CRIT,
2254 if (err == -ENOSPC) { 2280 "This should not happen!! Data will be lost\n");
2255 ext4_print_free_blocks(mpd->inode); 2281 if (err == -ENOSPC)
2282 ext4_print_free_blocks(mpd->inode);
2256 } 2283 }
2257 /* invalidate all the pages */ 2284 /* invalidate all the pages */
2258 ext4_da_block_invalidatepages(mpd, next, 2285 ext4_da_block_invalidatepages(mpd, next,
@@ -2320,7 +2347,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2320 * XXX Don't go larger than mballoc is willing to allocate 2347 * XXX Don't go larger than mballoc is willing to allocate
2321 * This is a stopgap solution. We eventually need to fold 2348 * This is a stopgap solution. We eventually need to fold
2322 * mpage_da_submit_io() into this function and then call 2349 * mpage_da_submit_io() into this function and then call
2323 * ext4_get_blocks() multiple times in a loop 2350 * ext4_map_blocks() multiple times in a loop
2324 */ 2351 */
2325 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) 2352 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2326 goto flush_it; 2353 goto flush_it;
@@ -2553,18 +2580,16 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2553/* 2580/*
2554 * This function is used as a standard get_block_t callback function 2581 * This function is used as a standard get_block_t callback function
2555 * when there is no desire to allocate any blocks. It is used as a 2582 * when there is no desire to allocate any blocks. It is used as a
2556 * callback function for block_prepare_write(), nobh_writepage(), and 2583 * callback function for block_prepare_write() and block_write_full_page().
2557 * block_write_full_page(). These functions should only try to map a 2584 * These functions should only try to map a single block at a time.
2558 * single block at a time.
2559 * 2585 *
2560 * Since this function doesn't do block allocations even if the caller 2586 * Since this function doesn't do block allocations even if the caller
2561 * requests it by passing in create=1, it is critically important that 2587 * requests it by passing in create=1, it is critically important that
2562 * any caller checks to make sure that any buffer heads returned 2588 * any caller checks to make sure that any buffer heads returned
2563 * by this function are either all already mapped or marked for 2589 * by this function are either all already mapped or marked for
2564 * delayed allocation before calling nobh_writepage() or 2590 * delayed allocation before calling block_write_full_page(). Otherwise,
2565 * block_write_full_page(). Otherwise, b_blocknr could be left 2591 * b_blocknr could be left uninitialized, and the page write functions will
2566 * uninitialized, and the page write functions will be taken by 2592 * be taken by surprise.
2567 * surprise.
2568 */ 2593 */
2569static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2594static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2570 struct buffer_head *bh_result, int create) 2595 struct buffer_head *bh_result, int create)
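
The body (elided by this hunk) reduces to a one-line forward; here is a minimal sketch of the contract the comment describes, under a hypothetical name so as not to claim it is the exact source:

	/* sketch: a get_block_t callback that never allocates */
	static int noalloc_get_block_sketch(struct inode *inode, sector_t iblock,
					    struct buffer_head *bh_result,
					    int create)
	{
		BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
		/* force create=0 so only existing mappings are returned */
		return ext4_get_block(inode, iblock, bh_result, 0);
	}

Callers then inspect buffer_mapped()/buffer_delay() on bh_result before handing the page to block_write_full_page().
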
@@ -2749,9 +2774,7 @@ static int ext4_writepage(struct page *page,
2749 return __ext4_journalled_writepage(page, len); 2774 return __ext4_journalled_writepage(page, len);
2750 } 2775 }
2751 2776
2752 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2777 if (page_bufs && buffer_uninit(page_bufs)) {
2753 ret = nobh_writepage(page, noalloc_get_block_write, wbc);
2754 else if (page_bufs && buffer_uninit(page_bufs)) {
2755 ext4_set_bh_endio(page_bufs, inode); 2778 ext4_set_bh_endio(page_bufs, inode);
2756 ret = block_write_full_page_endio(page, noalloc_get_block_write, 2779 ret = block_write_full_page_endio(page, noalloc_get_block_write,
2757 wbc, ext4_end_io_buffer_write); 2780 wbc, ext4_end_io_buffer_write);
@@ -3146,13 +3169,10 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3146 int ret, retries = 0; 3169 int ret, retries = 0;
3147 struct page *page; 3170 struct page *page;
3148 pgoff_t index; 3171 pgoff_t index;
3149 unsigned from, to;
3150 struct inode *inode = mapping->host; 3172 struct inode *inode = mapping->host;
3151 handle_t *handle; 3173 handle_t *handle;
3152 3174
3153 index = pos >> PAGE_CACHE_SHIFT; 3175 index = pos >> PAGE_CACHE_SHIFT;
3154 from = pos & (PAGE_CACHE_SIZE - 1);
3155 to = from + len;
3156 3176
3157 if (ext4_nonda_switch(inode->i_sb)) { 3177 if (ext4_nonda_switch(inode->i_sb)) {
3158 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; 3178 *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
@@ -3668,6 +3688,8 @@ static int ext4_end_io_nolock(ext4_io_end_t *io)
3668 return ret; 3688 return ret;
3669 } 3689 }
3670 3690
3691 if (io->iocb)
3692 aio_complete(io->iocb, io->result, 0);
3671 /* clear the DIO AIO unwritten flag */ 3693 /* clear the DIO AIO unwritten flag */
3672 io->flag = 0; 3694 io->flag = 0;
3673 return ret; 3695 return ret;
@@ -3767,6 +3789,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
3767 io->offset = 0; 3789 io->offset = 0;
3768 io->size = 0; 3790 io->size = 0;
3769 io->page = NULL; 3791 io->page = NULL;
3792 io->iocb = NULL;
3793 io->result = 0;
3770 INIT_WORK(&io->work, ext4_end_io_work); 3794 INIT_WORK(&io->work, ext4_end_io_work);
3771 INIT_LIST_HEAD(&io->list); 3795 INIT_LIST_HEAD(&io->list);
3772 } 3796 }
@@ -3796,12 +3820,18 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3796 if (io_end->flag != EXT4_IO_UNWRITTEN){ 3820 if (io_end->flag != EXT4_IO_UNWRITTEN){
3797 ext4_free_io_end(io_end); 3821 ext4_free_io_end(io_end);
3798 iocb->private = NULL; 3822 iocb->private = NULL;
3799 goto out; 3823out:
3824 if (is_async)
3825 aio_complete(iocb, ret, 0);
3826 return;
3800 } 3827 }
3801 3828
3802 io_end->offset = offset; 3829 io_end->offset = offset;
3803 io_end->size = size; 3830 io_end->size = size;
3804 io_end->flag = EXT4_IO_UNWRITTEN; 3831 if (is_async) {
3832 io_end->iocb = iocb;
3833 io_end->result = ret;
3834 }
3805 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 3835 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3806 3836
3807 /* queue the work to convert unwritten extents to written */ 3837 /* queue the work to convert unwritten extents to written */
@@ -3813,9 +3843,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3813 list_add_tail(&io_end->list, &ei->i_completed_io_list); 3843 list_add_tail(&io_end->list, &ei->i_completed_io_list);
3814 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 3844 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
3815 iocb->private = NULL; 3845 iocb->private = NULL;
3816out:
3817 if (is_async)
3818 aio_complete(iocb, ret, 0);
3819} 3846}
3820 3847
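
The goto-into-branch above is compact but easy to misread; flattened, the new completion logic of ext4_end_io_dio() is equivalent to this sketch:

	if (io_end->flag != EXT4_IO_UNWRITTEN) {
		/* nothing to convert: free the io_end and, for async
		 * requests, complete the aio immediately */
		ext4_free_io_end(io_end);
		iocb->private = NULL;
		if (is_async)
			aio_complete(iocb, ret, 0);
		return;
	}

	/* unwritten extents: stash the iocb in the io_end so the
	 * workqueue calls aio_complete() only after conversion,
	 * via ext4_end_io_nolock() shown earlier */
	if (is_async) {
		io_end->iocb = iocb;
		io_end->result = ret;
	}

This closes the window where an AIO submitter could see its request completed before the unwritten extents backing it were converted.
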
3821static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) 3848static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
@@ -3941,7 +3968,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3941 return -ENOMEM; 3968 return -ENOMEM;
3942 /* 3969 /*
3943 * we save the io structure for current async 3970 * we save the io structure for current async
3944 * direct IO, so that later ext4_get_blocks() 3971 * direct IO, so that later ext4_map_blocks()
3945 * could flag the io structure whether there 3972 * could flag the io structure whether there
3946 * are unwritten extents that need to be converted 3973 * are unwritten extents that need to be converted
3947 * when IO is completed. 3974 * when IO is completed.
@@ -4132,17 +4159,6 @@ int ext4_block_truncate_page(handle_t *handle,
4132 length = blocksize - (offset & (blocksize - 1)); 4159 length = blocksize - (offset & (blocksize - 1));
4133 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 4160 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
4134 4161
4135 /*
4136 * For "nobh" option, we can only work if we don't need to
4137 * read-in the page - otherwise we create buffers to do the IO.
4138 */
4139 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
4140 ext4_should_writeback_data(inode) && PageUptodate(page)) {
4141 zero_user(page, offset, length);
4142 set_page_dirty(page);
4143 goto unlock;
4144 }
4145
4146 if (!page_has_buffers(page)) 4162 if (!page_has_buffers(page))
4147 create_empty_buffers(page, blocksize, 0); 4163 create_empty_buffers(page, blocksize, 0);
4148 4164
@@ -4492,9 +4508,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4492 * (should be rare). 4508 * (should be rare).
4493 */ 4509 */
4494 if (!bh) { 4510 if (!bh) {
4495 EXT4_ERROR_INODE(inode, 4511 EXT4_ERROR_INODE_BLOCK(inode, nr,
4496 "Read failure block=%llu", 4512 "Read failure");
4497 (unsigned long long) nr);
4498 continue; 4513 continue;
4499 } 4514 }
4500 4515
@@ -4506,27 +4521,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4506 depth); 4521 depth);
4507 4522
4508 /* 4523 /*
4509 * We've probably journalled the indirect block several
4510 * times during the truncate. But it's no longer
4511 * needed and we now drop it from the transaction via
4512 * jbd2_journal_revoke().
4513 *
4514 * That's easy if it's exclusively part of this
4515 * transaction. But if it's part of the committing
4516 * transaction then jbd2_journal_forget() will simply
4517 * brelse() it. That means that if the underlying
4518 * block is reallocated in ext4_get_block(),
4519 * unmap_underlying_metadata() will find this block
4520 * and will try to get rid of it. damn, damn.
4521 *
4522 * If this block has already been committed to the
4523 * journal, a revoke record will be written. And
4524 * revoke records must be emitted *before* clearing
4525 * this block's bit in the bitmaps.
4526 */
4527 ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
4528
4529 /*
4530 * Everything below this pointer has been 4524 * Everything below this pointer has been
4531 * released. Now let this top-of-subtree go. 4525 * released. Now let this top-of-subtree go.
4532 * 4526 *
@@ -4550,8 +4544,20 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4550 blocks_for_truncate(inode)); 4544 blocks_for_truncate(inode));
4551 } 4545 }
4552 4546
4547 /*
4548 * The forget flag here is critical because if
4549 * we are journaling (and not doing data
4550 * journaling), we have to make sure a revoke
4551 * record is written to prevent the journal
4552 * replay from overwriting the (former)
4553 * indirect block if it gets reallocated as a
4554 * data block. This must happen in the same
4555 * transaction where the data blocks are
4556 * actually freed.
4557 */
4553 ext4_free_blocks(handle, inode, 0, nr, 1, 4558 ext4_free_blocks(handle, inode, 0, nr, 1,
4554 EXT4_FREE_BLOCKS_METADATA); 4559 EXT4_FREE_BLOCKS_METADATA|
4560 EXT4_FREE_BLOCKS_FORGET);
4555 4561
4556 if (parent_bh) { 4562 if (parent_bh) {
4557 /* 4563 /*
@@ -4809,8 +4815,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4809 4815
4810 bh = sb_getblk(sb, block); 4816 bh = sb_getblk(sb, block);
4811 if (!bh) { 4817 if (!bh) {
4812 EXT4_ERROR_INODE(inode, "unable to read inode block - " 4818 EXT4_ERROR_INODE_BLOCK(inode, block,
4813 "block %llu", block); 4819 "unable to read itable block");
4814 return -EIO; 4820 return -EIO;
4815 } 4821 }
4816 if (!buffer_uptodate(bh)) { 4822 if (!buffer_uptodate(bh)) {
@@ -4908,8 +4914,8 @@ make_io:
4908 submit_bh(READ_META, bh); 4914 submit_bh(READ_META, bh);
4909 wait_on_buffer(bh); 4915 wait_on_buffer(bh);
4910 if (!buffer_uptodate(bh)) { 4916 if (!buffer_uptodate(bh)) {
4911 EXT4_ERROR_INODE(inode, "unable to read inode " 4917 EXT4_ERROR_INODE_BLOCK(inode, block,
4912 "block %llu", block); 4918 "unable to read itable block");
4913 brelse(bh); 4919 brelse(bh);
4914 return -EIO; 4920 return -EIO;
4915 } 4921 }
@@ -4980,7 +4986,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
4980 /* we are using combined 48 bit field */ 4986 /* we are using combined 48 bit field */
4981 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | 4987 i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
4982 le32_to_cpu(raw_inode->i_blocks_lo); 4988 le32_to_cpu(raw_inode->i_blocks_lo);
4983 if (ei->i_flags & EXT4_HUGE_FILE_FL) { 4989 if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) {
4984 /* i_blocks is in units of file system block size */ 4990 /* i_blocks is in units of file system block size */
4985 return i_blocks << (inode->i_blkbits - 9); 4991 return i_blocks << (inode->i_blkbits - 9);
4986 } else { 4992 } else {
@@ -5076,7 +5082,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5076 transaction_t *transaction; 5082 transaction_t *transaction;
5077 tid_t tid; 5083 tid_t tid;
5078 5084
5079 spin_lock(&journal->j_state_lock); 5085 read_lock(&journal->j_state_lock);
5080 if (journal->j_running_transaction) 5086 if (journal->j_running_transaction)
5081 transaction = journal->j_running_transaction; 5087 transaction = journal->j_running_transaction;
5082 else 5088 else
@@ -5085,7 +5091,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5085 tid = transaction->t_tid; 5091 tid = transaction->t_tid;
5086 else 5092 else
5087 tid = journal->j_commit_sequence; 5093 tid = journal->j_commit_sequence;
5088 spin_unlock(&journal->j_state_lock); 5094 read_unlock(&journal->j_state_lock);
5089 ei->i_sync_tid = tid; 5095 ei->i_sync_tid = tid;
5090 ei->i_datasync_tid = tid; 5096 ei->i_datasync_tid = tid;
5091 } 5097 }
@@ -5130,7 +5136,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5130 ei->i_file_acl); 5136 ei->i_file_acl);
5131 ret = -EIO; 5137 ret = -EIO;
5132 goto bad_inode; 5138 goto bad_inode;
5133 } else if (ei->i_flags & EXT4_EXTENTS_FL) { 5139 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
5134 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 5140 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
5135 (S_ISLNK(inode->i_mode) && 5141 (S_ISLNK(inode->i_mode) &&
5136 !ext4_inode_is_fast_symlink(inode))) 5142 !ext4_inode_is_fast_symlink(inode)))
@@ -5410,9 +5416,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5410 if (wbc->sync_mode == WB_SYNC_ALL) 5416 if (wbc->sync_mode == WB_SYNC_ALL)
5411 sync_dirty_buffer(iloc.bh); 5417 sync_dirty_buffer(iloc.bh);
5412 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5418 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5413 EXT4_ERROR_INODE(inode, 5419 EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
5414 "IO error syncing inode (block=%llu)", 5420 "IO error syncing inode");
5415 (unsigned long long) iloc.bh->b_blocknr);
5416 err = -EIO; 5421 err = -EIO;
5417 } 5422 }
5418 brelse(iloc.bh); 5423 brelse(iloc.bh);
@@ -5487,10 +5492,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5487 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 5492 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5488 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5493 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5489 5494
5490 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 5495 if (attr->ia_size > sbi->s_bitmap_maxbytes)
5491 error = -EFBIG; 5496 return -EFBIG;
5492 goto err_out;
5493 }
5494 } 5497 }
5495 } 5498 }
5496 5499
@@ -5692,7 +5695,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
5692 * Calculate the journal credits for a chunk of data modification. 5695 * Calculate the journal credits for a chunk of data modification.
5693 * 5696 *
5694 * This is called from DIO, fallocate or whoever calling 5697 * This is called from DIO, fallocate or whoever calling
5695 * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks. 5698 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
5696 * 5699 *
5697 * journal buffers for data blocks are not included here, as DIO 5700 * journal buffers for data blocks are not included here, as DIO
5698 * and fallocate do no need to journal data buffers. 5701 * and fallocate do no need to journal data buffers.
@@ -5758,7 +5761,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
5758{ 5761{
5759 struct ext4_inode *raw_inode; 5762 struct ext4_inode *raw_inode;
5760 struct ext4_xattr_ibody_header *header; 5763 struct ext4_xattr_ibody_header *header;
5761 struct ext4_xattr_entry *entry;
5762 5764
5763 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) 5765 if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
5764 return 0; 5766 return 0;
@@ -5766,7 +5768,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
5766 raw_inode = ext4_raw_inode(&iloc); 5768 raw_inode = ext4_raw_inode(&iloc);
5767 5769
5768 header = IHDR(inode, raw_inode); 5770 header = IHDR(inode, raw_inode);
5769 entry = IFIRST(header);
5770 5771
5771 /* No extended attributes present */ 5772 /* No extended attributes present */
5772 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || 5773 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0e83dfd351d5..4b4ad4b7ce57 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -446,10 +446,11 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
446 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 446 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
447 blocknr += first + i; 447 blocknr += first + i;
448 ext4_grp_locked_error(sb, e4b->bd_group, 448 ext4_grp_locked_error(sb, e4b->bd_group,
449 __func__, "double-free of inode" 449 inode ? inode->i_ino : 0,
450 " %lu's block %llu(bit %u in group %u)", 450 blocknr,
451 inode ? inode->i_ino : 0, blocknr, 451 "freeing block already freed "
452 first + i, e4b->bd_group); 452 "(bit %u)",
453 first + i);
453 } 454 }
454 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); 455 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
455 } 456 }
@@ -712,9 +713,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
712 grp->bb_fragments = fragments; 713 grp->bb_fragments = fragments;
713 714
714 if (free != grp->bb_free) { 715 if (free != grp->bb_free) {
715 ext4_grp_locked_error(sb, group, __func__, 716 ext4_grp_locked_error(sb, group, 0, 0,
716 "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", 717 "%u blocks in bitmap, %u in gd",
717 group, free, grp->bb_free); 718 free, grp->bb_free);
718 /* 719 /*
719 * If we intend to continue, we consider group descriptor 720
720 * corrupt and update bb_free using bitmap value 721 * corrupt and update bb_free using bitmap value
@@ -1296,10 +1297,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1296 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1297 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1297 blocknr += block; 1298 blocknr += block;
1298 ext4_grp_locked_error(sb, e4b->bd_group, 1299 ext4_grp_locked_error(sb, e4b->bd_group,
1299 __func__, "double-free of inode" 1300 inode ? inode->i_ino : 0,
1300 " %lu's block %llu(bit %u in group %u)", 1301 blocknr,
1301 inode ? inode->i_ino : 0, blocknr, block, 1302 "freeing already freed block "
1302 e4b->bd_group); 1303 "(bit %u)", block);
1303 } 1304 }
1304 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1305 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1305 e4b->bd_info->bb_counters[order]++; 1306 e4b->bd_info->bb_counters[order]++;
@@ -1788,8 +1789,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1788 * free blocks even though group info says we 1789 * free blocks even though group info says we
1789 * have free blocks 1790 * have free blocks
1790 */ 1791 */
1791 ext4_grp_locked_error(sb, e4b->bd_group, 1792 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1792 __func__, "%d free blocks as per " 1793 "%d free blocks as per "
1793 "group info. But bitmap says 0", 1794 "group info. But bitmap says 0",
1794 free); 1795 free);
1795 break; 1796 break;
@@ -1798,8 +1799,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1798 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1799 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
1799 BUG_ON(ex.fe_len <= 0); 1800 BUG_ON(ex.fe_len <= 0);
1800 if (free < ex.fe_len) { 1801 if (free < ex.fe_len) {
1801 ext4_grp_locked_error(sb, e4b->bd_group, 1802 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1802 __func__, "%d free blocks as per " 1803 "%d free blocks as per "
1803 "group info. But got %d blocks", 1804 "group info. But got %d blocks",
1804 free, ex.fe_len); 1805 free, ex.fe_len);
1805 /* 1806 /*
@@ -1821,8 +1822,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1821 1822
1822/* 1823/*
1823 * This is a special case for storages like raid5 1824 * This is a special case for storages like raid5
1824 * we try to find stripe-aligned chunks for stripe-size requests 1825 * we try to find stripe-aligned chunks for stripe-size-multiple requests
1825 * XXX should do so at least for multiples of stripe size as well
1826 */ 1826 */
1827static noinline_for_stack 1827static noinline_for_stack
1828void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 1828void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
@@ -1999,7 +1999,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1999 ext4_group_t ngroups, group, i; 1999 ext4_group_t ngroups, group, i;
2000 int cr; 2000 int cr;
2001 int err = 0; 2001 int err = 0;
2002 int bsbits;
2003 struct ext4_sb_info *sbi; 2002 struct ext4_sb_info *sbi;
2004 struct super_block *sb; 2003 struct super_block *sb;
2005 struct ext4_buddy e4b; 2004 struct ext4_buddy e4b;
@@ -2041,8 +2040,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2041 ac->ac_2order = i - 1; 2040 ac->ac_2order = i - 1;
2042 } 2041 }
2043 2042
2044 bsbits = ac->ac_sb->s_blocksize_bits;
2045
2046 /* if stream allocation is enabled, use global goal */ 2043 /* if stream allocation is enabled, use global goal */
2047 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2044 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2048 /* TBD: may be hot point */ 2045 /* TBD: may be hot point */
@@ -2094,8 +2091,8 @@ repeat:
2094 ac->ac_groups_scanned++; 2091 ac->ac_groups_scanned++;
2095 if (cr == 0) 2092 if (cr == 0)
2096 ext4_mb_simple_scan_group(ac, &e4b); 2093 ext4_mb_simple_scan_group(ac, &e4b);
2097 else if (cr == 1 && 2094 else if (cr == 1 && sbi->s_stripe &&
2098 ac->ac_g_ex.fe_len == sbi->s_stripe) 2095 !(ac->ac_g_ex.fe_len % sbi->s_stripe))
2099 ext4_mb_scan_aligned(ac, &e4b); 2096 ext4_mb_scan_aligned(ac, &e4b);
2100 else 2097 else
2101 ext4_mb_complex_scan_group(ac, &e4b); 2098 ext4_mb_complex_scan_group(ac, &e4b);
@@ -2221,7 +2218,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2221 2218
2222 rc = seq_open(file, &ext4_mb_seq_groups_ops); 2219 rc = seq_open(file, &ext4_mb_seq_groups_ops);
2223 if (rc == 0) { 2220 if (rc == 0) {
2224 struct seq_file *m = (struct seq_file *)file->private_data; 2221 struct seq_file *m = file->private_data;
2225 m->private = sb; 2222 m->private = sb;
2226 } 2223 }
2227 return rc; 2224 return rc;
@@ -2560,6 +2557,22 @@ int ext4_mb_release(struct super_block *sb)
2560 return 0; 2557 return 0;
2561} 2558}
2562 2559
2560static inline void ext4_issue_discard(struct super_block *sb,
2561 ext4_group_t block_group, ext4_grpblk_t block, int count)
2562{
2563 int ret;
2564 ext4_fsblk_t discard_block;
2565
2566 discard_block = block + ext4_group_first_block_no(sb, block_group);
2567 trace_ext4_discard_blocks(sb,
2568 (unsigned long long) discard_block, count);
2569 ret = sb_issue_discard(sb, discard_block, count);
2570 if (ret == EOPNOTSUPP) {
2571 ext4_warning(sb, "discard not supported, disabling");
2572 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2573 }
2574}
2575
2563/* 2576/*
2564 * This function is called by the jbd2 layer once the commit has finished, 2577 * This function is called by the jbd2 layer once the commit has finished,
2565 * so we know we can free the blocks that were released with that commit. 2578 * so we know we can free the blocks that were released with that commit.
@@ -2579,22 +2592,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2579 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2592 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2580 entry->count, entry->group, entry); 2593 entry->count, entry->group, entry);
2581 2594
2582 if (test_opt(sb, DISCARD)) { 2595 if (test_opt(sb, DISCARD))
2583 int ret; 2596 ext4_issue_discard(sb, entry->group,
2584 ext4_fsblk_t discard_block; 2597 entry->start_blk, entry->count);
2585
2586 discard_block = entry->start_blk +
2587 ext4_group_first_block_no(sb, entry->group);
2588 trace_ext4_discard_blocks(sb,
2589 (unsigned long long)discard_block,
2590 entry->count);
2591 ret = sb_issue_discard(sb, discard_block, entry->count);
2592 if (ret == EOPNOTSUPP) {
2593 ext4_warning(sb,
2594 "discard not supported, disabling");
2595 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2596 }
2597 }
2598 2598
2599 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2599 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2600 /* we expect to find existing buddy because it's pinned */ 2600 /* we expect to find existing buddy because it's pinned */
@@ -2712,7 +2712,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2712 handle_t *handle, unsigned int reserv_blks) 2712 handle_t *handle, unsigned int reserv_blks)
2713{ 2713{
2714 struct buffer_head *bitmap_bh = NULL; 2714 struct buffer_head *bitmap_bh = NULL;
2715 struct ext4_super_block *es;
2716 struct ext4_group_desc *gdp; 2715 struct ext4_group_desc *gdp;
2717 struct buffer_head *gdp_bh; 2716 struct buffer_head *gdp_bh;
2718 struct ext4_sb_info *sbi; 2717 struct ext4_sb_info *sbi;
@@ -2725,8 +2724,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2725 2724
2726 sb = ac->ac_sb; 2725 sb = ac->ac_sb;
2727 sbi = EXT4_SB(sb); 2726 sbi = EXT4_SB(sb);
2728 es = sbi->s_es;
2729
2730 2727
2731 err = -EIO; 2728 err = -EIO;
2732 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); 2729 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
@@ -2812,7 +2809,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2812 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); 2809 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
2813 2810
2814out_err: 2811out_err:
2815 sb->s_dirt = 1; 2812 ext4_mark_super_dirty(sb);
2816 brelse(bitmap_bh); 2813 brelse(bitmap_bh);
2817 return err; 2814 return err;
2818} 2815}
@@ -2850,7 +2847,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2850 int bsbits, max; 2847 int bsbits, max;
2851 ext4_lblk_t end; 2848 ext4_lblk_t end;
2852 loff_t size, orig_size, start_off; 2849 loff_t size, orig_size, start_off;
2853 ext4_lblk_t start, orig_start; 2850 ext4_lblk_t start;
2854 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 2851 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
2855 struct ext4_prealloc_space *pa; 2852 struct ext4_prealloc_space *pa;
2856 2853
@@ -2881,6 +2878,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2881 size = size << bsbits; 2878 size = size << bsbits;
2882 if (size < i_size_read(ac->ac_inode)) 2879 if (size < i_size_read(ac->ac_inode))
2883 size = i_size_read(ac->ac_inode); 2880 size = i_size_read(ac->ac_inode);
2881 orig_size = size;
2884 2882
2885 /* max size of free chunks */ 2883 /* max size of free chunks */
2886 max = 2 << bsbits; 2884 max = 2 << bsbits;
@@ -2922,8 +2920,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2922 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; 2920 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
2923 size = ac->ac_o_ex.fe_len << bsbits; 2921 size = ac->ac_o_ex.fe_len << bsbits;
2924 } 2922 }
2925 orig_size = size = size >> bsbits; 2923 size = size >> bsbits;
2926 orig_start = start = start_off >> bsbits; 2924 start = start_off >> bsbits;
2927 2925
2928 /* don't cover already allocated blocks in selected range */ 2926 /* don't cover already allocated blocks in selected range */
2929 if (ar->pleft && start <= ar->lleft) { 2927 if (ar->pleft && start <= ar->lleft) {
@@ -3547,7 +3545,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3547 ext4_group_t group; 3545 ext4_group_t group;
3548 ext4_grpblk_t bit; 3546 ext4_grpblk_t bit;
3549 unsigned long long grp_blk_start; 3547 unsigned long long grp_blk_start;
3550 sector_t start;
3551 int err = 0; 3548 int err = 0;
3552 int free = 0; 3549 int free = 0;
3553 3550
@@ -3567,10 +3564,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3567 if (bit >= end) 3564 if (bit >= end)
3568 break; 3565 break;
3569 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3566 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3570 start = ext4_group_first_block_no(sb, group) + bit;
3571 mb_debug(1, " free preallocated %u/%u in group %u\n", 3567 mb_debug(1, " free preallocated %u/%u in group %u\n",
3572 (unsigned) start, (unsigned) next - bit, 3568 (unsigned) ext4_group_first_block_no(sb, group) + bit,
3573 (unsigned) group); 3569 (unsigned) next - bit, (unsigned) group);
3574 free += next - bit; 3570 free += next - bit;
3575 3571
3576 if (ac) { 3572 if (ac) {
@@ -3581,7 +3577,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3581 trace_ext4_mballoc_discard(ac); 3577 trace_ext4_mballoc_discard(ac);
3582 } 3578 }
3583 3579
3584 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, 3580 trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
3585 next - bit); 3581 next - bit);
3586 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3582 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3587 bit = next + 1; 3583 bit = next + 1;
@@ -3591,8 +3587,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3591 pa, (unsigned long) pa->pa_lstart, 3587 pa, (unsigned long) pa->pa_lstart,
3592 (unsigned long) pa->pa_pstart, 3588 (unsigned long) pa->pa_pstart,
3593 (unsigned long) pa->pa_len); 3589 (unsigned long) pa->pa_len);
3594 ext4_grp_locked_error(sb, group, 3590 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3595 __func__, "free %u, pa_free %u",
3596 free, pa->pa_free); 3591 free, pa->pa_free);
3597 /* 3592 /*
3598 * pa is already deleted so we use the value obtained 3593 * pa is already deleted so we use the value obtained
@@ -3613,7 +3608,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3613 ext4_group_t group; 3608 ext4_group_t group;
3614 ext4_grpblk_t bit; 3609 ext4_grpblk_t bit;
3615 3610
3616 trace_ext4_mb_release_group_pa(ac, pa); 3611 trace_ext4_mb_release_group_pa(sb, ac, pa);
3617 BUG_ON(pa->pa_deleted == 0); 3612 BUG_ON(pa->pa_deleted == 0);
3618 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3613 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3619 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3614 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -3889,6 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3889 struct super_block *sb = ac->ac_sb; 3884 struct super_block *sb = ac->ac_sb;
3890 ext4_group_t ngroups, i; 3885 ext4_group_t ngroups, i;
3891 3886
3887 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
3888 return;
3889
3892 printk(KERN_ERR "EXT4-fs: Can't allocate:" 3890 printk(KERN_ERR "EXT4-fs: Can't allocate:"
3893 " Allocation context details:\n"); 3891 " Allocation context details:\n");
3894 printk(KERN_ERR "EXT4-fs: status %d flags %d\n", 3892 printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
@@ -4255,7 +4253,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4255 * to usual allocation 4253 * to usual allocation
4256 */ 4254 */
4257ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 4255ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4258 struct ext4_allocation_request *ar, int *errp) 4256 struct ext4_allocation_request *ar, int *errp)
4259{ 4257{
4260 int freed; 4258 int freed;
4261 struct ext4_allocation_context *ac = NULL; 4259 struct ext4_allocation_context *ac = NULL;
@@ -4299,7 +4297,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4299 inquota = ar->len; 4297 inquota = ar->len;
4300 if (ar->len == 0) { 4298 if (ar->len == 0) {
4301 *errp = -EDQUOT; 4299 *errp = -EDQUOT;
4302 goto out3; 4300 goto out;
4303 } 4301 }
4304 } 4302 }
4305 4303
@@ -4307,13 +4305,13 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4307 if (!ac) { 4305 if (!ac) {
4308 ar->len = 0; 4306 ar->len = 0;
4309 *errp = -ENOMEM; 4307 *errp = -ENOMEM;
4310 goto out1; 4308 goto out;
4311 } 4309 }
4312 4310
4313 *errp = ext4_mb_initialize_context(ac, ar); 4311 *errp = ext4_mb_initialize_context(ac, ar);
4314 if (*errp) { 4312 if (*errp) {
4315 ar->len = 0; 4313 ar->len = 0;
4316 goto out2; 4314 goto out;
4317 } 4315 }
4318 4316
4319 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 4317 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
@@ -4322,7 +4320,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4322 ext4_mb_normalize_request(ac, ar); 4320 ext4_mb_normalize_request(ac, ar);
4323repeat: 4321repeat:
4324 /* allocate space in core */ 4322 /* allocate space in core */
4325 ext4_mb_regular_allocator(ac); 4323 *errp = ext4_mb_regular_allocator(ac);
4324 if (*errp)
4325 goto errout;
4326 4326
4327 /* as we've just preallocated more space than 4327 /* as we've just preallocated more space than
4328 * user requested originally, we store allocated 4328
@@ -4333,7 +4333,7 @@ repeat:
4333 } 4333 }
4334 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4334 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4335 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4335 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4336 if (*errp == -EAGAIN) { 4336 if (*errp == -EAGAIN) {
4337 /* 4337 /*
4338 * drop the reference that we took 4338 * drop the reference that we took
4339 * in ext4_mb_use_best_found 4339 * in ext4_mb_use_best_found
@@ -4344,12 +4344,10 @@ repeat:
4344 ac->ac_b_ex.fe_len = 0; 4344 ac->ac_b_ex.fe_len = 0;
4345 ac->ac_status = AC_STATUS_CONTINUE; 4345 ac->ac_status = AC_STATUS_CONTINUE;
4346 goto repeat; 4346 goto repeat;
4347 } else if (*errp) { 4347 } else if (*errp)
4348 errout:
4348 ext4_discard_allocated_blocks(ac); 4349 ext4_discard_allocated_blocks(ac);
4349 ac->ac_b_ex.fe_len = 0; 4350 else {
4350 ar->len = 0;
4351 ext4_mb_show_ac(ac);
4352 } else {
4353 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4351 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4354 ar->len = ac->ac_b_ex.fe_len; 4352 ar->len = ac->ac_b_ex.fe_len;
4355 } 4353 }
@@ -4358,19 +4356,19 @@ repeat:
4358 if (freed) 4356 if (freed)
4359 goto repeat; 4357 goto repeat;
4360 *errp = -ENOSPC; 4358 *errp = -ENOSPC;
4359 }
4360
4361 if (*errp) {
4361 ac->ac_b_ex.fe_len = 0; 4362 ac->ac_b_ex.fe_len = 0;
4362 ar->len = 0; 4363 ar->len = 0;
4363 ext4_mb_show_ac(ac); 4364 ext4_mb_show_ac(ac);
4364 } 4365 }
4365
4366 ext4_mb_release_context(ac); 4366 ext4_mb_release_context(ac);
4367 4367out:
4368out2: 4368 if (ac)
4369 kmem_cache_free(ext4_ac_cachep, ac); 4369 kmem_cache_free(ext4_ac_cachep, ac);
4370out1:
4371 if (inquota && ar->len < inquota) 4370 if (inquota && ar->len < inquota)
4372 dquot_free_block(ar->inode, inquota - ar->len); 4371 dquot_free_block(ar->inode, inquota - ar->len);
4373out3:
4374 if (!ar->len) { 4372 if (!ar->len) {
4375 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) 4373 if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
4376 /* release all the reserved blocks if non delalloc */ 4374 /* release all the reserved blocks if non delalloc */
@@ -4402,6 +4400,7 @@ static noinline_for_stack int
4402ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 4400ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4403 struct ext4_free_data *new_entry) 4401 struct ext4_free_data *new_entry)
4404{ 4402{
4403 ext4_group_t group = e4b->bd_group;
4405 ext4_grpblk_t block; 4404 ext4_grpblk_t block;
4406 struct ext4_free_data *entry; 4405 struct ext4_free_data *entry;
4407 struct ext4_group_info *db = e4b->bd_info; 4406 struct ext4_group_info *db = e4b->bd_info;
@@ -4434,9 +4433,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4434 else if (block >= (entry->start_blk + entry->count)) 4433 else if (block >= (entry->start_blk + entry->count))
4435 n = &(*n)->rb_right; 4434 n = &(*n)->rb_right;
4436 else { 4435 else {
4437 ext4_grp_locked_error(sb, e4b->bd_group, __func__, 4436 ext4_grp_locked_error(sb, group, 0,
4438 "Double free of blocks %d (%d %d)", 4437 ext4_group_first_block_no(sb, group) + block,
4439 block, entry->start_blk, entry->count); 4438 "Block already on to-be-freed list");
4440 return 0; 4439 return 0;
4441 } 4440 }
4442 } 4441 }
@@ -4494,7 +4493,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4494 struct super_block *sb = inode->i_sb; 4493 struct super_block *sb = inode->i_sb;
4495 struct ext4_allocation_context *ac = NULL; 4494 struct ext4_allocation_context *ac = NULL;
4496 struct ext4_group_desc *gdp; 4495 struct ext4_group_desc *gdp;
4497 struct ext4_super_block *es;
4498 unsigned long freed = 0; 4496 unsigned long freed = 0;
4499 unsigned int overflow; 4497 unsigned int overflow;
4500 ext4_grpblk_t bit; 4498 ext4_grpblk_t bit;
@@ -4513,7 +4511,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4513 } 4511 }
4514 4512
4515 sbi = EXT4_SB(sb); 4513 sbi = EXT4_SB(sb);
4516 es = EXT4_SB(sb)->s_es;
4517 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 4514 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4518 !ext4_data_block_valid(sbi, block, count)) { 4515 !ext4_data_block_valid(sbi, block, count)) {
4519 ext4_error(sb, "Freeing blocks not in datazone - " 4516 ext4_error(sb, "Freeing blocks not in datazone - "
@@ -4647,6 +4644,8 @@ do_more:
4647 mb_clear_bits(bitmap_bh->b_data, bit, count); 4644 mb_clear_bits(bitmap_bh->b_data, bit, count);
4648 mb_free_blocks(inode, &e4b, bit, count); 4645 mb_free_blocks(inode, &e4b, bit, count);
4649 ext4_mb_return_to_preallocation(inode, &e4b, block, count); 4646 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
4647 if (test_opt(sb, DISCARD))
4648 ext4_issue_discard(sb, block_group, bit, count);
4650 } 4649 }
4651 4650
4652 ret = ext4_free_blks_count(sb, gdp) + count; 4651 ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4680,7 +4679,7 @@ do_more:
4680 put_bh(bitmap_bh); 4679 put_bh(bitmap_bh);
4681 goto do_more; 4680 goto do_more;
4682 } 4681 }
4683 sb->s_dirt = 1; 4682 ext4_mark_super_dirty(sb);
4684error_return: 4683error_return:
4685 if (freed) 4684 if (freed)
4686 dquot_free_block(inode, freed); 4685 dquot_free_block(inode, freed);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 6f3a27ec30bf..1765c2c50a9b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
376 * We have the extent map built with the tmp inode. 376 * We have the extent map built with the tmp inode.
377 * Now copy the i_data across 377 * Now copy the i_data across
378 */ 378 */
379 ei->i_flags |= EXT4_EXTENTS_FL; 379 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); 380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
381 381
382 /* 382 /*
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 52abfa12762a..5f1ed9fc913c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -148,17 +148,17 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
148 */ 148 */
149static int 149static int
150mext_check_null_inode(struct inode *inode1, struct inode *inode2, 150mext_check_null_inode(struct inode *inode1, struct inode *inode2,
151 const char *function) 151 const char *function, unsigned int line)
152{ 152{
153 int ret = 0; 153 int ret = 0;
154 154
155 if (inode1 == NULL) { 155 if (inode1 == NULL) {
156 __ext4_error(inode2->i_sb, function, 156 __ext4_error(inode2->i_sb, function, line,
157 "Both inodes should not be NULL: " 157 "Both inodes should not be NULL: "
158 "inode1 NULL inode2 %lu", inode2->i_ino); 158 "inode1 NULL inode2 %lu", inode2->i_ino);
159 ret = -EIO; 159 ret = -EIO;
160 } else if (inode2 == NULL) { 160 } else if (inode2 == NULL) {
161 __ext4_error(inode1->i_sb, function, 161 __ext4_error(inode1->i_sb, function, line,
162 "Both inodes should not be NULL: " 162 "Both inodes should not be NULL: "
163 "inode1 %lu inode2 NULL", inode1->i_ino); 163 "inode1 %lu inode2 NULL", inode1->i_ino);
164 ret = -EIO; 164 ret = -EIO;
@@ -1084,7 +1084,7 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1084 1084
1085 BUG_ON(inode1 == NULL && inode2 == NULL); 1085 BUG_ON(inode1 == NULL && inode2 == NULL);
1086 1086
1087 ret = mext_check_null_inode(inode1, inode2, __func__); 1087 ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
1088 if (ret < 0) 1088 if (ret < 0)
1089 goto out; 1089 goto out;
1090 1090
@@ -1121,7 +1121,7 @@ mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
1121 1121
1122 BUG_ON(inode1 == NULL && inode2 == NULL); 1122 BUG_ON(inode1 == NULL && inode2 == NULL);
1123 1123
1124 ret = mext_check_null_inode(inode1, inode2, __func__); 1124 ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__);
1125 if (ret < 0) 1125 if (ret < 0)
1126 goto out; 1126 goto out;
1127 1127
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a43e6617b351..314c0d3b3fa9 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -179,30 +179,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
179static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 179static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
180 struct inode *inode); 180 struct inode *inode);
181 181
182unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
183{
184 unsigned len = le16_to_cpu(dlen);
185
186 if (len == EXT4_MAX_REC_LEN || len == 0)
187 return blocksize;
188 return (len & 65532) | ((len & 3) << 16);
189}
190
191__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
192{
193 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
194 BUG();
195 if (len < 65536)
196 return cpu_to_le16(len);
197 if (len == blocksize) {
198 if (blocksize == 65536)
199 return cpu_to_le16(EXT4_MAX_REC_LEN);
200 else
201 return cpu_to_le16(0);
202 }
203 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
204}
205
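
These two helpers disappear from namei.c in this patch (inline versions in ext4.h remain the single copy), and the encoding they implement deserves a worked example. A rec_len of 65536 does not fit in the on-disk __le16, so the top two bits of the length are folded into the otherwise-unused low two bits. A self-contained userspace sketch, endianness elided:

	#include <assert.h>

	#define EXT4_MAX_REC_LEN ((1 << 16) - 1)	/* 65535 */

	static unsigned from_disk(unsigned len, unsigned blocksize)
	{
		if (len == EXT4_MAX_REC_LEN || len == 0)
			return blocksize;
		return (len & 65532) | ((len & 3) << 16);
	}

	static unsigned to_disk(unsigned len, unsigned blocksize)
	{
		assert(len <= blocksize && blocksize <= (1 << 18) && !(len & 3));
		if (len < 65536)
			return len;
		if (len == blocksize)
			return blocksize == 65536 ? EXT4_MAX_REC_LEN : 0;
		return (len & 65532) | ((len >> 16) & 3);
	}

	int main(void)
	{
		/* a record filling a whole 64KiB block is stored as 0xffff */
		assert(to_disk(65536, 65536) == EXT4_MAX_REC_LEN);
		assert(from_disk(EXT4_MAX_REC_LEN, 65536) == 65536);
		/* 66000 bytes in a 128KiB block: bit 16 folds into bit 0 */
		assert(to_disk(66000, 131072) == 465);
		assert(from_disk(465, 131072) == 66000);
		return 0;
	}
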
206/* 182/*
207 * p is at least 6 bytes before the end of page 183 * p is at least 6 bytes before the end of page
208 */ 184 */
@@ -605,7 +581,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
605 dir->i_sb->s_blocksize - 581 dir->i_sb->s_blocksize -
606 EXT4_DIR_REC_LEN(0)); 582 EXT4_DIR_REC_LEN(0));
607 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 583 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
608 if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, 584 if (!ext4_check_dir_entry(dir, de, bh,
609 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 585 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
610 +((char *)de - bh->b_data))) { 586 +((char *)de - bh->b_data))) {
611 /* On error, skip the f_pos to the next block. */ 587 /* On error, skip the f_pos to the next block. */
@@ -844,8 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh,
844 if ((char *) de + namelen <= dlimit && 820 if ((char *) de + namelen <= dlimit &&
845 ext4_match (namelen, name, de)) { 821 ext4_match (namelen, name, de)) {
846 /* found a match - just to be sure, do a full check */ 822 /* found a match - just to be sure, do a full check */
847 if (!ext4_check_dir_entry("ext4_find_entry", 823 if (!ext4_check_dir_entry(dir, de, bh, offset))
848 dir, de, bh, offset))
849 return -1; 824 return -1;
850 *res_dir = de; 825 *res_dir = de;
851 return 1; 826 return 1;
@@ -1019,7 +994,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1019 int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) 994 int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
1020 + ((char *) de - bh->b_data); 995 + ((char *) de - bh->b_data);
1021 996
1022 if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) { 997 if (!ext4_check_dir_entry(dir, de, bh, off)) {
1023 brelse(bh); 998 brelse(bh);
1024 *err = ERR_BAD_DX_DIR; 999 *err = ERR_BAD_DX_DIR;
1025 goto errout; 1000 goto errout;
@@ -1088,7 +1063,6 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1088struct dentry *ext4_get_parent(struct dentry *child) 1063struct dentry *ext4_get_parent(struct dentry *child)
1089{ 1064{
1090 __u32 ino; 1065 __u32 ino;
1091 struct inode *inode;
1092 static const struct qstr dotdot = { 1066 static const struct qstr dotdot = {
1093 .name = "..", 1067 .name = "..",
1094 .len = 2, 1068 .len = 2,
@@ -1097,7 +1071,6 @@ struct dentry *ext4_get_parent(struct dentry *child)
1097 struct buffer_head *bh; 1071 struct buffer_head *bh;
1098 1072
1099 bh = ext4_find_entry(child->d_inode, &dotdot, &de); 1073 bh = ext4_find_entry(child->d_inode, &dotdot, &de);
1100 inode = NULL;
1101 if (!bh) 1074 if (!bh)
1102 return ERR_PTR(-ENOENT); 1075 return ERR_PTR(-ENOENT);
1103 ino = le32_to_cpu(de->inode); 1076 ino = le32_to_cpu(de->inode);
@@ -1305,8 +1278,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1305 de = (struct ext4_dir_entry_2 *)bh->b_data; 1278 de = (struct ext4_dir_entry_2 *)bh->b_data;
1306 top = bh->b_data + blocksize - reclen; 1279 top = bh->b_data + blocksize - reclen;
1307 while ((char *) de <= top) { 1280 while ((char *) de <= top) {
1308 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1281 if (!ext4_check_dir_entry(dir, de, bh, offset))
1309 bh, offset))
1310 return -EIO; 1282 return -EIO;
1311 if (ext4_match(namelen, name, de)) 1283 if (ext4_match(namelen, name, de))
1312 return -EEXIST; 1284 return -EEXIST;
@@ -1673,7 +1645,7 @@ static int ext4_delete_entry(handle_t *handle,
1673 pde = NULL; 1645 pde = NULL;
1674 de = (struct ext4_dir_entry_2 *) bh->b_data; 1646 de = (struct ext4_dir_entry_2 *) bh->b_data;
1675 while (i < bh->b_size) { 1647 while (i < bh->b_size) {
1676 if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i)) 1648 if (!ext4_check_dir_entry(dir, de, bh, i))
1677 return -EIO; 1649 return -EIO;
1678 if (de == de_del) { 1650 if (de == de_del) {
1679 BUFFER_TRACE(bh, "get_write_access"); 1651 BUFFER_TRACE(bh, "get_write_access");
@@ -1956,7 +1928,7 @@ static int empty_dir(struct inode *inode)
1956 } 1928 }
1957 de = (struct ext4_dir_entry_2 *) bh->b_data; 1929 de = (struct ext4_dir_entry_2 *) bh->b_data;
1958 } 1930 }
1959 if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) { 1931 if (!ext4_check_dir_entry(inode, de, bh, offset)) {
1960 de = (struct ext4_dir_entry_2 *)(bh->b_data + 1932 de = (struct ext4_dir_entry_2 *)(bh->b_data +
1961 sb->s_blocksize); 1933 sb->s_blocksize);
1962 offset = (offset | (sb->s_blocksize - 1)) + 1; 1934 offset = (offset | (sb->s_blocksize - 1)) + 1;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 6df797eb9aeb..ca5c8aa00a2f 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -921,8 +921,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
921 &sbi->s_flex_groups[flex_group].free_inodes); 921 &sbi->s_flex_groups[flex_group].free_inodes);
922 } 922 }
923 923
924 ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); 924 ext4_handle_dirty_super(handle, sb);
925 sb->s_dirt = 1;
926 925
927exit_journal: 926exit_journal:
928 mutex_unlock(&sbi->s_resize_lock); 927 mutex_unlock(&sbi->s_resize_lock);
@@ -953,7 +952,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
953 ext4_fsblk_t n_blocks_count) 952 ext4_fsblk_t n_blocks_count)
954{ 953{
955 ext4_fsblk_t o_blocks_count; 954 ext4_fsblk_t o_blocks_count;
956 ext4_group_t o_groups_count;
957 ext4_grpblk_t last; 955 ext4_grpblk_t last;
958 ext4_grpblk_t add; 956 ext4_grpblk_t add;
959 struct buffer_head *bh; 957 struct buffer_head *bh;
@@ -965,7 +963,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
965 * yet: we're going to revalidate es->s_blocks_count after 963 * yet: we're going to revalidate es->s_blocks_count after
966 * taking the s_resize_lock below. */ 964 * taking the s_resize_lock below. */
967 o_blocks_count = ext4_blocks_count(es); 965 o_blocks_count = ext4_blocks_count(es);
968 o_groups_count = EXT4_SB(sb)->s_groups_count;
969 966
970 if (test_opt(sb, DEBUG)) 967 if (test_opt(sb, DEBUG))
971 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", 968
@@ -1045,13 +1042,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1045 goto exit_put; 1042 goto exit_put;
1046 } 1043 }
1047 ext4_blocks_count_set(es, o_blocks_count + add); 1044 ext4_blocks_count_set(es, o_blocks_count + add);
1048 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
1049 sb->s_dirt = 1;
1050 mutex_unlock(&EXT4_SB(sb)->s_resize_lock); 1045 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1051 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1046 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1052 o_blocks_count + add); 1047 o_blocks_count + add);
1053 /* We add the blocks to the bitmap and set the group need init bit */ 1048 /* We add the blocks to the bitmap and set the group need init bit */
1054 ext4_add_groupblocks(handle, sb, o_blocks_count, add); 1049 ext4_add_groupblocks(handle, sb, o_blocks_count, add);
1050 ext4_handle_dirty_super(handle, sb);
1055 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, 1051 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1056 o_blocks_count + add); 1052 o_blocks_count + add);
1057 if ((err = ext4_journal_stop(handle))) 1053 if ((err = ext4_journal_stop(handle)))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e72d3235b2fd..8d65575f8c8c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -241,14 +241,14 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
241 if (sb->s_flags & MS_RDONLY) 241 if (sb->s_flags & MS_RDONLY)
242 return ERR_PTR(-EROFS); 242 return ERR_PTR(-EROFS);
243 243
244 vfs_check_frozen(sb, SB_FREEZE_WRITE); 244 vfs_check_frozen(sb, SB_FREEZE_TRANS);
245 /* Special case here: if the journal has aborted behind our 245 /* Special case here: if the journal has aborted behind our
246 * backs (eg. EIO in the commit thread), then we still need to 246 * backs (eg. EIO in the commit thread), then we still need to
247 * take the FS itself readonly cleanly. */ 247 * take the FS itself readonly cleanly. */
248 journal = EXT4_SB(sb)->s_journal; 248 journal = EXT4_SB(sb)->s_journal;
249 if (journal) { 249 if (journal) {
250 if (is_journal_aborted(journal)) { 250 if (is_journal_aborted(journal)) {
251 ext4_abort(sb, __func__, "Detected aborted journal"); 251 ext4_abort(sb, "Detected aborted journal");
252 return ERR_PTR(-EROFS); 252 return ERR_PTR(-EROFS);
253 } 253 }
254 return jbd2_journal_start(journal, nblocks); 254 return jbd2_journal_start(journal, nblocks);
@@ -262,7 +262,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
262 * that sync() will call the filesystem's write_super callback if 262 * that sync() will call the filesystem's write_super callback if
263 * appropriate. 263 * appropriate.
264 */ 264 */
265int __ext4_journal_stop(const char *where, handle_t *handle) 265int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
266{ 266{
267 struct super_block *sb; 267 struct super_block *sb;
268 int err; 268 int err;
@@ -279,12 +279,13 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
279 if (!err) 279 if (!err)
280 err = rc; 280 err = rc;
281 if (err) 281 if (err)
282 __ext4_std_error(sb, where, err); 282 __ext4_std_error(sb, where, line, err);
283 return err; 283 return err;
284} 284}
285 285
286void ext4_journal_abort_handle(const char *caller, const char *err_fn, 286void ext4_journal_abort_handle(const char *caller, unsigned int line,
287 struct buffer_head *bh, handle_t *handle, int err) 287 const char *err_fn, struct buffer_head *bh,
288 handle_t *handle, int err)
288{ 289{
289 char nbuf[16]; 290 char nbuf[16];
290 const char *errstr = ext4_decode_error(NULL, err, nbuf); 291 const char *errstr = ext4_decode_error(NULL, err, nbuf);
@@ -300,12 +301,47 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
300 if (is_handle_aborted(handle)) 301 if (is_handle_aborted(handle))
301 return; 302 return;
302 303
303 printk(KERN_ERR "%s: aborting transaction: %s in %s\n", 304 printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n",
304 caller, errstr, err_fn); 305 caller, line, errstr, err_fn);
305 306
306 jbd2_journal_abort_handle(handle); 307 jbd2_journal_abort_handle(handle);
307} 308}
308 309
310static void __save_error_info(struct super_block *sb, const char *func,
311 unsigned int line)
312{
313 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
314
315 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
316 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
317 es->s_last_error_time = cpu_to_le32(get_seconds());
318 strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
319 es->s_last_error_line = cpu_to_le32(line);
320 if (!es->s_first_error_time) {
321 es->s_first_error_time = es->s_last_error_time;
322 strncpy(es->s_first_error_func, func,
323 sizeof(es->s_first_error_func));
324 es->s_first_error_line = cpu_to_le32(line);
325 es->s_first_error_ino = es->s_last_error_ino;
326 es->s_first_error_block = es->s_last_error_block;
327 }
328 /*
329 * Start the daily error reporting function if it hasn't been
330 * started already
331 */
332 if (!es->s_error_count)
333 mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
334 es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1);
335}
336
337static void save_error_info(struct super_block *sb, const char *func,
338 unsigned int line)
339{
340 __save_error_info(sb, func, line);
341 ext4_commit_super(sb, 1);
342}
343
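For reference, __save_error_info() writes into superblock fields added alongside this code; an excerpt of the relevant struct ext4_super_block members (names as in mainline ext4.h, worth double-checking against the matching header change):

	__le32	s_error_count;		/* number of fs errors */
	__le32	s_first_error_time;	/* first time an error happened */
	__le32	s_first_error_ino;	/* inode involved in first error */
	__le64	s_first_error_block;	/* block involved in first error */
	__u8	s_first_error_func[32];	/* function where error happened */
	__le32	s_first_error_line;	/* line number where error happened */
	__le32	s_last_error_time;	/* most recent time of an error */
	__le32	s_last_error_ino;	/* inode involved in last error */
	__le32	s_last_error_line;	/* line number where error happened */
	__le64	s_last_error_block;	/* block involved in last error */
	__u8	s_last_error_func[32];	/* function where error happened */

Keeping both a first- and a last-error record means the original corruption event is not lost when later errors overwrite the "last" fields.
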
344
309/* Deal with the reporting of failure conditions on a filesystem such as 345/* Deal with the reporting of failure conditions on a filesystem such as
310 * inconsistencies detected or read IO failures. 346 * inconsistencies detected or read IO failures.
311 * 347 *
@@ -323,11 +359,6 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
323 359
324static void ext4_handle_error(struct super_block *sb) 360static void ext4_handle_error(struct super_block *sb)
325{ 361{
326 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
327
328 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
329 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
330
331 if (sb->s_flags & MS_RDONLY) 362 if (sb->s_flags & MS_RDONLY)
332 return; 363 return;
333 364
@@ -342,19 +373,19 @@ static void ext4_handle_error(struct super_block *sb)
342 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 373 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
343 sb->s_flags |= MS_RDONLY; 374 sb->s_flags |= MS_RDONLY;
344 } 375 }
345 ext4_commit_super(sb, 1);
346 if (test_opt(sb, ERRORS_PANIC)) 376 if (test_opt(sb, ERRORS_PANIC))
347 panic("EXT4-fs (device %s): panic forced after error\n", 377 panic("EXT4-fs (device %s): panic forced after error\n",
348 sb->s_id); 378 sb->s_id);
349} 379}
350 380
351void __ext4_error(struct super_block *sb, const char *function, 381void __ext4_error(struct super_block *sb, const char *function,
352 const char *fmt, ...) 382 unsigned int line, const char *fmt, ...)
353{ 383{
354 va_list args; 384 va_list args;
355 385
356 va_start(args, fmt); 386 va_start(args, fmt);
357 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); 387 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ",
388 sb->s_id, function, line, current->comm);
358 vprintk(fmt, args); 389 vprintk(fmt, args);
359 printk("\n"); 390 printk("\n");
360 va_end(args); 391 va_end(args);
@@ -362,14 +393,22 @@ void __ext4_error(struct super_block *sb, const char *function,
362 ext4_handle_error(sb); 393 ext4_handle_error(sb);
363} 394}
364 395
365void ext4_error_inode(const char *function, struct inode *inode, 396void ext4_error_inode(struct inode *inode, const char *function,
397 unsigned int line, ext4_fsblk_t block,
366 const char *fmt, ...) 398 const char *fmt, ...)
367{ 399{
368 va_list args; 400 va_list args;
401 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
369 402
403 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
404 es->s_last_error_block = cpu_to_le64(block);
405 save_error_info(inode->i_sb, function, line);
370 va_start(args, fmt); 406 va_start(args, fmt);
371 printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ", 407 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
372 inode->i_sb->s_id, function, inode->i_ino, current->comm); 408 inode->i_sb->s_id, function, line, inode->i_ino);
409 if (block)
410 printk("block %llu: ", block);
411 printk("comm %s: ", current->comm);
373 vprintk(fmt, args); 412 vprintk(fmt, args);
374 printk("\n"); 413 printk("\n");
375 va_end(args); 414 va_end(args);
@@ -377,20 +416,26 @@ void ext4_error_inode(const char *function, struct inode *inode,
377 ext4_handle_error(inode->i_sb); 416 ext4_handle_error(inode->i_sb);
378} 417}
379 418
380void ext4_error_file(const char *function, struct file *file, 419void ext4_error_file(struct file *file, const char *function,
381 const char *fmt, ...) 420 unsigned int line, const char *fmt, ...)
382{ 421{
383 va_list args; 422 va_list args;
423 struct ext4_super_block *es;
384 struct inode *inode = file->f_dentry->d_inode; 424 struct inode *inode = file->f_dentry->d_inode;
385 char pathname[80], *path; 425 char pathname[80], *path;
386 426
427 es = EXT4_SB(inode->i_sb)->s_es;
428 es->s_last_error_ino = cpu_to_le32(inode->i_ino);
429 save_error_info(inode->i_sb, function, line);
387 va_start(args, fmt); 430 va_start(args, fmt);
388 path = d_path(&(file->f_path), pathname, sizeof(pathname)); 431 path = d_path(&(file->f_path), pathname, sizeof(pathname));
389 if (!path) 432 if (!path)
390 path = "(unknown)"; 433 path = "(unknown)";
391 printk(KERN_CRIT 434 printk(KERN_CRIT
392 "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ", 435 "EXT4-fs error (device %s): %s:%d: inode #%lu "
393 inode->i_sb->s_id, function, inode->i_ino, current->comm, path); 436 "(comm %s path %s): ",
437 inode->i_sb->s_id, function, line, inode->i_ino,
438 current->comm, path);
394 vprintk(fmt, args); 439 vprintk(fmt, args);
395 printk("\n"); 440 printk("\n");
396 va_end(args); 441 va_end(args);
@@ -435,7 +480,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
435/* __ext4_std_error decodes expected errors from journaling functions 480/* __ext4_std_error decodes expected errors from journaling functions
436 * automatically and invokes the appropriate error response. */ 481 * automatically and invokes the appropriate error response. */
437 482
438void __ext4_std_error(struct super_block *sb, const char *function, int errno) 483void __ext4_std_error(struct super_block *sb, const char *function,
484 unsigned int line, int errno)
439{ 485{
440 char nbuf[16]; 486 char nbuf[16];
441 const char *errstr; 487 const char *errstr;
@@ -448,8 +494,9 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno)
448 return; 494 return;
449 495
450 errstr = ext4_decode_error(sb, errno, nbuf); 496 errstr = ext4_decode_error(sb, errno, nbuf);
451 printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", 497 printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
452 sb->s_id, function, errstr); 498 sb->s_id, function, line, errstr);
499 save_error_info(sb, function, line);
453 500
454 ext4_handle_error(sb); 501 ext4_handle_error(sb);
455} 502}
@@ -464,29 +511,29 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno)
464 * case we take the easy way out and panic immediately. 511 * case we take the easy way out and panic immediately.
465 */ 512 */
466 513
467void ext4_abort(struct super_block *sb, const char *function, 514void __ext4_abort(struct super_block *sb, const char *function,
468 const char *fmt, ...) 515 unsigned int line, const char *fmt, ...)
469{ 516{
470 va_list args; 517 va_list args;
471 518
519 save_error_info(sb, function, line);
472 va_start(args, fmt); 520 va_start(args, fmt);
473 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); 521 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id,
522 function, line);
474 vprintk(fmt, args); 523 vprintk(fmt, args);
475 printk("\n"); 524 printk("\n");
476 va_end(args); 525 va_end(args);
477 526
527 if ((sb->s_flags & MS_RDONLY) == 0) {
528 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
529 sb->s_flags |= MS_RDONLY;
530 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
531 if (EXT4_SB(sb)->s_journal)
532 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
533 save_error_info(sb, function, line);
534 }
478 if (test_opt(sb, ERRORS_PANIC)) 535 if (test_opt(sb, ERRORS_PANIC))
479 panic("EXT4-fs panic from previous error\n"); 536 panic("EXT4-fs panic from previous error\n");
480
481 if (sb->s_flags & MS_RDONLY)
482 return;
483
484 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
485 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
486 sb->s_flags |= MS_RDONLY;
487 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
488 if (EXT4_SB(sb)->s_journal)
489 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
490} 537}
491 538
492void ext4_msg (struct super_block * sb, const char *prefix, 539void ext4_msg (struct super_block * sb, const char *prefix,
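None of the call sites spell out __func__ and __LINE__ by hand; the convention in this series is wrapper macros in fs/ext4/ext4.h, which is changed in this merge but not shown in this file's diff. A sketch of the assumed wrappers:

#define ext4_error(sb, fmt, ...)					\
	__ext4_error((sb), __func__, __LINE__, (fmt), ##__VA_ARGS__)
#define ext4_abort(sb, fmt, ...)					\
	__ext4_abort((sb), __func__, __LINE__, (fmt), ##__VA_ARGS__)
#define ext4_warning(sb, fmt, ...)					\
	__ext4_warning((sb), __func__, __LINE__, (fmt), ##__VA_ARGS__)
#define ext4_std_error(sb, errno)					\
	__ext4_std_error((sb), __func__, __LINE__, (errno))

This is what lets the ext4_put_super() and ext4_remount() hunks further down call ext4_abort(sb, "...") without the old explicit __func__ argument.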
@@ -502,38 +549,47 @@ void ext4_msg (struct super_block * sb, const char *prefix,
502} 549}
503 550
504void __ext4_warning(struct super_block *sb, const char *function, 551void __ext4_warning(struct super_block *sb, const char *function,
505 const char *fmt, ...) 552 unsigned int line, const char *fmt, ...)
506{ 553{
507 va_list args; 554 va_list args;
508 555
509 va_start(args, fmt); 556 va_start(args, fmt);
510 printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ", 557 printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ",
511 sb->s_id, function); 558 sb->s_id, function, line);
512 vprintk(fmt, args); 559 vprintk(fmt, args);
513 printk("\n"); 560 printk("\n");
514 va_end(args); 561 va_end(args);
515} 562}
516 563
517void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, 564void __ext4_grp_locked_error(const char *function, unsigned int line,
518 const char *function, const char *fmt, ...) 565 struct super_block *sb, ext4_group_t grp,
566 unsigned long ino, ext4_fsblk_t block,
567 const char *fmt, ...)
519__releases(bitlock) 568__releases(bitlock)
520__acquires(bitlock) 569__acquires(bitlock)
521{ 570{
522 va_list args; 571 va_list args;
523 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 572 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
524 573
574 es->s_last_error_ino = cpu_to_le32(ino);
575 es->s_last_error_block = cpu_to_le64(block);
576 __save_error_info(sb, function, line);
525 va_start(args, fmt); 577 va_start(args, fmt);
526 printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); 578 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u",
579 sb->s_id, function, line, grp);
580 if (ino)
581 printk("inode %lu: ", ino);
582 if (block)
583 printk("block %llu:", (unsigned long long) block);
527 vprintk(fmt, args); 584 vprintk(fmt, args);
528 printk("\n"); 585 printk("\n");
529 va_end(args); 586 va_end(args);
530 587
531 if (test_opt(sb, ERRORS_CONT)) { 588 if (test_opt(sb, ERRORS_CONT)) {
532 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
533 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
534 ext4_commit_super(sb, 0); 589 ext4_commit_super(sb, 0);
535 return; 590 return;
536 } 591 }
592
537 ext4_unlock_group(sb, grp); 593 ext4_unlock_group(sb, grp);
538 ext4_handle_error(sb); 594 ext4_handle_error(sb);
539 /* 595 /*
@@ -660,8 +716,7 @@ static void ext4_put_super(struct super_block *sb)
660 err = jbd2_journal_destroy(sbi->s_journal); 716 err = jbd2_journal_destroy(sbi->s_journal);
661 sbi->s_journal = NULL; 717 sbi->s_journal = NULL;
662 if (err < 0) 718 if (err < 0)
663 ext4_abort(sb, __func__, 719 ext4_abort(sb, "Couldn't clean up the journal");
664 "Couldn't clean up the journal");
665 } 720 }
666 721
667 ext4_release_system_zone(sb); 722 ext4_release_system_zone(sb);
@@ -946,14 +1001,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
946 seq_puts(seq, ",journal_async_commit"); 1001 seq_puts(seq, ",journal_async_commit");
947 else if (test_opt(sb, JOURNAL_CHECKSUM)) 1002 else if (test_opt(sb, JOURNAL_CHECKSUM))
948 seq_puts(seq, ",journal_checksum"); 1003 seq_puts(seq, ",journal_checksum");
949 if (test_opt(sb, NOBH))
950 seq_puts(seq, ",nobh");
951 if (test_opt(sb, I_VERSION)) 1004 if (test_opt(sb, I_VERSION))
952 seq_puts(seq, ",i_version"); 1005 seq_puts(seq, ",i_version");
953 if (!test_opt(sb, DELALLOC)) 1006 if (!test_opt(sb, DELALLOC) &&
1007 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
954 seq_puts(seq, ",nodelalloc"); 1008 seq_puts(seq, ",nodelalloc");
955 1009
956
957 if (sbi->s_stripe) 1010 if (sbi->s_stripe)
958 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1011 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
959 /* 1012 /*
@@ -977,7 +1030,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
977 if (test_opt(sb, NO_AUTO_DA_ALLOC)) 1030 if (test_opt(sb, NO_AUTO_DA_ALLOC))
978 seq_puts(seq, ",noauto_da_alloc"); 1031 seq_puts(seq, ",noauto_da_alloc");
979 1032
980 if (test_opt(sb, DISCARD)) 1033 if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD))
981 seq_puts(seq, ",discard"); 1034 seq_puts(seq, ",discard");
982 1035
983 if (test_opt(sb, NOLOAD)) 1036 if (test_opt(sb, NOLOAD))
@@ -986,6 +1039,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
986 if (test_opt(sb, DIOREAD_NOLOCK)) 1039 if (test_opt(sb, DIOREAD_NOLOCK))
987 seq_puts(seq, ",dioread_nolock"); 1040 seq_puts(seq, ",dioread_nolock");
988 1041
1042 if (test_opt(sb, BLOCK_VALIDITY) &&
1043 !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
1044 seq_puts(seq, ",block_validity");
1045
989 ext4_show_quota_options(seq, sb); 1046 ext4_show_quota_options(seq, sb);
990 1047
991 return 0; 1048 return 0;
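These def_mount_opts tests keep ext4_show_options() from echoing options that merely restate the defaults stored in the superblock, assumed to be read from es->s_default_mount_opts earlier in this function. A sketch of the flag definitions this hunk relies on in fs/ext4/ext4.h (the exact bit values are assumptions):

#define EXT4_DEFM_NOBARRIER	 0x0100	/* default to nobarrier */
#define EXT4_DEFM_BLOCK_VALIDITY 0x0200	/* default to block_validity */
#define EXT4_DEFM_DISCARD	 0x0400	/* default to discard */
#define EXT4_DEFM_NODELALLOC	 0x0800	/* default to nodelalloc */

The mount-time counterparts that consume the same bits appear below in the ext4_fill_super() hunks.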
@@ -1065,6 +1122,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
1065static int ext4_write_info(struct super_block *sb, int type); 1122static int ext4_write_info(struct super_block *sb, int type);
1066static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1123static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1067 char *path); 1124 char *path);
1125static int ext4_quota_off(struct super_block *sb, int type);
1068static int ext4_quota_on_mount(struct super_block *sb, int type); 1126static int ext4_quota_on_mount(struct super_block *sb, int type);
1069static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1127static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1070 size_t len, loff_t off); 1128 size_t len, loff_t off);
@@ -1086,7 +1144,7 @@ static const struct dquot_operations ext4_quota_operations = {
1086 1144
1087static const struct quotactl_ops ext4_qctl_operations = { 1145static const struct quotactl_ops ext4_qctl_operations = {
1088 .quota_on = ext4_quota_on, 1146 .quota_on = ext4_quota_on,
1089 .quota_off = dquot_quota_off, 1147 .quota_off = ext4_quota_off,
1090 .quota_sync = dquot_quota_sync, 1148 .quota_sync = dquot_quota_sync,
1091 .get_info = dquot_get_dqinfo, 1149 .get_info = dquot_get_dqinfo,
1092 .set_info = dquot_set_dqinfo, 1150 .set_info = dquot_set_dqinfo,
@@ -1624,10 +1682,12 @@ set_qf_format:
1624 *n_blocks_count = option; 1682 *n_blocks_count = option;
1625 break; 1683 break;
1626 case Opt_nobh: 1684 case Opt_nobh:
1627 set_opt(sbi->s_mount_opt, NOBH); 1685 ext4_msg(sb, KERN_WARNING,
1686 "Ignoring deprecated nobh option");
1628 break; 1687 break;
1629 case Opt_bh: 1688 case Opt_bh:
1630 clear_opt(sbi->s_mount_opt, NOBH); 1689 ext4_msg(sb, KERN_WARNING,
1690 "Ignoring deprecated bh option");
1631 break; 1691 break;
1632 case Opt_i_version: 1692 case Opt_i_version:
1633 set_opt(sbi->s_mount_opt, I_VERSION); 1693 set_opt(sbi->s_mount_opt, I_VERSION);
@@ -2249,6 +2309,8 @@ static ssize_t session_write_kbytes_show(struct ext4_attr *a,
2249{ 2309{
2250 struct super_block *sb = sbi->s_buddy_cache->i_sb; 2310 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2251 2311
2312 if (!sb->s_bdev->bd_part)
2313 return snprintf(buf, PAGE_SIZE, "0\n");
2252 return snprintf(buf, PAGE_SIZE, "%lu\n", 2314 return snprintf(buf, PAGE_SIZE, "%lu\n",
2253 (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 2315 (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
2254 sbi->s_sectors_written_start) >> 1); 2316 sbi->s_sectors_written_start) >> 1);
@@ -2259,6 +2321,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2259{ 2321{
2260 struct super_block *sb = sbi->s_buddy_cache->i_sb; 2322 struct super_block *sb = sbi->s_buddy_cache->i_sb;
2261 2323
2324 if (!sb->s_bdev->bd_part)
2325 return snprintf(buf, PAGE_SIZE, "0\n");
2262 return snprintf(buf, PAGE_SIZE, "%llu\n", 2326 return snprintf(buf, PAGE_SIZE, "%llu\n",
2263 (unsigned long long)(sbi->s_kbytes_written + 2327 (unsigned long long)(sbi->s_kbytes_written +
2264 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 2328 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -2431,6 +2495,53 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2431 return 1; 2495 return 1;
2432} 2496}
2433 2497
2498/*
2499 * This function is called once a day if we have errors logged
2500 * on the file system
2501 */
2502static void print_daily_error_info(unsigned long arg)
2503{
2504 struct super_block *sb = (struct super_block *) arg;
2505 struct ext4_sb_info *sbi;
2506 struct ext4_super_block *es;
2507
2508 sbi = EXT4_SB(sb);
2509 es = sbi->s_es;
2510
2511 if (es->s_error_count)
2512 ext4_msg(sb, KERN_NOTICE, "error count: %u",
2513 le32_to_cpu(es->s_error_count));
2514 if (es->s_first_error_time) {
2515 printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d",
2516 sb->s_id, le32_to_cpu(es->s_first_error_time),
2517 (int) sizeof(es->s_first_error_func),
2518 es->s_first_error_func,
2519 le32_to_cpu(es->s_first_error_line));
2520 if (es->s_first_error_ino)
2521 printk(": inode %u",
2522 le32_to_cpu(es->s_first_error_ino));
2523 if (es->s_first_error_block)
2524 printk(": block %llu", (unsigned long long)
2525 le64_to_cpu(es->s_first_error_block));
2526 printk("\n");
2527 }
2528 if (es->s_last_error_time) {
2529 printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d",
2530 sb->s_id, le32_to_cpu(es->s_last_error_time),
2531 (int) sizeof(es->s_last_error_func),
2532 es->s_last_error_func,
2533 le32_to_cpu(es->s_last_error_line));
2534 if (es->s_last_error_ino)
2535 printk(": inode %u",
2536 le32_to_cpu(es->s_last_error_ino));
2537 if (es->s_last_error_block)
2538 printk(": block %llu", (unsigned long long)
2539 le64_to_cpu(es->s_last_error_block));
2540 printk("\n");
2541 }
2542 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
2543}
2544
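print_daily_error_info() re-arms itself with mod_timer(), so once __save_error_info() starts it, it fires every 24 hours for the life of the mount (the teardown side at unmount is not visible in this hunk). A minimal sketch of the same self-rearming pattern with the timer API used here, assuming a single example timer:

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list report_timer;	/* hypothetical example timer */

static void report_fn(unsigned long arg)
{
	/* ... do the periodic reporting for the object behind arg ... */
	mod_timer(&report_timer, jiffies + 24*60*60*HZ);   /* re-arm */
}

static void report_start(unsigned long cookie)
{
	init_timer(&report_timer);
	report_timer.function = report_fn;
	report_timer.data = cookie;			/* passed as arg */
	mod_timer(&report_timer, jiffies + 300*HZ);	/* first run in 5 min */
}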
2434static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2545static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2435 __releases(kernel_lock) 2546 __releases(kernel_lock)
2436 __acquires(kernel_lock) 2547 __acquires(kernel_lock)
@@ -2448,7 +2559,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2448 struct inode *root; 2559 struct inode *root;
2449 char *cp; 2560 char *cp;
2450 const char *descr; 2561 const char *descr;
2451 int ret = -EINVAL; 2562 int ret = -ENOMEM;
2452 int blocksize; 2563 int blocksize;
2453 unsigned int db_count; 2564 unsigned int db_count;
2454 unsigned int i; 2565 unsigned int i;
@@ -2459,13 +2570,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2459 2570
2460 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 2571 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
2461 if (!sbi) 2572 if (!sbi)
2462 return -ENOMEM; 2573 goto out_free_orig;
2463 2574
2464 sbi->s_blockgroup_lock = 2575 sbi->s_blockgroup_lock =
2465 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); 2576 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
2466 if (!sbi->s_blockgroup_lock) { 2577 if (!sbi->s_blockgroup_lock) {
2467 kfree(sbi); 2578 kfree(sbi);
2468 return -ENOMEM; 2579 goto out_free_orig;
2469 } 2580 }
2470 sb->s_fs_info = sbi; 2581 sb->s_fs_info = sbi;
2471 sbi->s_mount_opt = 0; 2582 sbi->s_mount_opt = 0;
@@ -2473,8 +2584,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2473 sbi->s_resgid = EXT4_DEF_RESGID; 2584 sbi->s_resgid = EXT4_DEF_RESGID;
2474 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; 2585 sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
2475 sbi->s_sb_block = sb_block; 2586 sbi->s_sb_block = sb_block;
2476 sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, 2587 if (sb->s_bdev->bd_part)
2477 sectors[1]); 2588 sbi->s_sectors_written_start =
2589 part_stat_read(sb->s_bdev->bd_part, sectors[1]);
2478 2590
2479 unlock_kernel(); 2591 unlock_kernel();
2480 2592
@@ -2482,6 +2594,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2482 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 2594 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
2483 *cp = '!'; 2595 *cp = '!';
2484 2596
2597 ret = -EINVAL;
2485 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); 2598 blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
2486 if (!blocksize) { 2599 if (!blocksize) {
2487 ext4_msg(sb, KERN_ERR, "unable to set blocksize"); 2600 ext4_msg(sb, KERN_ERR, "unable to set blocksize");
@@ -2546,6 +2659,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2546 set_opt(sbi->s_mount_opt, ERRORS_CONT); 2659 set_opt(sbi->s_mount_opt, ERRORS_CONT);
2547 else 2660 else
2548 set_opt(sbi->s_mount_opt, ERRORS_RO); 2661 set_opt(sbi->s_mount_opt, ERRORS_RO);
2662 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)
2663 set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
2664 if (def_mount_opts & EXT4_DEFM_DISCARD)
2665 set_opt(sbi->s_mount_opt, DISCARD);
2549 2666
2550 sbi->s_resuid = le16_to_cpu(es->s_def_resuid); 2667 sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
2551 sbi->s_resgid = le16_to_cpu(es->s_def_resgid); 2668 sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
@@ -2553,15 +2670,23 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2553 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2670 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2554 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2671 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2555 2672
2556 set_opt(sbi->s_mount_opt, BARRIER); 2673 if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
2674 set_opt(sbi->s_mount_opt, BARRIER);
2557 2675
2558 /* 2676 /*
2559 * enable delayed allocation by default 2677 * enable delayed allocation by default
2560 * Use -o nodelalloc to turn it off 2678 * Use -o nodelalloc to turn it off
2561 */ 2679 */
2562 if (!IS_EXT3_SB(sb)) 2680 if (!IS_EXT3_SB(sb) &&
2681 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
2563 set_opt(sbi->s_mount_opt, DELALLOC); 2682 set_opt(sbi->s_mount_opt, DELALLOC);
2564 2683
2684 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
2685 &journal_devnum, &journal_ioprio, NULL, 0)) {
2686 ext4_msg(sb, KERN_WARNING,
2687 "failed to parse options in superblock: %s",
2688 sbi->s_es->s_mount_opts);
2689 }
2565 if (!parse_options((char *) data, sb, &journal_devnum, 2690 if (!parse_options((char *) data, sb, &journal_devnum,
2566 &journal_ioprio, NULL, 0)) 2691 &journal_ioprio, NULL, 0))
2567 goto failed_mount; 2692 goto failed_mount;
@@ -2912,18 +3037,7 @@ no_journal:
2912 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3037 ext4_msg(sb, KERN_ERR, "insufficient memory");
2913 goto failed_mount_wq; 3038 goto failed_mount_wq;
2914 } 3039 }
2915 if (test_opt(sb, NOBH)) { 3040
2916 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2917 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
2918 "its supported only with writeback mode");
2919 clear_opt(sbi->s_mount_opt, NOBH);
2920 }
2921 if (test_opt(sb, DIOREAD_NOLOCK)) {
2922 ext4_msg(sb, KERN_WARNING, "dioread_nolock option is "
2923 "not supported with nobh mode");
2924 goto failed_mount_wq;
2925 }
2926 }
2927 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3041 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2928 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3042 if (!EXT4_SB(sb)->dio_unwritten_wq) {
2929 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3043 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3043,7 +3157,14 @@ no_journal:
3043 descr = "out journal"; 3157 descr = "out journal";
3044 3158
3045 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " 3159 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
3046 "Opts: %s", descr, orig_data); 3160 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
3161 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
3162
3163 init_timer(&sbi->s_err_report);
3164 sbi->s_err_report.function = print_daily_error_info;
3165 sbi->s_err_report.data = (unsigned long) sb;
3166 if (es->s_error_count)
3167 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3047 3168
3048 lock_kernel(); 3169 lock_kernel();
3049 kfree(orig_data); 3170 kfree(orig_data);
@@ -3093,6 +3214,7 @@ out_fail:
3093 kfree(sbi->s_blockgroup_lock); 3214 kfree(sbi->s_blockgroup_lock);
3094 kfree(sbi); 3215 kfree(sbi);
3095 lock_kernel(); 3216 lock_kernel();
3217out_free_orig:
3096 kfree(orig_data); 3218 kfree(orig_data);
3097 return ret; 3219 return ret;
3098} 3220}
@@ -3110,7 +3232,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
3110 journal->j_min_batch_time = sbi->s_min_batch_time; 3232 journal->j_min_batch_time = sbi->s_min_batch_time;
3111 journal->j_max_batch_time = sbi->s_max_batch_time; 3233 journal->j_max_batch_time = sbi->s_max_batch_time;
3112 3234
3113 spin_lock(&journal->j_state_lock); 3235 write_lock(&journal->j_state_lock);
3114 if (test_opt(sb, BARRIER)) 3236 if (test_opt(sb, BARRIER))
3115 journal->j_flags |= JBD2_BARRIER; 3237 journal->j_flags |= JBD2_BARRIER;
3116 else 3238 else
@@ -3119,7 +3241,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
3119 journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; 3241 journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
3120 else 3242 else
3121 journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; 3243 journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
3122 spin_unlock(&journal->j_state_lock); 3244 write_unlock(&journal->j_state_lock);
3123} 3245}
3124 3246
3125static journal_t *ext4_get_journal(struct super_block *sb, 3247static journal_t *ext4_get_journal(struct super_block *sb,
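The spin_lock()/spin_unlock() to write_lock()/write_unlock() switch tracks the jbd2 side of this merge (note the large fs/jbd2/journal.c and fs/jbd2/transaction.c entries in the diffstat), where j_state_lock is assumed to have become an rwlock_t so readers of journal state no longer serialize against each other. Sketch of the resulting pattern:

	rwlock_t j_state_lock;	/* assumed new type in struct journal_s */

	/* writers, like ext4_init_journal_params() above */
	write_lock(&journal->j_state_lock);
	journal->j_flags |= JBD2_BARRIER;
	write_unlock(&journal->j_state_lock);

	/* readers may now run concurrently with each other */
	read_lock(&journal->j_state_lock);
	/* ... inspect j_flags, j_commit_sequence, ... */
	read_unlock(&journal->j_state_lock);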
@@ -3327,8 +3449,17 @@ static int ext4_load_journal(struct super_block *sb,
3327 3449
3328 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) 3450 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
3329 err = jbd2_journal_wipe(journal, !really_read_only); 3451 err = jbd2_journal_wipe(journal, !really_read_only);
3330 if (!err) 3452 if (!err) {
3453 char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
3454 if (save)
3455 memcpy(save, ((char *) es) +
3456 EXT4_S_ERR_START, EXT4_S_ERR_LEN);
3331 err = jbd2_journal_load(journal); 3457 err = jbd2_journal_load(journal);
3458 if (save)
3459 memcpy(((char *) es) + EXT4_S_ERR_START,
3460 save, EXT4_S_ERR_LEN);
3461 kfree(save);
3462 }
3332 3463
3333 if (err) { 3464 if (err) {
3334 ext4_msg(sb, KERN_ERR, "error loading journal"); 3465 ext4_msg(sb, KERN_ERR, "error loading journal");
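jbd2_journal_load() can replay an older logged copy of the superblock over *es, which would wipe the error statistics introduced above; the memcpy pair preserves exactly that byte range across recovery, and a failed kmalloc() simply skips the preservation rather than failing the mount. The window is assumed to be defined in fs/ext4/ext4.h along these lines:

#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
#define EXT4_S_ERR_END   offsetof(struct ext4_super_block, s_mount_opts)
#define EXT4_S_ERR_LEN   (EXT4_S_ERR_END - EXT4_S_ERR_START)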
@@ -3384,10 +3515,14 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3384 */ 3515 */
3385 if (!(sb->s_flags & MS_RDONLY)) 3516 if (!(sb->s_flags & MS_RDONLY))
3386 es->s_wtime = cpu_to_le32(get_seconds()); 3517 es->s_wtime = cpu_to_le32(get_seconds());
3387 es->s_kbytes_written = 3518 if (sb->s_bdev->bd_part)
3388 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 3519 es->s_kbytes_written =
3520 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3389 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 3521 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
3390 EXT4_SB(sb)->s_sectors_written_start) >> 1)); 3522 EXT4_SB(sb)->s_sectors_written_start) >> 1));
3523 else
3524 es->s_kbytes_written =
3525 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
3391 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3526 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
3392 &EXT4_SB(sb)->s_freeblocks_counter)); 3527 &EXT4_SB(sb)->s_freeblocks_counter));
3393 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( 3528 es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
@@ -3491,7 +3626,7 @@ int ext4_force_commit(struct super_block *sb)
3491 3626
3492 journal = EXT4_SB(sb)->s_journal; 3627 journal = EXT4_SB(sb)->s_journal;
3493 if (journal) { 3628 if (journal) {
3494 vfs_check_frozen(sb, SB_FREEZE_WRITE); 3629 vfs_check_frozen(sb, SB_FREEZE_TRANS);
3495 ret = ext4_journal_force_commit(journal); 3630 ret = ext4_journal_force_commit(journal);
3496 } 3631 }
3497 3632
@@ -3616,7 +3751,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3616 } 3751 }
3617 3752
3618 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) 3753 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
3619 ext4_abort(sb, __func__, "Abort forced by user"); 3754 ext4_abort(sb, "Abort forced by user");
3620 3755
3621 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 3756 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
3622 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); 3757 (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -3981,6 +4116,18 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3981 return err; 4116 return err;
3982} 4117}
3983 4118
4119static int ext4_quota_off(struct super_block *sb, int type)
4120{
4121 /* Force all delayed allocation blocks to be allocated */
4122 if (test_opt(sb, DELALLOC)) {
4123 down_read(&sb->s_umount);
4124 sync_filesystem(sb);
4125 up_read(&sb->s_umount);
4126 }
4127
4128 return dquot_quota_off(sb, type);
4129}
4130
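Turning quota off now syncs the filesystem first when delalloc is enabled, so blocks that were only reserved get allocated and charged before dquot_quota_off() tears down the accounting. The hook is reached through the quotactl(2) syscall; a hypothetical userspace trigger (the device path is illustrative only):

#include <sys/quota.h>
#include <stdio.h>

int main(void)
{
	/* Q_QUOTAOFF lands in sb->s_qcop->quota_off, now ext4_quota_off() */
	if (quotactl(QCMD(Q_QUOTAOFF, USRQUOTA), "/dev/sda1", 0, NULL) != 0)
		perror("quotactl");
	return 0;
}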
3984/* Read data from quotafile - avoid pagecache and such because we cannot afford 4131/* Read data from quotafile - avoid pagecache and such because we cannot afford
3985 * acquiring the locks... As quota files are never truncated and quota code 4132 * acquiring the locks... As quota files are never truncated and quota code
3986 * itself serializes the operations (and no one else should touch the files) 4133
@@ -4030,7 +4177,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
4030 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 4177 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
4031 int err = 0; 4178 int err = 0;
4032 int offset = off & (sb->s_blocksize - 1); 4179 int offset = off & (sb->s_blocksize - 1);
4033 int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL;
4034 struct buffer_head *bh; 4180 struct buffer_head *bh;
4035 handle_t *handle = journal_current_handle(); 4181 handle_t *handle = journal_current_handle();
4036 4182
@@ -4055,24 +4201,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
4055 bh = ext4_bread(handle, inode, blk, 1, &err); 4201 bh = ext4_bread(handle, inode, blk, 1, &err);
4056 if (!bh) 4202 if (!bh)
4057 goto out; 4203 goto out;
4058 if (journal_quota) { 4204 err = ext4_journal_get_write_access(handle, bh);
4059 err = ext4_journal_get_write_access(handle, bh); 4205 if (err) {
4060 if (err) { 4206 brelse(bh);
4061 brelse(bh); 4207 goto out;
4062 goto out;
4063 }
4064 } 4208 }
4065 lock_buffer(bh); 4209 lock_buffer(bh);
4066 memcpy(bh->b_data+offset, data, len); 4210 memcpy(bh->b_data+offset, data, len);
4067 flush_dcache_page(bh->b_page); 4211 flush_dcache_page(bh->b_page);
4068 unlock_buffer(bh); 4212 unlock_buffer(bh);
4069 if (journal_quota) 4213 err = ext4_handle_dirty_metadata(handle, NULL, bh);
4070 err = ext4_handle_dirty_metadata(handle, NULL, bh);
4071 else {
4072 /* Always do at least ordered writes for quotas */
4073 err = ext4_jbd2_file_inode(handle, inode);
4074 mark_buffer_dirty(bh);
4075 }
4076 brelse(bh); 4214 brelse(bh);
4077out: 4215out:
4078 if (err) { 4216 if (err) {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 04338009793a..a6f314249574 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -458,8 +458,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
458 458
459 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { 459 if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
460 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); 460 EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
461 sb->s_dirt = 1; 461 ext4_handle_dirty_super(handle, sb);
462 ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
463 } 462 }
464} 463}
465 464
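ext4_handle_dirty_super() is a helper introduced elsewhere in this merge (fs/ext4/ext4_jbd2.c and its header are both in the diffstat) that folds the removed two-line sequence into one call. A sketch of the assumed behavior, not the exact implementation:

static int ext4_handle_dirty_super_sketch(handle_t *handle,
					  struct super_block *sb)
{
	if (ext4_handle_valid(handle))	/* journaled: log the sb buffer */
		return jbd2_journal_dirty_metadata(handle,
						   EXT4_SB(sb)->s_sbh);
	sb->s_dirt = 1;			/* no journal: lazy writeback */
	return 0;
}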
diff --git a/fs/file.c b/fs/file.c
index 34bb7f71d994..cccaead962c2 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -178,7 +178,6 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
178 fdt->open_fds = (fd_set *)data; 178 fdt->open_fds = (fd_set *)data;
179 data += nr / BITS_PER_BYTE; 179 data += nr / BITS_PER_BYTE;
180 fdt->close_on_exec = (fd_set *)data; 180 fdt->close_on_exec = (fd_set *)data;
181 INIT_RCU_HEAD(&fdt->rcu);
182 fdt->next = NULL; 181 fdt->next = NULL;
183 182
184 return fdt; 183 return fdt;
@@ -312,7 +311,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
312 new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; 311 new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
313 new_fdt->open_fds = (fd_set *)&newf->open_fds_init; 312 new_fdt->open_fds = (fd_set *)&newf->open_fds_init;
314 new_fdt->fd = &newf->fd_array[0]; 313 new_fdt->fd = &newf->fd_array[0];
315 INIT_RCU_HEAD(&new_fdt->rcu);
316 new_fdt->next = NULL; 314 new_fdt->next = NULL;
317 315
318 spin_lock(&oldf->file_lock); 316 spin_lock(&oldf->file_lock);
@@ -430,7 +428,6 @@ struct files_struct init_files = {
430 .fd = &init_files.fd_array[0], 428 .fd = &init_files.fd_array[0],
431 .close_on_exec = (fd_set *)&init_files.close_on_exec_init, 429 .close_on_exec = (fd_set *)&init_files.close_on_exec_init,
432 .open_fds = (fd_set *)&init_files.open_fds_init, 430 .open_fds = (fd_set *)&init_files.open_fds_init,
433 .rcu = RCU_HEAD_INIT,
434 }, 431 },
435 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), 432 .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
436}; 433};
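All three removals exist because struct rcu_head needs no initialization before it is handed to call_rcu(); these initializers were dropped tree-wide around this time. The fdtable free path is unaffected and keeps its usual shape (sketch; the callback name is illustrative):

#include <linux/fdtable.h>
#include <linux/rcupdate.h>

static void fdtable_free_cb(struct rcu_head *rcu)
{
	struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);

	/* ... free fdt->fd, the bitmaps, and fdt itself ... */
}

static void fdtable_defer_free(struct fdtable *fdt)
{
	/* no INIT_RCU_HEAD() needed before queueing */
	call_rcu(&fdt->rcu, fdtable_free_cb);
}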
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d5be1693ac93..30ac305e8293 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -530,7 +530,8 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
530{ 530{
531 int ret = 0; 531 int ret = 0;
532 532
533 wbc->wb_start = jiffies; /* livelock avoidance */ 533 if (!wbc->wb_start)
534 wbc->wb_start = jiffies; /* livelock avoidance */
534 spin_lock(&inode_lock); 535 spin_lock(&inode_lock);
535 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 536 if (!wbc->for_kupdate || list_empty(&wb->b_io))
536 queue_io(wb, wbc->older_than_this); 537 queue_io(wb, wbc->older_than_this);
@@ -559,7 +560,6 @@ static void __writeback_inodes_sb(struct super_block *sb,
559{ 560{
560 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 561 WARN_ON(!rwsem_is_locked(&sb->s_umount));
561 562
562 wbc->wb_start = jiffies; /* livelock avoidance */
563 spin_lock(&inode_lock); 563 spin_lock(&inode_lock);
564 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 564 if (!wbc->for_kupdate || list_empty(&wb->b_io))
565 queue_io(wb, wbc->older_than_this); 565 queue_io(wb, wbc->older_than_this);
@@ -625,6 +625,7 @@ static long wb_writeback(struct bdi_writeback *wb,
625 wbc.range_end = LLONG_MAX; 625 wbc.range_end = LLONG_MAX;
626 } 626 }
627 627
628 wbc.wb_start = jiffies; /* livelock avoidance */
628 for (;;) { 629 for (;;) {
629 /* 630 /*
630 * Stop writeback when nr_pages has been consumed 631 * Stop writeback when nr_pages has been consumed
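wb_start is the livelock cutoff: inodes dirtied after it are deferred to the next pass so a busy writer cannot pin writeback forever. Setting it once in wb_writeback(), and only lazily in writeback_inodes_wb(), gives one coherent cutoff per pass instead of resetting it on every inner call. The consuming test is assumed to sit in this file's inode walk, along these lines:

static bool defer_for_livelock(struct inode *inode,
			       struct writeback_control *wbc)
{
	/* inode_dirtied_after() is assumed to be the existing helper in
	 * fs-writeback.c; checking against the pass-wide cutoff is the
	 * whole point of keeping wb_start stable across inner calls */
	return inode_dirtied_after(inode, wbc->wb_start);
}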
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index cc94bb9563f2..3f6dfa989881 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -1,7 +1,6 @@
1 1
2config FSCACHE 2config FSCACHE
3 tristate "General filesystem local caching manager" 3 tristate "General filesystem local caching manager"
4 select SLOW_WORK
5 help 4 help
6 This option enables a generic filesystem caching manager that can be 5 This option enables a generic filesystem caching manager that can be
7 used by various network and other filesystems to cache data locally. 6 used by various network and other filesystems to cache data locally.
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index edd7434ab6e5..6a026441c5a6 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -82,6 +82,14 @@ extern unsigned fscache_defer_lookup;
82extern unsigned fscache_defer_create; 82extern unsigned fscache_defer_create;
83extern unsigned fscache_debug; 83extern unsigned fscache_debug;
84extern struct kobject *fscache_root; 84extern struct kobject *fscache_root;
85extern struct workqueue_struct *fscache_object_wq;
86extern struct workqueue_struct *fscache_op_wq;
87DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
88
89static inline bool fscache_object_congested(void)
90{
91 return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
92}
85 93
86extern int fscache_wait_bit(void *); 94extern int fscache_wait_bit(void *);
87extern int fscache_wait_bit_interruptible(void *); 95extern int fscache_wait_bit_interruptible(void *);
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index add6bdb53f04..f9d856773f79 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -15,6 +15,7 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/completion.h> 16#include <linux/completion.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/seq_file.h>
18#include "internal.h" 19#include "internal.h"
19 20
20MODULE_DESCRIPTION("FS Cache Manager"); 21MODULE_DESCRIPTION("FS Cache Manager");
@@ -40,22 +41,105 @@ MODULE_PARM_DESC(fscache_debug,
40 "FS-Cache debugging mask"); 41 "FS-Cache debugging mask");
41 42
42struct kobject *fscache_root; 43struct kobject *fscache_root;
44struct workqueue_struct *fscache_object_wq;
45struct workqueue_struct *fscache_op_wq;
46
47DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait);
48
49/* these values serve as lower bounds, will be adjusted in fscache_init() */
50static unsigned fscache_object_max_active = 4;
51static unsigned fscache_op_max_active = 2;
52
53#ifdef CONFIG_SYSCTL
54static struct ctl_table_header *fscache_sysctl_header;
55
56static int fscache_max_active_sysctl(struct ctl_table *table, int write,
57 void __user *buffer,
58 size_t *lenp, loff_t *ppos)
59{
60 struct workqueue_struct **wqp = table->extra1;
61 unsigned int *datap = table->data;
62 int ret;
63
64 ret = proc_dointvec(table, write, buffer, lenp, ppos);
65 if (ret == 0)
66 workqueue_set_max_active(*wqp, *datap);
67 return ret;
68}
69
70ctl_table fscache_sysctls[] = {
71 {
72 .procname = "object_max_active",
73 .data = &fscache_object_max_active,
74 .maxlen = sizeof(unsigned),
75 .mode = 0644,
76 .proc_handler = fscache_max_active_sysctl,
77 .extra1 = &fscache_object_wq,
78 },
79 {
80 .procname = "operation_max_active",
81 .data = &fscache_op_max_active,
82 .maxlen = sizeof(unsigned),
83 .mode = 0644,
84 .proc_handler = fscache_max_active_sysctl,
85 .extra1 = &fscache_op_wq,
86 },
87 {}
88};
89
90ctl_table fscache_sysctls_root[] = {
91 {
92 .procname = "fscache",
93 .mode = 0555,
94 .child = fscache_sysctls,
95 },
96 {}
97};
98#endif
43 99
44/* 100/*
45 * initialise the fs caching module 101 * initialise the fs caching module
46 */ 102 */
47static int __init fscache_init(void) 103static int __init fscache_init(void)
48{ 104{
105 unsigned int nr_cpus = num_possible_cpus();
106 unsigned int cpu;
49 int ret; 107 int ret;
50 108
51 ret = slow_work_register_user(THIS_MODULE); 109 fscache_object_max_active =
52 if (ret < 0) 110 clamp_val(nr_cpus,
53 goto error_slow_work; 111 fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE);
112
113 ret = -ENOMEM;
114 fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND,
115 fscache_object_max_active);
116 if (!fscache_object_wq)
117 goto error_object_wq;
118
119 fscache_op_max_active =
120 clamp_val(fscache_object_max_active / 2,
121 fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE);
122
123 ret = -ENOMEM;
124 fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND,
125 fscache_op_max_active);
126 if (!fscache_op_wq)
127 goto error_op_wq;
128
129 for_each_possible_cpu(cpu)
130 init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu));
54 131
55 ret = fscache_proc_init(); 132 ret = fscache_proc_init();
56 if (ret < 0) 133 if (ret < 0)
57 goto error_proc; 134 goto error_proc;
58 135
136#ifdef CONFIG_SYSCTL
137 ret = -ENOMEM;
138 fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root);
139 if (!fscache_sysctl_header)
140 goto error_sysctl;
141#endif
142
59 fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", 143 fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
60 sizeof(struct fscache_cookie), 144 sizeof(struct fscache_cookie),
61 0, 145 0,
@@ -78,10 +162,16 @@ static int __init fscache_init(void)
78error_kobj: 162error_kobj:
79 kmem_cache_destroy(fscache_cookie_jar); 163 kmem_cache_destroy(fscache_cookie_jar);
80error_cookie_jar: 164error_cookie_jar:
165#ifdef CONFIG_SYSCTL
166 unregister_sysctl_table(fscache_sysctl_header);
167error_sysctl:
168#endif
81 fscache_proc_cleanup(); 169 fscache_proc_cleanup();
82error_proc: 170error_proc:
83 slow_work_unregister_user(THIS_MODULE); 171 destroy_workqueue(fscache_op_wq);
84error_slow_work: 172error_op_wq:
173 destroy_workqueue(fscache_object_wq);
174error_object_wq:
85 return ret; 175 return ret;
86} 176}
87 177
@@ -96,8 +186,12 @@ static void __exit fscache_exit(void)
96 186
97 kobject_put(fscache_root); 187 kobject_put(fscache_root);
98 kmem_cache_destroy(fscache_cookie_jar); 188 kmem_cache_destroy(fscache_cookie_jar);
189#ifdef CONFIG_SYSCTL
190 unregister_sysctl_table(fscache_sysctl_header);
191#endif
99 fscache_proc_cleanup(); 192 fscache_proc_cleanup();
100 slow_work_unregister_user(THIS_MODULE); 193 destroy_workqueue(fscache_op_wq);
194 destroy_workqueue(fscache_object_wq);
101 printk(KERN_NOTICE "FS-Cache: Unloaded\n"); 195 printk(KERN_NOTICE "FS-Cache: Unloaded\n");
102} 196}
103 197
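The slow-work pool becomes two unbound workqueues whose concurrency caps stay tunable at runtime: the sysctl handler above just validates the write and forwards the new value to workqueue_set_max_active(). The same API used directly, as a sketch:

#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;	/* illustrative name */

static int demo_setup(void)
{
	/* WQ_UNBOUND: workers are not pinned to the submitting CPU */
	demo_wq = alloc_workqueue("demo", WQ_UNBOUND, 4);
	if (!demo_wq)
		return -ENOMEM;
	/* later, e.g. from a sysctl write: raise the concurrency cap */
	workqueue_set_max_active(demo_wq, 16);
	return 0;
}

static void demo_teardown(void)
{
	destroy_workqueue(demo_wq);
}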
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 4a8eb31c5338..ebe29c581380 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -34,8 +34,8 @@ struct fscache_objlist_data {
34#define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */ 34#define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */
35#define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */ 35#define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */
 36#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects with no events */ 36#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects with no events */
37#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with slow work */ 37#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */
38#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without slow work */ 38#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */
39 39
40 u8 buf[512]; /* key and aux data buffer */ 40 u8 buf[512]; /* key and aux data buffer */
41}; 41};
@@ -231,12 +231,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
231 READS, NOREADS); 231 READS, NOREADS);
232 FILTER(obj->events & obj->event_mask, 232 FILTER(obj->events & obj->event_mask,
233 EVENTS, NOEVENTS); 233 EVENTS, NOEVENTS);
234 FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW), 234 FILTER(work_busy(&obj->work), WORK, NOWORK);
235 WORK, NOWORK);
236 } 235 }
237 236
238 seq_printf(m, 237 seq_printf(m,
239 "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ", 238 "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ",
240 obj->debug_id, 239 obj->debug_id,
241 obj->parent ? obj->parent->debug_id : -1, 240 obj->parent ? obj->parent->debug_id : -1,
242 fscache_object_states_short[obj->state], 241 fscache_object_states_short[obj->state],
@@ -249,7 +248,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
249 obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, 248 obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
250 obj->events, 249 obj->events,
251 obj->flags, 250 obj->flags,
252 obj->work.flags); 251 work_busy(&obj->work));
253 252
254 no_cookie = true; 253 no_cookie = true;
255 keylen = auxlen = 0; 254 keylen = auxlen = 0;
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 0b589a9b4ffc..b6b897c550ac 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,7 +14,6 @@
14 14
15#define FSCACHE_DEBUG_LEVEL COOKIE 15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/seq_file.h>
18#include "internal.h" 17#include "internal.h"
19 18
20const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { 19const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
@@ -50,12 +49,8 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
50 [FSCACHE_OBJECT_DEAD] = "DEAD", 49 [FSCACHE_OBJECT_DEAD] = "DEAD",
51}; 50};
52 51
53static void fscache_object_slow_work_put_ref(struct slow_work *); 52static int fscache_get_object(struct fscache_object *);
54static int fscache_object_slow_work_get_ref(struct slow_work *); 53static void fscache_put_object(struct fscache_object *);
55static void fscache_object_slow_work_execute(struct slow_work *);
56#ifdef CONFIG_SLOW_WORK_DEBUG
57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
58#endif
59static void fscache_initialise_object(struct fscache_object *); 54static void fscache_initialise_object(struct fscache_object *);
60static void fscache_lookup_object(struct fscache_object *); 55static void fscache_lookup_object(struct fscache_object *);
61static void fscache_object_available(struct fscache_object *); 56static void fscache_object_available(struct fscache_object *);
@@ -64,17 +59,6 @@ static void fscache_withdraw_object(struct fscache_object *);
64static void fscache_enqueue_dependents(struct fscache_object *); 59static void fscache_enqueue_dependents(struct fscache_object *);
65static void fscache_dequeue_object(struct fscache_object *); 60static void fscache_dequeue_object(struct fscache_object *);
66 61
67const struct slow_work_ops fscache_object_slow_work_ops = {
68 .owner = THIS_MODULE,
69 .get_ref = fscache_object_slow_work_get_ref,
70 .put_ref = fscache_object_slow_work_put_ref,
71 .execute = fscache_object_slow_work_execute,
72#ifdef CONFIG_SLOW_WORK_DEBUG
73 .desc = fscache_object_slow_work_desc,
74#endif
75};
76EXPORT_SYMBOL(fscache_object_slow_work_ops);
77
78/* 62/*
79 * we need to notify the parent when an op completes that we had outstanding 63 * we need to notify the parent when an op completes that we had outstanding
80 * upon it 64 * upon it
@@ -345,7 +329,7 @@ unsupported_event:
345/* 329/*
346 * execute an object 330 * execute an object
347 */ 331 */
348static void fscache_object_slow_work_execute(struct slow_work *work) 332void fscache_object_work_func(struct work_struct *work)
349{ 333{
350 struct fscache_object *object = 334 struct fscache_object *object =
351 container_of(work, struct fscache_object, work); 335 container_of(work, struct fscache_object, work);
@@ -359,23 +343,9 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
359 if (object->events & object->event_mask) 343 if (object->events & object->event_mask)
360 fscache_enqueue_object(object); 344 fscache_enqueue_object(object);
361 clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); 345 clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
346 fscache_put_object(object);
362} 347}
363 348EXPORT_SYMBOL(fscache_object_work_func);
364/*
365 * describe an object for slow-work debugging
366 */
367#ifdef CONFIG_SLOW_WORK_DEBUG
368static void fscache_object_slow_work_desc(struct slow_work *work,
369 struct seq_file *m)
370{
371 struct fscache_object *object =
372 container_of(work, struct fscache_object, work);
373
374 seq_printf(m, "FSC: OBJ%x: %s",
375 object->debug_id,
376 fscache_object_states_short[object->state]);
377}
378#endif
379 349
380/* 350/*
381 * initialise an object 351 * initialise an object
@@ -393,7 +363,6 @@ static void fscache_initialise_object(struct fscache_object *object)
393 _enter(""); 363 _enter("");
394 ASSERT(object->cookie != NULL); 364 ASSERT(object->cookie != NULL);
395 ASSERT(object->cookie->parent != NULL); 365 ASSERT(object->cookie->parent != NULL);
396 ASSERT(list_empty(&object->work.link));
397 366
398 if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) | 367 if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) |
399 (1 << FSCACHE_OBJECT_EV_RELEASE) | 368 (1 << FSCACHE_OBJECT_EV_RELEASE) |
@@ -671,10 +640,8 @@ static void fscache_drop_object(struct fscache_object *object)
671 object->parent = NULL; 640 object->parent = NULL;
672 } 641 }
673 642
674 /* this just shifts the object release to the slow work processor */ 643 /* this just shifts the object release to the work processor */
675 fscache_stat(&fscache_n_cop_put_object); 644 fscache_put_object(object);
676 object->cache->ops->put_object(object);
677 fscache_stat_d(&fscache_n_cop_put_object);
678 645
679 _leave(""); 646 _leave("");
680} 647}
@@ -758,12 +725,10 @@ void fscache_withdrawing_object(struct fscache_cache *cache,
758} 725}
759 726
760/* 727/*
761 * allow the slow work item processor to get a ref on an object 728 * get a ref on an object
762 */ 729 */
763static int fscache_object_slow_work_get_ref(struct slow_work *work) 730static int fscache_get_object(struct fscache_object *object)
764{ 731{
765 struct fscache_object *object =
766 container_of(work, struct fscache_object, work);
767 int ret; 732 int ret;
768 733
769 fscache_stat(&fscache_n_cop_grab_object); 734 fscache_stat(&fscache_n_cop_grab_object);
@@ -773,13 +738,10 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work)
773} 738}
774 739
775/* 740/*
776 * allow the slow work item processor to discard a ref on a work item 741 * discard a ref on a work item
777 */ 742 */
778static void fscache_object_slow_work_put_ref(struct slow_work *work) 743static void fscache_put_object(struct fscache_object *object)
779{ 744{
780 struct fscache_object *object =
781 container_of(work, struct fscache_object, work);
782
783 fscache_stat(&fscache_n_cop_put_object); 745 fscache_stat(&fscache_n_cop_put_object);
784 object->cache->ops->put_object(object); 746 object->cache->ops->put_object(object);
785 fscache_stat_d(&fscache_n_cop_put_object); 747 fscache_stat_d(&fscache_n_cop_put_object);
@@ -792,8 +754,48 @@ void fscache_enqueue_object(struct fscache_object *object)
792{ 754{
793 _enter("{OBJ%x}", object->debug_id); 755 _enter("{OBJ%x}", object->debug_id);
794 756
795 slow_work_enqueue(&object->work); 757 if (fscache_get_object(object) >= 0) {
758 wait_queue_head_t *cong_wq =
759 &get_cpu_var(fscache_object_cong_wait);
760
761 if (queue_work(fscache_object_wq, &object->work)) {
762 if (fscache_object_congested())
763 wake_up(cong_wq);
764 } else
765 fscache_put_object(object);
766
767 put_cpu_var(fscache_object_cong_wait);
768 }
769}
770
771/**
772 * fscache_object_sleep_till_congested - Sleep until object wq is congested
 773 * @timeoutp: Scheduler sleep timeout

774 *
775 * Allow an object handler to sleep until the object workqueue is congested.
776 *
777 * The caller must set up a wake up event before calling this and must have set
778 * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
779 * condition before calling this function as no test is made here.
780 *
781 * %true is returned if the object wq is congested, %false otherwise.
782 */
783bool fscache_object_sleep_till_congested(signed long *timeoutp)
784{
785 wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait);
786 DEFINE_WAIT(wait);
787
788 if (fscache_object_congested())
789 return true;
790
791 add_wait_queue_exclusive(cong_wq, &wait);
792 if (!fscache_object_congested())
793 *timeoutp = schedule_timeout(*timeoutp);
794 finish_wait(cong_wq, &wait);
795
796 return fscache_object_congested();
796} 797}
798EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested);
797 799
798/* 800/*
799 * enqueue the dependents of an object for metadata-type processing 801 * enqueue the dependents of an object for metadata-type processing
@@ -819,9 +821,7 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
819 821
820 /* sort onto appropriate lists */ 822 /* sort onto appropriate lists */
821 fscache_enqueue_object(dep); 823 fscache_enqueue_object(dep);
822 fscache_stat(&fscache_n_cop_put_object); 824 fscache_put_object(dep);
823 dep->cache->ops->put_object(dep);
824 fscache_stat_d(&fscache_n_cop_put_object);
825 825
826 if (!list_empty(&object->dependents)) 826 if (!list_empty(&object->dependents))
827 cond_resched_lock(&object->lock); 827 cond_resched_lock(&object->lock);
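fscache_object_sleep_till_congested() leaves the policy with the caller, as its kerneldoc above spells out: pick the sleep mode, test your own condition, and arrange your own wakeup. A sketch of a cache-backend loop built on it; object_is_dead() is a hypothetical predicate standing in for the caller's real condition, which must also have its own wakeup source:

#include <linux/sched.h>
#include <linux/fscache-cache.h>

static void wait_till_object_dead(struct fscache_object *object)
{
	signed long timeout = 60 * HZ;	/* give up after a minute */

	while (!object_is_dead(object) && timeout > 0) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (object_is_dead(object))
			break;
		/* returns true at once if the wq is already congested,
		 * otherwise sleeps up to *timeout and re-checks */
		fscache_object_sleep_till_congested(&timeout);
	}
	__set_current_state(TASK_RUNNING);
}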
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index f17cecafae44..b9f34eaede09 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -42,16 +42,12 @@ void fscache_enqueue_operation(struct fscache_operation *op)
42 42
43 fscache_stat(&fscache_n_op_enqueue); 43 fscache_stat(&fscache_n_op_enqueue);
44 switch (op->flags & FSCACHE_OP_TYPE) { 44 switch (op->flags & FSCACHE_OP_TYPE) {
45 case FSCACHE_OP_FAST: 45 case FSCACHE_OP_ASYNC:
46 _debug("queue fast"); 46 _debug("queue async");
47 atomic_inc(&op->usage); 47 atomic_inc(&op->usage);
48 if (!schedule_work(&op->fast_work)) 48 if (!queue_work(fscache_op_wq, &op->work))
49 fscache_put_operation(op); 49 fscache_put_operation(op);
50 break; 50 break;
51 case FSCACHE_OP_SLOW:
52 _debug("queue slow");
53 slow_work_enqueue(&op->slow_work);
54 break;
55 case FSCACHE_OP_MYTHREAD: 51 case FSCACHE_OP_MYTHREAD:
56 _debug("queue for caller's attention"); 52 _debug("queue for caller's attention");
57 break; 53 break;
@@ -455,36 +451,13 @@ void fscache_operation_gc(struct work_struct *work)
455} 451}
456 452
457/* 453/*
458 * allow the slow work item processor to get a ref on an operation 454 * execute an operation using fs_op_wq to provide processing context -
459 */ 455 * the caller holds a ref to this object, so we don't need to hold one
460static int fscache_op_get_ref(struct slow_work *work)
461{
462 struct fscache_operation *op =
463 container_of(work, struct fscache_operation, slow_work);
464
465 atomic_inc(&op->usage);
466 return 0;
467}
468
469/*
470 * allow the slow work item processor to discard a ref on an operation
471 */
472static void fscache_op_put_ref(struct slow_work *work)
473{
474 struct fscache_operation *op =
475 container_of(work, struct fscache_operation, slow_work);
476
477 fscache_put_operation(op);
478}
479
480/*
481 * execute an operation using the slow thread pool to provide processing context
482 * - the caller holds a ref to this object, so we don't need to hold one
483 */ 456 */
484static void fscache_op_execute(struct slow_work *work) 457void fscache_op_work_func(struct work_struct *work)
485{ 458{
486 struct fscache_operation *op = 459 struct fscache_operation *op =
487 container_of(work, struct fscache_operation, slow_work); 460 container_of(work, struct fscache_operation, work);
488 unsigned long start; 461 unsigned long start;
489 462
490 _enter("{OBJ%x OP%x,%d}", 463 _enter("{OBJ%x OP%x,%d}",
@@ -494,31 +467,7 @@ static void fscache_op_execute(struct slow_work *work)
494 start = jiffies; 467 start = jiffies;
495 op->processor(op); 468 op->processor(op);
496 fscache_hist(fscache_ops_histogram, start); 469 fscache_hist(fscache_ops_histogram, start);
470 fscache_put_operation(op);
497 471
498 _leave(""); 472 _leave("");
499} 473}
500
501/*
502 * describe an operation for slow-work debugging
503 */
504#ifdef CONFIG_SLOW_WORK_DEBUG
505static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
506{
507 struct fscache_operation *op =
508 container_of(work, struct fscache_operation, slow_work);
509
510 seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx",
511 op->object->debug_id, op->debug_id,
512 op->name, op->state, op->flags);
513}
514#endif
515
516const struct slow_work_ops fscache_op_slow_work_ops = {
517 .owner = THIS_MODULE,
518 .get_ref = fscache_op_get_ref,
519 .put_ref = fscache_op_put_ref,
520 .execute = fscache_op_execute,
521#ifdef CONFIG_SLOW_WORK_DEBUG
522 .desc = fscache_op_desc,
523#endif
524};
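fscache_operation_init() now takes the processor callback up front, and fscache_op_work_func() is the single workqueue entry point. The matching header change lives in include/linux/fscache-cache.h, outside this fs/-only diffstat, and is assumed to look roughly like:

/* sketch of the assumed new initializer in fscache-cache.h */
static inline void fscache_operation_init(struct fscache_operation *op,
				fscache_operation_processor_t processor,
				fscache_operation_release_t release)
{
	INIT_WORK(&op->work, fscache_op_work_func);	/* replaces slow_work */
	atomic_set(&op->usage, 1);
	op->processor = processor;
	op->release = release;
	INIT_LIST_HEAD(&op->pend_link);
}

The fs/fscache/page.c hunks below show the callers adopting this shape.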
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 723b889fd219..41c441c2058d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -105,7 +105,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
105 105
106page_busy: 106page_busy:
107 /* we might want to wait here, but that could deadlock the allocator as 107 /* we might want to wait here, but that could deadlock the allocator as
108 * the slow-work threads writing to the cache may all end up sleeping 108 * the work threads writing to the cache may all end up sleeping
109 * on memory allocation */ 109 * on memory allocation */
110 fscache_stat(&fscache_n_store_vmscan_busy); 110 fscache_stat(&fscache_n_store_vmscan_busy);
111 return false; 111 return false;
@@ -188,9 +188,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
188 return -ENOMEM; 188 return -ENOMEM;
189 } 189 }
190 190
191 fscache_operation_init(op, NULL); 191 fscache_operation_init(op, fscache_attr_changed_op, NULL);
192 fscache_operation_init_slow(op, fscache_attr_changed_op); 192 op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
193 op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
194 fscache_set_op_name(op, "Attr"); 193 fscache_set_op_name(op, "Attr");
195 194
196 spin_lock(&cookie->lock); 195 spin_lock(&cookie->lock);
@@ -218,24 +217,6 @@ nobufs:
218EXPORT_SYMBOL(__fscache_attr_changed); 217EXPORT_SYMBOL(__fscache_attr_changed);
219 218
220/* 219/*
221 * handle secondary execution given to a retrieval op on behalf of the
222 * cache
223 */
224static void fscache_retrieval_work(struct work_struct *work)
225{
226 struct fscache_retrieval *op =
227 container_of(work, struct fscache_retrieval, op.fast_work);
228 unsigned long start;
229
230 _enter("{OP%x}", op->op.debug_id);
231
232 start = jiffies;
233 op->op.processor(&op->op);
234 fscache_hist(fscache_ops_histogram, start);
235 fscache_put_operation(&op->op);
236}
237
238/*
239 * release a retrieval op reference 220 * release a retrieval op reference
240 */ 221 */
241static void fscache_release_retrieval_op(struct fscache_operation *_op) 222static void fscache_release_retrieval_op(struct fscache_operation *_op)
@@ -269,13 +250,12 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
269 return NULL; 250 return NULL;
270 } 251 }
271 252
272 fscache_operation_init(&op->op, fscache_release_retrieval_op); 253 fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
273 op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); 254 op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING);
274 op->mapping = mapping; 255 op->mapping = mapping;
275 op->end_io_func = end_io_func; 256 op->end_io_func = end_io_func;
276 op->context = context; 257 op->context = context;
277 op->start_time = jiffies; 258 op->start_time = jiffies;
278 INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
279 INIT_LIST_HEAD(&op->to_do); 259 INIT_LIST_HEAD(&op->to_do);
280 fscache_set_op_name(&op->op, "Retr"); 260 fscache_set_op_name(&op->op, "Retr");
281 return op; 261 return op;
@@ -795,9 +775,9 @@ int __fscache_write_page(struct fscache_cookie *cookie,
795 if (!op) 775 if (!op)
796 goto nomem; 776 goto nomem;
797 777
798 fscache_operation_init(&op->op, fscache_release_write_op); 778 fscache_operation_init(&op->op, fscache_write_op,
799 fscache_operation_init_slow(&op->op, fscache_write_op); 779 fscache_release_write_op);
800 op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING); 780 op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
801 fscache_set_op_name(&op->op, "Write1"); 781 fscache_set_op_name(&op->op, "Write1");
802 782
803 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 783 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
@@ -852,7 +832,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
852 fscache_stat(&fscache_n_store_ops); 832 fscache_stat(&fscache_n_store_ops);
853 fscache_stat(&fscache_n_stores_ok); 833 fscache_stat(&fscache_n_stores_ok);
854 834
855 /* the slow work queue now carries its own ref on the object */ 835 /* the work queue now carries its own ref on the object */
856 fscache_put_operation(&op->op); 836 fscache_put_operation(&op->op);
857 _leave(" = 0"); 837 _leave(" = 0");
858 return 0; 838 return 0;
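
Both conversions in fs/fscache/page.c follow one recipe: the processor callback moves into fscache_operation_init(), the separate _init_slow() step disappears, and FSCACHE_OP_SLOW becomes FSCACHE_OP_ASYNC. Restated compactly as a sketch (this mirrors the hunks above; it is not independent API documentation):

/* Before: two-step init plus a "slow" flag.
 *     fscache_operation_init(op, release);
 *     fscache_operation_init_slow(op, processor);
 *     op->flags = FSCACHE_OP_SLOW | ...;
 *
 * After: one initialiser taking (op, processor, release), with the op
 * marked for asynchronous execution on the shared work infrastructure. */
static void demo_init_attr_op(struct fscache_operation *op)
{
        fscache_operation_init(op, fscache_attr_changed_op, NULL);
        op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
        fscache_set_op_name(op, "Attr");
}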
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 9424796d6634..69ad053ffd78 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -239,7 +239,6 @@ static u64 fuse_get_unique(struct fuse_conn *fc)
239 239
240static void queue_request(struct fuse_conn *fc, struct fuse_req *req) 240static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
241{ 241{
242 req->in.h.unique = fuse_get_unique(fc);
243 req->in.h.len = sizeof(struct fuse_in_header) + 242 req->in.h.len = sizeof(struct fuse_in_header) +
244 len_args(req->in.numargs, (struct fuse_arg *) req->in.args); 243 len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
245 list_add_tail(&req->list, &fc->pending); 244 list_add_tail(&req->list, &fc->pending);
@@ -261,6 +260,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
261 req = list_entry(fc->bg_queue.next, struct fuse_req, list); 260 req = list_entry(fc->bg_queue.next, struct fuse_req, list);
262 list_del(&req->list); 261 list_del(&req->list);
263 fc->active_background++; 262 fc->active_background++;
263 req->in.h.unique = fuse_get_unique(fc);
264 queue_request(fc, req); 264 queue_request(fc, req);
265 } 265 }
266} 266}
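
Moving the unique-ID assignment out of queue_request() matters because a queued request can no longer be assumed to need a fresh ID: the notify-reply path added below arrives with a caller-chosen unique. fuse_get_unique() itself is assumed to be nothing more than a wrapping counter protected by fc->lock, along these lines (a sketch of the fuse_conn counter of this period, not quoted from the file):

static u64 fuse_get_unique(struct fuse_conn *fc)
{
        fc->reqctr++;
        /* zero is reserved for unsolicited notifications */
        if (fc->reqctr == 0)
                fc->reqctr = 1;

        return fc->reqctr;
}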
@@ -398,6 +398,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
398 else if (fc->conn_error) 398 else if (fc->conn_error)
399 req->out.h.error = -ECONNREFUSED; 399 req->out.h.error = -ECONNREFUSED;
400 else { 400 else {
401 req->in.h.unique = fuse_get_unique(fc);
401 queue_request(fc, req); 402 queue_request(fc, req);
402 /* acquire extra reference, since request is still needed 403 /* acquire extra reference, since request is still needed
403 after request_end() */ 404 after request_end() */
@@ -450,6 +451,23 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
450} 451}
451EXPORT_SYMBOL_GPL(fuse_request_send_background); 452EXPORT_SYMBOL_GPL(fuse_request_send_background);
452 453
454static int fuse_request_send_notify_reply(struct fuse_conn *fc,
455 struct fuse_req *req, u64 unique)
456{
457 int err = -ENODEV;
458
459 req->isreply = 0;
460 req->in.h.unique = unique;
461 spin_lock(&fc->lock);
462 if (fc->connected) {
463 queue_request(fc, req);
464 err = 0;
465 }
466 spin_unlock(&fc->lock);
467
468 return err;
469}
470
453/* 471/*
454 * Called under fc->lock 472 * Called under fc->lock
455 * 473 *
@@ -535,13 +553,13 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)
535 if (!cs->write) { 553 if (!cs->write) {
536 buf->ops->unmap(cs->pipe, buf, cs->mapaddr); 554 buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
537 } else { 555 } else {
538 kunmap_atomic(cs->mapaddr, KM_USER0); 556 kunmap(buf->page);
539 buf->len = PAGE_SIZE - cs->len; 557 buf->len = PAGE_SIZE - cs->len;
540 } 558 }
541 cs->currbuf = NULL; 559 cs->currbuf = NULL;
542 cs->mapaddr = NULL; 560 cs->mapaddr = NULL;
543 } else if (cs->mapaddr) { 561 } else if (cs->mapaddr) {
544 kunmap_atomic(cs->mapaddr, KM_USER0); 562 kunmap(cs->pg);
545 if (cs->write) { 563 if (cs->write) {
546 flush_dcache_page(cs->pg); 564 flush_dcache_page(cs->pg);
547 set_page_dirty_lock(cs->pg); 565 set_page_dirty_lock(cs->pg);
@@ -572,7 +590,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
572 590
573 BUG_ON(!cs->nr_segs); 591 BUG_ON(!cs->nr_segs);
574 cs->currbuf = buf; 592 cs->currbuf = buf;
575 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); 593 cs->mapaddr = buf->ops->map(cs->pipe, buf, 0);
576 cs->len = buf->len; 594 cs->len = buf->len;
577 cs->buf = cs->mapaddr + buf->offset; 595 cs->buf = cs->mapaddr + buf->offset;
578 cs->pipebufs++; 596 cs->pipebufs++;
@@ -592,7 +610,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
592 buf->len = 0; 610 buf->len = 0;
593 611
594 cs->currbuf = buf; 612 cs->currbuf = buf;
595 cs->mapaddr = kmap_atomic(page, KM_USER0); 613 cs->mapaddr = kmap(page);
596 cs->buf = cs->mapaddr; 614 cs->buf = cs->mapaddr;
597 cs->len = PAGE_SIZE; 615 cs->len = PAGE_SIZE;
598 cs->pipebufs++; 616 cs->pipebufs++;
@@ -611,7 +629,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
611 return err; 629 return err;
612 BUG_ON(err != 1); 630 BUG_ON(err != 1);
613 offset = cs->addr % PAGE_SIZE; 631 offset = cs->addr % PAGE_SIZE;
614 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0); 632 cs->mapaddr = kmap(cs->pg);
615 cs->buf = cs->mapaddr + offset; 633 cs->buf = cs->mapaddr + offset;
616 cs->len = min(PAGE_SIZE - offset, cs->seglen); 634 cs->len = min(PAGE_SIZE - offset, cs->seglen);
617 cs->seglen -= cs->len; 635 cs->seglen -= cs->len;
@@ -1231,6 +1249,199 @@ err:
1231 return err; 1249 return err;
1232} 1250}
1233 1251
1252static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
1253 struct fuse_copy_state *cs)
1254{
1255 struct fuse_notify_store_out outarg;
1256 struct inode *inode;
1257 struct address_space *mapping;
1258 u64 nodeid;
1259 int err;
1260 pgoff_t index;
1261 unsigned int offset;
1262 unsigned int num;
1263 loff_t file_size;
1264 loff_t end;
1265
1266 err = -EINVAL;
1267 if (size < sizeof(outarg))
1268 goto out_finish;
1269
1270 err = fuse_copy_one(cs, &outarg, sizeof(outarg));
1271 if (err)
1272 goto out_finish;
1273
1274 err = -EINVAL;
1275 if (size - sizeof(outarg) != outarg.size)
1276 goto out_finish;
1277
1278 nodeid = outarg.nodeid;
1279
1280 down_read(&fc->killsb);
1281
1282 err = -ENOENT;
1283 if (!fc->sb)
1284 goto out_up_killsb;
1285
1286 inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
1287 if (!inode)
1288 goto out_up_killsb;
1289
1290 mapping = inode->i_mapping;
1291 index = outarg.offset >> PAGE_CACHE_SHIFT;
1292 offset = outarg.offset & ~PAGE_CACHE_MASK;
1293 file_size = i_size_read(inode);
1294 end = outarg.offset + outarg.size;
1295 if (end > file_size) {
1296 file_size = end;
1297 fuse_write_update_size(inode, file_size);
1298 }
1299
1300 num = outarg.size;
1301 while (num) {
1302 struct page *page;
1303 unsigned int this_num;
1304
1305 err = -ENOMEM;
1306 page = find_or_create_page(mapping, index,
1307 mapping_gfp_mask(mapping));
1308 if (!page)
1309 goto out_iput;
1310
1311 this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
1312 err = fuse_copy_page(cs, &page, offset, this_num, 0);
1313 if (!err && offset == 0 && (num != 0 || file_size == end))
1314 SetPageUptodate(page);
1315 unlock_page(page);
1316 page_cache_release(page);
1317
1318 if (err)
1319 goto out_iput;
1320
1321 num -= this_num;
1322 offset = 0;
1323 index++;
1324 }
1325
1326 err = 0;
1327
1328out_iput:
1329 iput(inode);
1330out_up_killsb:
1331 up_read(&fc->killsb);
1332out_finish:
1333 fuse_copy_finish(cs);
1334 return err;
1335}
1336
1337static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
1338{
1339 int i;
1340
1341 for (i = 0; i < req->num_pages; i++) {
1342 struct page *page = req->pages[i];
1343 page_cache_release(page);
1344 }
1345}
1346
1347static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
1348 struct fuse_notify_retrieve_out *outarg)
1349{
1350 int err;
1351 struct address_space *mapping = inode->i_mapping;
1352 struct fuse_req *req;
1353 pgoff_t index;
1354 loff_t file_size;
1355 unsigned int num;
1356 unsigned int offset;
1357 size_t total_len = 0;
1358
1359 req = fuse_get_req(fc);
1360 if (IS_ERR(req))
1361 return PTR_ERR(req);
1362
1363 offset = outarg->offset & ~PAGE_CACHE_MASK;
1364
1365 req->in.h.opcode = FUSE_NOTIFY_REPLY;
1366 req->in.h.nodeid = outarg->nodeid;
1367 req->in.numargs = 2;
1368 req->in.argpages = 1;
1369 req->page_offset = offset;
1370 req->end = fuse_retrieve_end;
1371
1372 index = outarg->offset >> PAGE_CACHE_SHIFT;
1373 file_size = i_size_read(inode);
1374 num = outarg->size;
1375 if (outarg->offset > file_size)
1376 num = 0;
1377 else if (outarg->offset + num > file_size)
1378 num = file_size - outarg->offset;
1379
1380 while (num) {
1381 struct page *page;
1382 unsigned int this_num;
1383
1384 page = find_get_page(mapping, index);
1385 if (!page)
1386 break;
1387
1388 this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
1389 req->pages[req->num_pages] = page;
1390 req->num_pages++;
1391
1392 num -= this_num;
1393 total_len += this_num;
1394 }
1395 req->misc.retrieve_in.offset = outarg->offset;
1396 req->misc.retrieve_in.size = total_len;
1397 req->in.args[0].size = sizeof(req->misc.retrieve_in);
1398 req->in.args[0].value = &req->misc.retrieve_in;
1399 req->in.args[1].size = total_len;
1400
1401 err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique);
1402 if (err)
1403 fuse_retrieve_end(fc, req);
1404
1405 return err;
1406}
1407
1408static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
1409 struct fuse_copy_state *cs)
1410{
1411 struct fuse_notify_retrieve_out outarg;
1412 struct inode *inode;
1413 int err;
1414
1415 err = -EINVAL;
1416 if (size != sizeof(outarg))
1417 goto copy_finish;
1418
1419 err = fuse_copy_one(cs, &outarg, sizeof(outarg));
1420 if (err)
1421 goto copy_finish;
1422
1423 fuse_copy_finish(cs);
1424
1425 down_read(&fc->killsb);
1426 err = -ENOENT;
1427 if (fc->sb) {
1428 u64 nodeid = outarg.nodeid;
1429
1430 inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
1431 if (inode) {
1432 err = fuse_retrieve(fc, inode, &outarg);
1433 iput(inode);
1434 }
1435 }
1436 up_read(&fc->killsb);
1437
1438 return err;
1439
1440copy_finish:
1441 fuse_copy_finish(cs);
1442 return err;
1443}
1444
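
Retrieve is a round trip: the daemon sends FUSE_NOTIFY_RETRIEVE carrying a fuse_notify_retrieve_out, fuse_retrieve() gathers the cached pages, and the kernel answers with a FUSE_NOTIFY_REPLY request whose first argument is the fuse_notify_retrieve_in stashed in req->misc. A sketch of the two structures; the field names are assumed from this era's <linux/fuse.h> and should be treated as illustrative:

struct fuse_notify_retrieve_out {       /* daemon -> kernel: please send */
        __u64   notify_unique;          /* echoed in the reply header */
        __u64   nodeid;
        __u64   offset;
        __u32   size;
        __u32   padding;
};

/* Believed to be padded to match fuse_write_in, so the daemon can share
 * parsing code with ordinary writes. */
struct fuse_notify_retrieve_in {        /* kernel -> daemon reply arg */
        __u64   dummy1;
        __u64   offset;
        __u32   size;
        __u32   dummy2;
        __u64   dummy3;
        __u64   dummy4;
};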
1234static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, 1445static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
1235 unsigned int size, struct fuse_copy_state *cs) 1446 unsigned int size, struct fuse_copy_state *cs)
1236{ 1447{
@@ -1244,6 +1455,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
1244 case FUSE_NOTIFY_INVAL_ENTRY: 1455 case FUSE_NOTIFY_INVAL_ENTRY:
1245 return fuse_notify_inval_entry(fc, size, cs); 1456 return fuse_notify_inval_entry(fc, size, cs);
1246 1457
1458 case FUSE_NOTIFY_STORE:
1459 return fuse_notify_store(fc, size, cs);
1460
1461 case FUSE_NOTIFY_RETRIEVE:
1462 return fuse_notify_retrieve(fc, size, cs);
1463
1247 default: 1464 default:
1248 fuse_copy_finish(cs); 1465 fuse_copy_finish(cs);
1249 return -EINVAL; 1466 return -EINVAL;
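
For dispatch, a notification is an ordinary device write whose header has unique == 0; the error field then carries the notify code that the switch above examines. A store notification from the daemon is therefore laid out as header, out-arg, data. A userspace-compilable sketch of that shape (layouts assumed, not quoted from the uapi header):

#include <stdint.h>

struct demo_out_header {                /* cf. struct fuse_out_header */
        uint32_t len;                   /* total message length */
        int32_t  error;                 /* notify code, e.g. FUSE_NOTIFY_STORE */
        uint64_t unique;                /* 0 marks a notification */
};

struct demo_notify_store_out {          /* cf. struct fuse_notify_store_out */
        uint64_t nodeid;                /* inode whose cache to populate */
        uint64_t offset;                /* byte offset within the file */
        uint32_t size;                  /* bytes of data that follow */
        uint32_t padding;
};
/* The wire format is then:
 * [demo_out_header][demo_notify_store_out][size bytes of file data],
 * which is exactly what fuse_notify_store() above unpacks. */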
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index ada0adeb3bb5..147c1f71bdb9 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -706,7 +706,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
706 return 0; 706 return 0;
707} 707}
708 708
709static void fuse_write_update_size(struct inode *inode, loff_t pos) 709void fuse_write_update_size(struct inode *inode, loff_t pos)
710{ 710{
711 struct fuse_conn *fc = get_fuse_conn(inode); 711 struct fuse_conn *fc = get_fuse_conn(inode);
712 struct fuse_inode *fi = get_fuse_inode(inode); 712 struct fuse_inode *fi = get_fuse_inode(inode);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 8f309f04064e..57d4a3a0f102 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -272,6 +272,7 @@ struct fuse_req {
272 struct fuse_write_in in; 272 struct fuse_write_in in;
273 struct fuse_write_out out; 273 struct fuse_write_out out;
274 } write; 274 } write;
275 struct fuse_notify_retrieve_in retrieve_in;
275 struct fuse_lk_in lk_in; 276 struct fuse_lk_in lk_in;
276 } misc; 277 } misc;
277 278
@@ -748,4 +749,6 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
748unsigned fuse_file_poll(struct file *file, poll_table *wait); 749unsigned fuse_file_poll(struct file *file, poll_table *wait);
749int fuse_dev_release(struct inode *inode, struct file *file); 750int fuse_dev_release(struct inode *inode, struct file *file);
750 751
752void fuse_write_update_size(struct inode *inode, loff_t pos);
753
751#endif /* _FS_FUSE_I_H */ 754#endif /* _FS_FUSE_I_H */
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index a47b43107112..cc9665522148 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -7,7 +7,6 @@ config GFS2_FS
7 select IP_SCTP if DLM_SCTP 7 select IP_SCTP if DLM_SCTP
8 select FS_POSIX_ACL 8 select FS_POSIX_ACL
9 select CRC32 9 select CRC32
10 select SLOW_WORK
11 select QUOTACTL 10 select QUOTACTL
12 help 11 help
13 A cluster filesystem. 12 A cluster filesystem.
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 8fcbce48a128..fdbf4b366fa5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,7 +12,6 @@
12 12
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/workqueue.h> 14#include <linux/workqueue.h>
15#include <linux/slow-work.h>
16#include <linux/dlm.h> 15#include <linux/dlm.h>
17#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
18 17
@@ -383,7 +382,7 @@ struct gfs2_journal_extent {
383struct gfs2_jdesc { 382struct gfs2_jdesc {
384 struct list_head jd_list; 383 struct list_head jd_list;
385 struct list_head extent_list; 384 struct list_head extent_list;
386 struct slow_work jd_work; 385 struct work_struct jd_work;
387 struct inode *jd_inode; 386 struct inode *jd_inode;
388 unsigned long jd_flags; 387 unsigned long jd_flags;
389#define JDF_RECOVERY 1 388#define JDF_RECOVERY 1
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index fb2a5f93b7c3..b1e9630eb46a 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -15,7 +15,6 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <asm/atomic.h> 17#include <asm/atomic.h>
18#include <linux/slow-work.h>
19 18
20#include "gfs2.h" 19#include "gfs2.h"
21#include "incore.h" 20#include "incore.h"
@@ -24,6 +23,7 @@
24#include "util.h" 23#include "util.h"
25#include "glock.h" 24#include "glock.h"
26#include "quota.h" 25#include "quota.h"
26#include "recovery.h"
27 27
28static struct shrinker qd_shrinker = { 28static struct shrinker qd_shrinker = {
29 .shrink = gfs2_shrink_qd_memory, 29 .shrink = gfs2_shrink_qd_memory,
@@ -138,9 +138,11 @@ static int __init init_gfs2_fs(void)
138 if (error) 138 if (error)
139 goto fail_unregister; 139 goto fail_unregister;
140 140
141 error = slow_work_register_user(THIS_MODULE); 141 error = -ENOMEM;
142 if (error) 142 gfs_recovery_wq = alloc_workqueue("gfs_recovery",
143 goto fail_slow; 143 WQ_NON_REENTRANT | WQ_RESCUER, 0);
144 if (!gfs_recovery_wq)
145 goto fail_wq;
144 146
145 gfs2_register_debugfs(); 147 gfs2_register_debugfs();
146 148
@@ -148,7 +150,7 @@ static int __init init_gfs2_fs(void)
148 150
149 return 0; 151 return 0;
150 152
151fail_slow: 153fail_wq:
152 unregister_filesystem(&gfs2meta_fs_type); 154 unregister_filesystem(&gfs2meta_fs_type);
153fail_unregister: 155fail_unregister:
154 unregister_filesystem(&gfs2_fs_type); 156 unregister_filesystem(&gfs2_fs_type);
@@ -190,7 +192,7 @@ static void __exit exit_gfs2_fs(void)
190 gfs2_unregister_debugfs(); 192 gfs2_unregister_debugfs();
191 unregister_filesystem(&gfs2_fs_type); 193 unregister_filesystem(&gfs2_fs_type);
192 unregister_filesystem(&gfs2meta_fs_type); 194 unregister_filesystem(&gfs2meta_fs_type);
193 slow_work_unregister_user(THIS_MODULE); 195 destroy_workqueue(gfs_recovery_wq);
194 196
195 kmem_cache_destroy(gfs2_quotad_cachep); 197 kmem_cache_destroy(gfs2_quotad_cachep);
196 kmem_cache_destroy(gfs2_rgrpd_cachep); 198 kmem_cache_destroy(gfs2_rgrpd_cachep);
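
The init/exit conversion above trades slow-work registration for a private workqueue with a rescuer thread, so recovery work can make progress even under memory pressure. As a self-contained module skeleton (demo_wq and the "demo" name are placeholders; the flags are the ones the patch uses, from the workqueue API of this era):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;        /* cf. gfs_recovery_wq */

static int __init demo_init(void)
{
        /* WQ_RESCUER guarantees forward progress when new kworkers cannot
         * be spawned; WQ_NON_REENTRANT keeps one work item off two CPUs. */
        demo_wq = alloc_workqueue("demo", WQ_NON_REENTRANT | WQ_RESCUER, 0);
        if (!demo_wq)
                return -ENOMEM;
        return 0;
}

static void __exit demo_exit(void)
{
        destroy_workqueue(demo_wq);     /* flushes pending work first */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");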
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 45a4a36195d8..4f44bdeb2f03 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,7 +17,6 @@
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/mount.h> 18#include <linux/mount.h>
19#include <linux/gfs2_ondisk.h> 19#include <linux/gfs2_ondisk.h>
20#include <linux/slow-work.h>
21#include <linux/quotaops.h> 20#include <linux/quotaops.h>
22 21
23#include "gfs2.h" 22#include "gfs2.h"
@@ -673,7 +672,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
673 break; 672 break;
674 673
675 INIT_LIST_HEAD(&jd->extent_list); 674 INIT_LIST_HEAD(&jd->extent_list);
676 slow_work_init(&jd->jd_work, &gfs2_recover_ops); 675 INIT_WORK(&jd->jd_work, gfs2_recover_func);
677 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); 676 jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
678 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { 677 if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
679 if (!jd->jd_inode) 678 if (!jd->jd_inode)
@@ -782,7 +781,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
782 if (sdp->sd_lockstruct.ls_first) { 781 if (sdp->sd_lockstruct.ls_first) {
783 unsigned int x; 782 unsigned int x;
784 for (x = 0; x < sdp->sd_journals; x++) { 783 for (x = 0; x < sdp->sd_journals; x++) {
785 error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x)); 784 error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x),
785 true);
786 if (error) { 786 if (error) {
787 fs_err(sdp, "error recovering journal %u: %d\n", 787 fs_err(sdp, "error recovering journal %u: %d\n",
788 x, error); 788 x, error);
@@ -792,7 +792,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
792 792
793 gfs2_others_may_mount(sdp); 793 gfs2_others_may_mount(sdp);
794 } else if (!sdp->sd_args.ar_spectator) { 794 } else if (!sdp->sd_args.ar_spectator) {
795 error = gfs2_recover_journal(sdp->sd_jdesc); 795 error = gfs2_recover_journal(sdp->sd_jdesc, true);
796 if (error) { 796 if (error) {
797 fs_err(sdp, "error recovering my journal: %d\n", error); 797 fs_err(sdp, "error recovering my journal: %d\n", error);
798 goto fail_jinode_gh; 798 goto fail_jinode_gh;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8bb643cb2658..1bc6b5695e6d 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1449,10 +1449,10 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1449 1449
1450 switch (sdp->sd_args.ar_quota) { 1450 switch (sdp->sd_args.ar_quota) {
1451 case GFS2_QUOTA_ON: 1451 case GFS2_QUOTA_ON:
1452 fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); 1452 fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD);
1453 /*FALLTHRU*/ 1453 /*FALLTHRU*/
1454 case GFS2_QUOTA_ACCOUNT: 1454 case GFS2_QUOTA_ACCOUNT:
1455 fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); 1455 fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT);
1456 break; 1456 break;
1457 case GFS2_QUOTA_OFF: 1457 case GFS2_QUOTA_OFF:
1458 break; 1458 break;
@@ -1498,7 +1498,7 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id,
1498 1498
1499 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; 1499 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
1500 fdq->d_version = FS_DQUOT_VERSION; 1500 fdq->d_version = FS_DQUOT_VERSION;
1501 fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA; 1501 fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
1502 fdq->d_id = id; 1502 fdq->d_id = id;
1503 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); 1503 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit);
1504 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); 1504 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn);
@@ -1533,12 +1533,12 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1533 switch(type) { 1533 switch(type) {
1534 case USRQUOTA: 1534 case USRQUOTA:
1535 type = QUOTA_USER; 1535 type = QUOTA_USER;
1536 if (fdq->d_flags != XFS_USER_QUOTA) 1536 if (fdq->d_flags != FS_USER_QUOTA)
1537 return -EINVAL; 1537 return -EINVAL;
1538 break; 1538 break;
1539 case GRPQUOTA: 1539 case GRPQUOTA:
1540 type = QUOTA_GROUP; 1540 type = QUOTA_GROUP;
1541 if (fdq->d_flags != XFS_GROUP_QUOTA) 1541 if (fdq->d_flags != FS_GROUP_QUOTA)
1542 return -EINVAL; 1542 return -EINVAL;
1543 break; 1543 break;
1544 default: 1544 default:
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 4b9bece3d437..f7f89a94a5a4 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,7 +14,6 @@
14#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/crc32.h> 16#include <linux/crc32.h>
17#include <linux/slow-work.h>
18 17
19#include "gfs2.h" 18#include "gfs2.h"
20#include "incore.h" 19#include "incore.h"
@@ -28,6 +27,8 @@
28#include "util.h" 27#include "util.h"
29#include "dir.h" 28#include "dir.h"
30 29
30struct workqueue_struct *gfs_recovery_wq;
31
31int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, 32int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
32 struct buffer_head **bh) 33 struct buffer_head **bh)
33{ 34{
@@ -443,23 +444,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
443 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); 444 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
444} 445}
445 446
446static int gfs2_recover_get_ref(struct slow_work *work) 447void gfs2_recover_func(struct work_struct *work)
447{
448 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
449 if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
450 return -EBUSY;
451 return 0;
452}
453
454static void gfs2_recover_put_ref(struct slow_work *work)
455{
456 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
457 clear_bit(JDF_RECOVERY, &jd->jd_flags);
458 smp_mb__after_clear_bit();
459 wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
460}
461
462static void gfs2_recover_work(struct slow_work *work)
463{ 448{
464 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); 449 struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
465 struct gfs2_inode *ip = GFS2_I(jd->jd_inode); 450 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -578,7 +563,7 @@ static void gfs2_recover_work(struct slow_work *work)
578 gfs2_glock_dq_uninit(&j_gh); 563 gfs2_glock_dq_uninit(&j_gh);
579 564
580 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); 565 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
581 return; 566 goto done;
582 567
583fail_gunlock_tr: 568fail_gunlock_tr:
584 gfs2_glock_dq_uninit(&t_gh); 569 gfs2_glock_dq_uninit(&t_gh);
@@ -590,32 +575,35 @@ fail_gunlock_j:
590 } 575 }
591 576
592 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); 577 fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
593
594fail: 578fail:
595 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); 579 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
580done:
581 clear_bit(JDF_RECOVERY, &jd->jd_flags);
582 smp_mb__after_clear_bit();
583 wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
596} 584}
597 585
598struct slow_work_ops gfs2_recover_ops = {
599 .owner = THIS_MODULE,
600 .get_ref = gfs2_recover_get_ref,
601 .put_ref = gfs2_recover_put_ref,
602 .execute = gfs2_recover_work,
603};
604
605
606static int gfs2_recovery_wait(void *word) 586static int gfs2_recovery_wait(void *word)
607{ 587{
608 schedule(); 588 schedule();
609 return 0; 589 return 0;
610} 590}
611 591
612int gfs2_recover_journal(struct gfs2_jdesc *jd) 592int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
613{ 593{
614 int rv; 594 int rv;
615 rv = slow_work_enqueue(&jd->jd_work); 595
616 if (rv) 596 if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
617 return rv; 597 return -EBUSY;
618 wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE); 598
599 /* we have JDF_RECOVERY, queue should always succeed */
600 rv = queue_work(gfs_recovery_wq, &jd->jd_work);
601 BUG_ON(!rv);
602
603 if (wait)
604 wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait,
605 TASK_UNINTERRUPTIBLE);
606
619 return 0; 607 return 0;
620} 608}
621 609
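
The JDF_RECOVERY bit now does triple duty above: test_and_set_bit() serialises submitters, holding the bit guarantees queue_work() cannot find the item already pending (hence the BUG_ON), and wait_on_bit() turns the same bit into a completion. The waker needs a barrier so the cleared bit is visible before any sleeper is woken. The pairing, isolated as a sketch (demo_ names are stand-ins; this mirrors gfs2_recover_func() and gfs2_recover_journal() above):

/* Completion side, run at the end of the work function. */
static void demo_recovery_done(struct gfs2_jdesc *jd)
{
        clear_bit(JDF_RECOVERY, &jd->jd_flags);
        smp_mb__after_clear_bit();      /* order the clear before the wake */
        wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
}

/* Waiting side, used by the synchronous (mount-time) caller. */
static void demo_recovery_wait_for_done(struct gfs2_jdesc *jd)
{
        wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait,
                    TASK_UNINTERRUPTIBLE);
}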
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 1616ac22569a..2226136c7647 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -12,6 +12,8 @@
12 12
13#include "incore.h" 13#include "incore.h"
14 14
15extern struct workqueue_struct *gfs_recovery_wq;
16
15static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) 17static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
16{ 18{
17 if (++*blk == sdp->sd_jdesc->jd_blocks) 19 if (++*blk == sdp->sd_jdesc->jd_blocks)
@@ -27,8 +29,8 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
27 29
28extern int gfs2_find_jhead(struct gfs2_jdesc *jd, 30extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
29 struct gfs2_log_header_host *head); 31 struct gfs2_log_header_host *head);
30extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); 32extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait);
31extern struct slow_work_ops gfs2_recover_ops; 33extern void gfs2_recover_func(struct work_struct *work);
32 34
33#endif /* __RECOVERY_DOT_H__ */ 35#endif /* __RECOVERY_DOT_H__ */
34 36
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d019d0d55e00..ccacffd2faaa 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -25,6 +25,7 @@
25#include "quota.h" 25#include "quota.h"
26#include "util.h" 26#include "util.h"
27#include "glops.h" 27#include "glops.h"
28#include "recovery.h"
28 29
29struct gfs2_attr { 30struct gfs2_attr {
30 struct attribute attr; 31 struct attribute attr;
@@ -376,7 +377,7 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
376 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { 377 list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
377 if (jd->jd_jid != jid) 378 if (jd->jd_jid != jid)
378 continue; 379 continue;
379 rv = slow_work_enqueue(&jd->jd_work); 380 rv = gfs2_recover_journal(jd, false);
380 break; 381 break;
381 } 382 }
382out: 383out:
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 93d1e47647bd..f19ce94693d8 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1281,13 +1281,9 @@ int journal_check_used_features (journal_t *journal, unsigned long compat,
1281int journal_check_available_features (journal_t *journal, unsigned long compat, 1281int journal_check_available_features (journal_t *journal, unsigned long compat,
1282 unsigned long ro, unsigned long incompat) 1282 unsigned long ro, unsigned long incompat)
1283{ 1283{
1284 journal_superblock_t *sb;
1285
1286 if (!compat && !ro && !incompat) 1284 if (!compat && !ro && !incompat)
1287 return 1; 1285 return 1;
1288 1286
1289 sb = journal->j_superblock;
1290
1291 /* We can support any known requested features iff the 1287 /* We can support any known requested features iff the
1292 * superblock is in version 2. Otherwise we fail to support any 1288 * superblock is in version 2. Otherwise we fail to support any
1293 * extended sb features. */ 1289 * extended sb features. */
@@ -1481,7 +1477,6 @@ int journal_flush(journal_t *journal)
1481 1477
1482int journal_wipe(journal_t *journal, int write) 1478int journal_wipe(journal_t *journal, int write)
1483{ 1479{
1484 journal_superblock_t *sb;
1485 int err = 0; 1480 int err = 0;
1486 1481
1487 J_ASSERT (!(journal->j_flags & JFS_LOADED)); 1482 J_ASSERT (!(journal->j_flags & JFS_LOADED));
@@ -1490,8 +1485,6 @@ int journal_wipe(journal_t *journal, int write)
1490 if (err) 1485 if (err)
1491 return err; 1486 return err;
1492 1487
1493 sb = journal->j_superblock;
1494
1495 if (!journal->j_tail) 1488 if (!journal->j_tail)
1496 goto no_recovery; 1489 goto no_recovery;
1497 1490
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 54c9bc9e1b17..81051dafebf5 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -283,12 +283,9 @@ int journal_recover(journal_t *journal)
283int journal_skip_recovery(journal_t *journal) 283int journal_skip_recovery(journal_t *journal)
284{ 284{
285 int err; 285 int err;
286 journal_superblock_t * sb;
287
288 struct recovery_info info; 286 struct recovery_info info;
289 287
290 memset (&info, 0, sizeof(info)); 288 memset (&info, 0, sizeof(info));
291 sb = journal->j_superblock;
292 289
293 err = do_one_pass(journal, &info, PASS_SCAN); 290 err = do_one_pass(journal, &info, PASS_SCAN);
294 291
@@ -297,7 +294,8 @@ int journal_skip_recovery(journal_t *journal)
297 ++journal->j_transaction_sequence; 294 ++journal->j_transaction_sequence;
298 } else { 295 } else {
299#ifdef CONFIG_JBD_DEBUG 296#ifdef CONFIG_JBD_DEBUG
300 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); 297 int dropped = info.end_transaction -
298 be32_to_cpu(journal->j_superblock->s_sequence);
301#endif 299#endif
302 jbd_debug(1, 300 jbd_debug(1,
303 "JBD: ignoring %d transaction%s from the journal.\n", 301 "JBD: ignoring %d transaction%s from the journal.\n",
@@ -321,11 +319,6 @@ static int do_one_pass(journal_t *journal,
321 unsigned int sequence; 319 unsigned int sequence;
322 int blocktype; 320 int blocktype;
323 321
324 /* Precompute the maximum metadata descriptors in a descriptor block */
325 int MAX_BLOCKS_PER_DESC;
326 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
327 / sizeof(journal_block_tag_t));
328
329 /* 322 /*
330 * First thing is to establish what we expect to find in the log 323 * First thing is to establish what we expect to find in the log
331 * (in terms of transaction IDs), and where (in terms of log 324 * (in terms of transaction IDs), and where (in terms of log
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 076d1cc44f95..1c23a0f4e8a3 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -118,13 +118,13 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
118void __jbd2_log_wait_for_space(journal_t *journal) 118void __jbd2_log_wait_for_space(journal_t *journal)
119{ 119{
120 int nblocks, space_left; 120 int nblocks, space_left;
121 assert_spin_locked(&journal->j_state_lock); 121 /* assert_spin_locked(&journal->j_state_lock); */
122 122
123 nblocks = jbd_space_needed(journal); 123 nblocks = jbd_space_needed(journal);
124 while (__jbd2_log_space_left(journal) < nblocks) { 124 while (__jbd2_log_space_left(journal) < nblocks) {
125 if (journal->j_flags & JBD2_ABORT) 125 if (journal->j_flags & JBD2_ABORT)
126 return; 126 return;
127 spin_unlock(&journal->j_state_lock); 127 write_unlock(&journal->j_state_lock);
128 mutex_lock(&journal->j_checkpoint_mutex); 128 mutex_lock(&journal->j_checkpoint_mutex);
129 129
130 /* 130 /*
@@ -138,7 +138,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
138 * filesystem, so abort the journal and leave a stack 138 * filesystem, so abort the journal and leave a stack
139 * trace for forensic evidence. 139 * trace for forensic evidence.
140 */ 140 */
141 spin_lock(&journal->j_state_lock); 141 write_lock(&journal->j_state_lock);
142 spin_lock(&journal->j_list_lock); 142 spin_lock(&journal->j_list_lock);
143 nblocks = jbd_space_needed(journal); 143 nblocks = jbd_space_needed(journal);
144 space_left = __jbd2_log_space_left(journal); 144 space_left = __jbd2_log_space_left(journal);
@@ -149,7 +149,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
149 if (journal->j_committing_transaction) 149 if (journal->j_committing_transaction)
150 tid = journal->j_committing_transaction->t_tid; 150 tid = journal->j_committing_transaction->t_tid;
151 spin_unlock(&journal->j_list_lock); 151 spin_unlock(&journal->j_list_lock);
152 spin_unlock(&journal->j_state_lock); 152 write_unlock(&journal->j_state_lock);
153 if (chkpt) { 153 if (chkpt) {
154 jbd2_log_do_checkpoint(journal); 154 jbd2_log_do_checkpoint(journal);
155 } else if (jbd2_cleanup_journal_tail(journal) == 0) { 155 } else if (jbd2_cleanup_journal_tail(journal) == 0) {
@@ -167,7 +167,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
167 WARN_ON(1); 167 WARN_ON(1);
168 jbd2_journal_abort(journal, 0); 168 jbd2_journal_abort(journal, 0);
169 } 169 }
170 spin_lock(&journal->j_state_lock); 170 write_lock(&journal->j_state_lock);
171 } else { 171 } else {
172 spin_unlock(&journal->j_list_lock); 172 spin_unlock(&journal->j_list_lock);
173 } 173 }
@@ -474,7 +474,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
474 * next transaction ID we will write, and where it will 474 * next transaction ID we will write, and where it will
475 * start. */ 475 * start. */
476 476
477 spin_lock(&journal->j_state_lock); 477 write_lock(&journal->j_state_lock);
478 spin_lock(&journal->j_list_lock); 478 spin_lock(&journal->j_list_lock);
479 transaction = journal->j_checkpoint_transactions; 479 transaction = journal->j_checkpoint_transactions;
480 if (transaction) { 480 if (transaction) {
@@ -496,7 +496,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
496 /* If the oldest pinned transaction is at the tail of the log 496 /* If the oldest pinned transaction is at the tail of the log
497 already then there's not much we can do right now. */ 497 already then there's not much we can do right now. */
498 if (journal->j_tail_sequence == first_tid) { 498 if (journal->j_tail_sequence == first_tid) {
499 spin_unlock(&journal->j_state_lock); 499 write_unlock(&journal->j_state_lock);
500 return 1; 500 return 1;
501 } 501 }
502 502
@@ -516,7 +516,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
516 journal->j_free += freed; 516 journal->j_free += freed;
517 journal->j_tail_sequence = first_tid; 517 journal->j_tail_sequence = first_tid;
518 journal->j_tail = blocknr; 518 journal->j_tail = blocknr;
519 spin_unlock(&journal->j_state_lock); 519 write_unlock(&journal->j_state_lock);
520 520
521 /* 521 /*
522 * If there is an external journal, we need to make sure that 522 * If there is an external journal, we need to make sure that
@@ -775,7 +775,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
775 J_ASSERT(transaction->t_log_list == NULL); 775 J_ASSERT(transaction->t_log_list == NULL);
776 J_ASSERT(transaction->t_checkpoint_list == NULL); 776 J_ASSERT(transaction->t_checkpoint_list == NULL);
777 J_ASSERT(transaction->t_checkpoint_io_list == NULL); 777 J_ASSERT(transaction->t_checkpoint_io_list == NULL);
778 J_ASSERT(transaction->t_updates == 0); 778 J_ASSERT(atomic_read(&transaction->t_updates) == 0);
779 J_ASSERT(journal->j_committing_transaction != transaction); 779 J_ASSERT(journal->j_committing_transaction != transaction);
780 J_ASSERT(journal->j_running_transaction != transaction); 780 J_ASSERT(journal->j_running_transaction != transaction);
781 781
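
From here the jbd2 hunks are dominated by one mechanical conversion: j_state_lock turns from spinlock_t into rwlock_t, letting read-mostly paths run concurrently while writers keep exclusion. The discipline, reduced to a sketch using the journal fields seen in these hunks:

/* Read side: many CPUs may inspect journal state at once. */
static int demo_journal_is_aborted(journal_t *journal)
{
        int aborted;

        read_lock(&journal->j_state_lock);
        aborted = journal->j_flags & JBD2_ABORT;
        read_unlock(&journal->j_state_lock);

        return aborted;
}

/* Write side: state changes take the lock exclusively, sitting exactly
 * where spin_lock()/spin_unlock() used to. */
static void demo_journal_set_flushed(journal_t *journal)
{
        write_lock(&journal->j_state_lock);
        journal->j_flags |= JBD2_FLUSHED;
        write_unlock(&journal->j_state_lock);
}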
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 75716d3d2be0..f52e5e8049f1 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -150,11 +150,11 @@ static int journal_submit_commit_record(journal_t *journal,
150 */ 150 */
151 if (ret == -EOPNOTSUPP && barrier_done) { 151 if (ret == -EOPNOTSUPP && barrier_done) {
152 printk(KERN_WARNING 152 printk(KERN_WARNING
153 "JBD: barrier-based sync failed on %s - " 153 "JBD2: Disabling barriers on %s, "
154 "disabling barriers\n", journal->j_devname); 154 "not supported by device\n", journal->j_devname);
155 spin_lock(&journal->j_state_lock); 155 write_lock(&journal->j_state_lock);
156 journal->j_flags &= ~JBD2_BARRIER; 156 journal->j_flags &= ~JBD2_BARRIER;
157 spin_unlock(&journal->j_state_lock); 157 write_unlock(&journal->j_state_lock);
158 158
159 /* And try again, without the barrier */ 159 /* And try again, without the barrier */
160 lock_buffer(bh); 160 lock_buffer(bh);
@@ -180,11 +180,11 @@ retry:
180 wait_on_buffer(bh); 180 wait_on_buffer(bh);
181 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) { 181 if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
182 printk(KERN_WARNING 182 printk(KERN_WARNING
183 "JBD2: wait_on_commit_record: sync failed on %s - " 183 "JBD2: %s: disabling barriers on %s - not supported "
184 "disabling barriers\n", journal->j_devname); 184 "by device\n", __func__, journal->j_devname);
185 spin_lock(&journal->j_state_lock); 185 write_lock(&journal->j_state_lock);
186 journal->j_flags &= ~JBD2_BARRIER; 186 journal->j_flags &= ~JBD2_BARRIER;
187 spin_unlock(&journal->j_state_lock); 187 write_unlock(&journal->j_state_lock);
188 188
189 lock_buffer(bh); 189 lock_buffer(bh);
190 clear_buffer_dirty(bh); 190 clear_buffer_dirty(bh);
@@ -400,7 +400,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
400 jbd_debug(1, "JBD: starting commit of transaction %d\n", 400 jbd_debug(1, "JBD: starting commit of transaction %d\n",
401 commit_transaction->t_tid); 401 commit_transaction->t_tid);
402 402
403 spin_lock(&journal->j_state_lock); 403 write_lock(&journal->j_state_lock);
404 commit_transaction->t_state = T_LOCKED; 404 commit_transaction->t_state = T_LOCKED;
405 405
406 /* 406 /*
@@ -417,23 +417,23 @@ void jbd2_journal_commit_transaction(journal_t *journal)
417 stats.run.rs_locked); 417 stats.run.rs_locked);
418 418
419 spin_lock(&commit_transaction->t_handle_lock); 419 spin_lock(&commit_transaction->t_handle_lock);
420 while (commit_transaction->t_updates) { 420 while (atomic_read(&commit_transaction->t_updates)) {
421 DEFINE_WAIT(wait); 421 DEFINE_WAIT(wait);
422 422
423 prepare_to_wait(&journal->j_wait_updates, &wait, 423 prepare_to_wait(&journal->j_wait_updates, &wait,
424 TASK_UNINTERRUPTIBLE); 424 TASK_UNINTERRUPTIBLE);
425 if (commit_transaction->t_updates) { 425 if (atomic_read(&commit_transaction->t_updates)) {
426 spin_unlock(&commit_transaction->t_handle_lock); 426 spin_unlock(&commit_transaction->t_handle_lock);
427 spin_unlock(&journal->j_state_lock); 427 write_unlock(&journal->j_state_lock);
428 schedule(); 428 schedule();
429 spin_lock(&journal->j_state_lock); 429 write_lock(&journal->j_state_lock);
430 spin_lock(&commit_transaction->t_handle_lock); 430 spin_lock(&commit_transaction->t_handle_lock);
431 } 431 }
432 finish_wait(&journal->j_wait_updates, &wait); 432 finish_wait(&journal->j_wait_updates, &wait);
433 } 433 }
434 spin_unlock(&commit_transaction->t_handle_lock); 434 spin_unlock(&commit_transaction->t_handle_lock);
435 435
436 J_ASSERT (commit_transaction->t_outstanding_credits <= 436 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
437 journal->j_max_transaction_buffers); 437 journal->j_max_transaction_buffers);
438 438
439 /* 439 /*
@@ -497,7 +497,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
497 start_time = ktime_get(); 497 start_time = ktime_get();
498 commit_transaction->t_log_start = journal->j_head; 498 commit_transaction->t_log_start = journal->j_head;
499 wake_up(&journal->j_wait_transaction_locked); 499 wake_up(&journal->j_wait_transaction_locked);
500 spin_unlock(&journal->j_state_lock); 500 write_unlock(&journal->j_state_lock);
501 501
502 jbd_debug (3, "JBD: commit phase 2\n"); 502 jbd_debug (3, "JBD: commit phase 2\n");
503 503
@@ -519,19 +519,20 @@ void jbd2_journal_commit_transaction(journal_t *journal)
519 * transaction! Now comes the tricky part: we need to write out 519 * transaction! Now comes the tricky part: we need to write out
520 * metadata. Loop over the transaction's entire buffer list: 520 * metadata. Loop over the transaction's entire buffer list:
521 */ 521 */
522 spin_lock(&journal->j_state_lock); 522 write_lock(&journal->j_state_lock);
523 commit_transaction->t_state = T_COMMIT; 523 commit_transaction->t_state = T_COMMIT;
524 spin_unlock(&journal->j_state_lock); 524 write_unlock(&journal->j_state_lock);
525 525
526 trace_jbd2_commit_logging(journal, commit_transaction); 526 trace_jbd2_commit_logging(journal, commit_transaction);
527 stats.run.rs_logging = jiffies; 527 stats.run.rs_logging = jiffies;
528 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, 528 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
529 stats.run.rs_logging); 529 stats.run.rs_logging);
530 stats.run.rs_blocks = commit_transaction->t_outstanding_credits; 530 stats.run.rs_blocks =
531 atomic_read(&commit_transaction->t_outstanding_credits);
531 stats.run.rs_blocks_logged = 0; 532 stats.run.rs_blocks_logged = 0;
532 533
533 J_ASSERT(commit_transaction->t_nr_buffers <= 534 J_ASSERT(commit_transaction->t_nr_buffers <=
534 commit_transaction->t_outstanding_credits); 535 atomic_read(&commit_transaction->t_outstanding_credits));
535 536
536 err = 0; 537 err = 0;
537 descriptor = NULL; 538 descriptor = NULL;
@@ -616,7 +617,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
616 * the free space in the log, but this counter is changed 617 * the free space in the log, but this counter is changed
617 * by jbd2_journal_next_log_block() also. 618 * by jbd2_journal_next_log_block() also.
618 */ 619 */
619 commit_transaction->t_outstanding_credits--; 620 atomic_dec(&commit_transaction->t_outstanding_credits);
620 621
621 /* Bump b_count to prevent truncate from stumbling over 622 /* Bump b_count to prevent truncate from stumbling over
622 the shadowed buffer! @@@ This can go if we ever get 623 the shadowed buffer! @@@ This can go if we ever get
@@ -977,7 +978,7 @@ restart_loop:
977 * __jbd2_journal_drop_transaction(). Otherwise we could race with 978 * __jbd2_journal_drop_transaction(). Otherwise we could race with
978 * other checkpointing code processing the transaction... 979 * other checkpointing code processing the transaction...
979 */ 980 */
980 spin_lock(&journal->j_state_lock); 981 write_lock(&journal->j_state_lock);
981 spin_lock(&journal->j_list_lock); 982 spin_lock(&journal->j_list_lock);
982 /* 983 /*
983 * Now recheck if some buffers did not get attached to the transaction 984 * Now recheck if some buffers did not get attached to the transaction
@@ -985,7 +986,7 @@ restart_loop:
985 */ 986 */
986 if (commit_transaction->t_forget) { 987 if (commit_transaction->t_forget) {
987 spin_unlock(&journal->j_list_lock); 988 spin_unlock(&journal->j_list_lock);
988 spin_unlock(&journal->j_state_lock); 989 write_unlock(&journal->j_state_lock);
989 goto restart_loop; 990 goto restart_loop;
990 } 991 }
991 992
@@ -1003,7 +1004,8 @@ restart_loop:
1003 * File the transaction statistics 1004 * File the transaction statistics
1004 */ 1005 */
1005 stats.ts_tid = commit_transaction->t_tid; 1006 stats.ts_tid = commit_transaction->t_tid;
1006 stats.run.rs_handle_count = commit_transaction->t_handle_count; 1007 stats.run.rs_handle_count =
1008 atomic_read(&commit_transaction->t_handle_count);
1007 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, 1009 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1008 commit_transaction->t_tid, &stats.run); 1010 commit_transaction->t_tid, &stats.run);
1009 1011
@@ -1037,7 +1039,7 @@ restart_loop:
1037 journal->j_average_commit_time*3) / 4; 1039 journal->j_average_commit_time*3) / 4;
1038 else 1040 else
1039 journal->j_average_commit_time = commit_time; 1041 journal->j_average_commit_time = commit_time;
1040 spin_unlock(&journal->j_state_lock); 1042 write_unlock(&journal->j_state_lock);
1041 1043
1042 if (commit_transaction->t_checkpoint_list == NULL && 1044 if (commit_transaction->t_checkpoint_list == NULL &&
1043 commit_transaction->t_checkpoint_io_list == NULL) { 1045 commit_transaction->t_checkpoint_io_list == NULL) {
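
Alongside the rwlock, the per-transaction counters (t_updates, t_outstanding_credits, t_handle_count) become atomic_t, which is what all the atomic_read() conversions above reflect; updating them no longer needs exclusive j_state_lock. A sketch of the accessor pattern, with the caveat that the real update sites live in transaction.c, outside this excerpt:

/* Handle start: account one more updater and its buffer credits. */
static void demo_handle_start(transaction_t *t, int nblocks)
{
        atomic_inc(&t->t_updates);
        atomic_add(nblocks, &t->t_outstanding_credits);
        atomic_inc(&t->t_handle_count);
}

/* Handle stop: the last updater out wakes the committing thread,
 * which is sleeping in the t_updates wait loop shown above. */
static void demo_handle_stop(journal_t *journal, transaction_t *t)
{
        if (atomic_dec_and_test(&t->t_updates))
                wake_up(&journal->j_wait_updates);
}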
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 036880895bfc..ad5866aaf0f9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -41,6 +41,7 @@
41#include <linux/hash.h> 41#include <linux/hash.h>
42#include <linux/log2.h> 42#include <linux/log2.h>
43#include <linux/vmalloc.h> 43#include <linux/vmalloc.h>
44#include <linux/backing-dev.h>
44 45
45#define CREATE_TRACE_POINTS 46#define CREATE_TRACE_POINTS
46#include <trace/events/jbd2.h> 47#include <trace/events/jbd2.h>
@@ -48,8 +49,6 @@
48#include <asm/uaccess.h> 49#include <asm/uaccess.h>
49#include <asm/page.h> 50#include <asm/page.h>
50 51
51EXPORT_SYMBOL(jbd2_journal_start);
52EXPORT_SYMBOL(jbd2_journal_restart);
53EXPORT_SYMBOL(jbd2_journal_extend); 52EXPORT_SYMBOL(jbd2_journal_extend);
54EXPORT_SYMBOL(jbd2_journal_stop); 53EXPORT_SYMBOL(jbd2_journal_stop);
55EXPORT_SYMBOL(jbd2_journal_lock_updates); 54EXPORT_SYMBOL(jbd2_journal_lock_updates);
@@ -143,7 +142,7 @@ static int kjournald2(void *arg)
143 /* 142 /*
144 * And now, wait forever for commit wakeup events. 143 * And now, wait forever for commit wakeup events.
145 */ 144 */
146 spin_lock(&journal->j_state_lock); 145 write_lock(&journal->j_state_lock);
147 146
148loop: 147loop:
149 if (journal->j_flags & JBD2_UNMOUNT) 148 if (journal->j_flags & JBD2_UNMOUNT)
@@ -154,10 +153,10 @@ loop:
154 153
155 if (journal->j_commit_sequence != journal->j_commit_request) { 154 if (journal->j_commit_sequence != journal->j_commit_request) {
156 jbd_debug(1, "OK, requests differ\n"); 155 jbd_debug(1, "OK, requests differ\n");
157 spin_unlock(&journal->j_state_lock); 156 write_unlock(&journal->j_state_lock);
158 del_timer_sync(&journal->j_commit_timer); 157 del_timer_sync(&journal->j_commit_timer);
159 jbd2_journal_commit_transaction(journal); 158 jbd2_journal_commit_transaction(journal);
160 spin_lock(&journal->j_state_lock); 159 write_lock(&journal->j_state_lock);
161 goto loop; 160 goto loop;
162 } 161 }
163 162
@@ -169,9 +168,9 @@ loop:
169 * be already stopped. 168 * be already stopped.
170 */ 169 */
171 jbd_debug(1, "Now suspending kjournald2\n"); 170 jbd_debug(1, "Now suspending kjournald2\n");
172 spin_unlock(&journal->j_state_lock); 171 write_unlock(&journal->j_state_lock);
173 refrigerator(); 172 refrigerator();
174 spin_lock(&journal->j_state_lock); 173 write_lock(&journal->j_state_lock);
175 } else { 174 } else {
176 /* 175 /*
177 * We assume on resume that commits are already there, 176 * We assume on resume that commits are already there,
@@ -191,9 +190,9 @@ loop:
191 if (journal->j_flags & JBD2_UNMOUNT) 190 if (journal->j_flags & JBD2_UNMOUNT)
192 should_sleep = 0; 191 should_sleep = 0;
193 if (should_sleep) { 192 if (should_sleep) {
194 spin_unlock(&journal->j_state_lock); 193 write_unlock(&journal->j_state_lock);
195 schedule(); 194 schedule();
196 spin_lock(&journal->j_state_lock); 195 write_lock(&journal->j_state_lock);
197 } 196 }
198 finish_wait(&journal->j_wait_commit, &wait); 197 finish_wait(&journal->j_wait_commit, &wait);
199 } 198 }
@@ -211,7 +210,7 @@ loop:
211 goto loop; 210 goto loop;
212 211
213end_loop: 212end_loop:
214 spin_unlock(&journal->j_state_lock); 213 write_unlock(&journal->j_state_lock);
215 del_timer_sync(&journal->j_commit_timer); 214 del_timer_sync(&journal->j_commit_timer);
216 journal->j_task = NULL; 215 journal->j_task = NULL;
217 wake_up(&journal->j_wait_done_commit); 216 wake_up(&journal->j_wait_done_commit);
@@ -234,16 +233,16 @@ static int jbd2_journal_start_thread(journal_t *journal)
234 233
235static void journal_kill_thread(journal_t *journal) 234static void journal_kill_thread(journal_t *journal)
236{ 235{
237 spin_lock(&journal->j_state_lock); 236 write_lock(&journal->j_state_lock);
238 journal->j_flags |= JBD2_UNMOUNT; 237 journal->j_flags |= JBD2_UNMOUNT;
239 238
240 while (journal->j_task) { 239 while (journal->j_task) {
241 wake_up(&journal->j_wait_commit); 240 wake_up(&journal->j_wait_commit);
242 spin_unlock(&journal->j_state_lock); 241 write_unlock(&journal->j_state_lock);
243 wait_event(journal->j_wait_done_commit, journal->j_task == NULL); 242 wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
244 spin_lock(&journal->j_state_lock); 243 write_lock(&journal->j_state_lock);
245 } 244 }
246 spin_unlock(&journal->j_state_lock); 245 write_unlock(&journal->j_state_lock);
247} 246}
248 247
249/* 248/*
@@ -310,7 +309,17 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
310 */ 309 */
311 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); 310 J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
312 311
313 new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); 312retry_alloc:
313 new_bh = alloc_buffer_head(GFP_NOFS);
314 if (!new_bh) {
315 /*
316 * Failure is not an option, but __GFP_NOFAIL is going
317 * away; so we retry ourselves here.
318 */
319 congestion_wait(BLK_RW_ASYNC, HZ/50);
320 goto retry_alloc;
321 }
322
314 /* keep subsequent assertions sane */ 323 /* keep subsequent assertions sane */
315 new_bh->b_state = 0; 324 new_bh->b_state = 0;
316 init_buffer(new_bh, NULL, NULL); 325 init_buffer(new_bh, NULL, NULL);
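
The retry_alloc loop above is the open-coded substitute for __GFP_NOFAIL: on failure, back off until block-device congestion eases, then try again indefinitely, because this caller cannot tolerate allocation failure. Isolated as a sketch:

/* Allocate a buffer_head that "cannot" fail, without __GFP_NOFAIL. */
static struct buffer_head *demo_alloc_bh_nofail(void)
{
        struct buffer_head *bh;

        while ((bh = alloc_buffer_head(GFP_NOFS)) == NULL)
                congestion_wait(BLK_RW_ASYNC, HZ / 50);  /* ~20ms backoff */

        return bh;
}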
@@ -442,7 +451,7 @@ int __jbd2_log_space_left(journal_t *journal)
442{ 451{
443 int left = journal->j_free; 452 int left = journal->j_free;
444 453
445 assert_spin_locked(&journal->j_state_lock); 454 /* assert_spin_locked(&journal->j_state_lock); */
446 455
447 /* 456 /*
448 * Be pessimistic here about the number of those free blocks which 457 * Be pessimistic here about the number of those free blocks which
@@ -487,9 +496,9 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid)
487{ 496{
488 int ret; 497 int ret;
489 498
490 spin_lock(&journal->j_state_lock); 499 write_lock(&journal->j_state_lock);
491 ret = __jbd2_log_start_commit(journal, tid); 500 ret = __jbd2_log_start_commit(journal, tid);
492 spin_unlock(&journal->j_state_lock); 501 write_unlock(&journal->j_state_lock);
493 return ret; 502 return ret;
494} 503}
495 504
@@ -508,7 +517,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
508 transaction_t *transaction = NULL; 517 transaction_t *transaction = NULL;
509 tid_t tid; 518 tid_t tid;
510 519
511 spin_lock(&journal->j_state_lock); 520 read_lock(&journal->j_state_lock);
512 if (journal->j_running_transaction && !current->journal_info) { 521 if (journal->j_running_transaction && !current->journal_info) {
513 transaction = journal->j_running_transaction; 522 transaction = journal->j_running_transaction;
514 __jbd2_log_start_commit(journal, transaction->t_tid); 523 __jbd2_log_start_commit(journal, transaction->t_tid);
@@ -516,12 +525,12 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
516 transaction = journal->j_committing_transaction; 525 transaction = journal->j_committing_transaction;
517 526
518 if (!transaction) { 527 if (!transaction) {
519 spin_unlock(&journal->j_state_lock); 528 read_unlock(&journal->j_state_lock);
520 return 0; /* Nothing to retry */ 529 return 0; /* Nothing to retry */
521 } 530 }
522 531
523 tid = transaction->t_tid; 532 tid = transaction->t_tid;
524 spin_unlock(&journal->j_state_lock); 533 read_unlock(&journal->j_state_lock);
525 jbd2_log_wait_commit(journal, tid); 534 jbd2_log_wait_commit(journal, tid);
526 return 1; 535 return 1;
527} 536}
@@ -535,7 +544,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
535{ 544{
536 int ret = 0; 545 int ret = 0;
537 546
538 spin_lock(&journal->j_state_lock); 547 write_lock(&journal->j_state_lock);
539 if (journal->j_running_transaction) { 548 if (journal->j_running_transaction) {
540 tid_t tid = journal->j_running_transaction->t_tid; 549 tid_t tid = journal->j_running_transaction->t_tid;
541 550
@@ -554,7 +563,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
554 *ptid = journal->j_committing_transaction->t_tid; 563 *ptid = journal->j_committing_transaction->t_tid;
555 ret = 1; 564 ret = 1;
556 } 565 }
557 spin_unlock(&journal->j_state_lock); 566 write_unlock(&journal->j_state_lock);
558 return ret; 567 return ret;
559} 568}
560 569
@@ -566,26 +575,24 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
566{ 575{
567 int err = 0; 576 int err = 0;
568 577
578 read_lock(&journal->j_state_lock);
569#ifdef CONFIG_JBD2_DEBUG 579#ifdef CONFIG_JBD2_DEBUG
570 spin_lock(&journal->j_state_lock);
571 if (!tid_geq(journal->j_commit_request, tid)) { 580 if (!tid_geq(journal->j_commit_request, tid)) {
572 printk(KERN_EMERG 581 printk(KERN_EMERG
573 "%s: error: j_commit_request=%d, tid=%d\n", 582 "%s: error: j_commit_request=%d, tid=%d\n",
574 __func__, journal->j_commit_request, tid); 583 __func__, journal->j_commit_request, tid);
575 } 584 }
576 spin_unlock(&journal->j_state_lock);
577#endif 585#endif
578 spin_lock(&journal->j_state_lock);
579 while (tid_gt(tid, journal->j_commit_sequence)) { 586 while (tid_gt(tid, journal->j_commit_sequence)) {
580 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", 587 jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
581 tid, journal->j_commit_sequence); 588 tid, journal->j_commit_sequence);
582 wake_up(&journal->j_wait_commit); 589 wake_up(&journal->j_wait_commit);
583 spin_unlock(&journal->j_state_lock); 590 read_unlock(&journal->j_state_lock);
584 wait_event(journal->j_wait_done_commit, 591 wait_event(journal->j_wait_done_commit,
585 !tid_gt(tid, journal->j_commit_sequence)); 592 !tid_gt(tid, journal->j_commit_sequence));
586 spin_lock(&journal->j_state_lock); 593 read_lock(&journal->j_state_lock);
 	}
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 
 	if (unlikely(is_journal_aborted(journal))) {
 		printk(KERN_EMERG "journal commit I/O error\n");
@@ -602,7 +609,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
 {
 	unsigned long blocknr;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	J_ASSERT(journal->j_free > 1);
 
 	blocknr = journal->j_head;
@@ -610,7 +617,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
 	journal->j_free--;
 	if (journal->j_head == journal->j_last)
 		journal->j_head = journal->j_first;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	return jbd2_journal_bmap(journal, blocknr, retp);
 }
 
@@ -830,7 +837,7 @@ static journal_t * journal_init_common (void)
 	mutex_init(&journal->j_checkpoint_mutex);
 	spin_lock_init(&journal->j_revoke_lock);
 	spin_lock_init(&journal->j_list_lock);
-	spin_lock_init(&journal->j_state_lock);
+	rwlock_init(&journal->j_state_lock);
 
 	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
 	journal->j_min_batch_time = 0;
@@ -1096,14 +1103,14 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
 		set_buffer_uptodate(bh);
 	}
 
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 	jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
 		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
 
 	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
 	sb->s_start    = cpu_to_be32(journal->j_tail);
 	sb->s_errno    = cpu_to_be32(journal->j_errno);
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 
 	BUFFER_TRACE(bh, "marking dirty");
 	mark_buffer_dirty(bh);
@@ -1124,12 +1131,12 @@ out:
 	 * any future commit will have to be careful to update the
 	 * superblock again to re-record the true start of the log. */
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	if (sb->s_start)
 		journal->j_flags &= ~JBD2_FLUSHED;
 	else
 		journal->j_flags |= JBD2_FLUSHED;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 /*
@@ -1391,13 +1398,9 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
 int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
 				      unsigned long ro, unsigned long incompat)
 {
-	journal_superblock_t *sb;
-
 	if (!compat && !ro && !incompat)
 		return 1;
 
-	sb = journal->j_superblock;
-
 	/* We can support any known requested features iff the
 	 * superblock is in version 2.  Otherwise we fail to support any
 	 * extended sb features. */
@@ -1545,7 +1548,7 @@ int jbd2_journal_flush(journal_t *journal)
 	transaction_t *transaction = NULL;
 	unsigned long old_tail;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 
 	/* Force everything buffered to the log... */
 	if (journal->j_running_transaction) {
@@ -1558,10 +1561,10 @@ int jbd2_journal_flush(journal_t *journal)
 	if (transaction) {
 		tid_t tid = transaction->t_tid;
 
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		jbd2_log_wait_commit(journal, tid);
 	} else {
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 	}
 
 	/* ...and flush everything in the log out to disk. */
@@ -1585,12 +1588,12 @@ int jbd2_journal_flush(journal_t *journal)
 	 * the magic code for a fully-recovered superblock.  Any future
 	 * commits of data to the journal will restore the current
 	 * s_start value. */
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	old_tail = journal->j_tail;
 	journal->j_tail = 0;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	jbd2_journal_update_superblock(journal, 1);
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	journal->j_tail = old_tail;
 
 	J_ASSERT(!journal->j_running_transaction);
@@ -1598,7 +1601,7 @@ int jbd2_journal_flush(journal_t *journal)
 	J_ASSERT(!journal->j_checkpoint_transactions);
 	J_ASSERT(journal->j_head == journal->j_tail);
 	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	return 0;
 }
 
@@ -1617,7 +1620,6 @@ int jbd2_journal_flush(journal_t *journal)
 
 int jbd2_journal_wipe(journal_t *journal, int write)
 {
-	journal_superblock_t *sb;
 	int err = 0;
 
 	J_ASSERT (!(journal->j_flags & JBD2_LOADED));
@@ -1626,8 +1628,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
 	if (err)
 		return err;
 
-	sb = journal->j_superblock;
-
 	if (!journal->j_tail)
 		goto no_recovery;
 
@@ -1665,12 +1665,12 @@ void __jbd2_journal_abort_hard(journal_t *journal)
 	printk(KERN_ERR "Aborting journal on device %s.\n",
 	       journal->j_devname);
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	journal->j_flags |= JBD2_ABORT;
 	transaction = journal->j_running_transaction;
 	if (transaction)
 		__jbd2_log_start_commit(journal, transaction->t_tid);
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 /* Soft abort: record the abort error status in the journal superblock,
@@ -1755,12 +1755,12 @@ int jbd2_journal_errno(journal_t *journal)
 {
 	int err;
 
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 	if (journal->j_flags & JBD2_ABORT)
 		err = -EROFS;
 	else
 		err = journal->j_errno;
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 	return err;
 }
 
@@ -1775,12 +1775,12 @@ int jbd2_journal_clear_err(journal_t *journal)
 {
 	int err = 0;
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	if (journal->j_flags & JBD2_ABORT)
 		err = -EROFS;
 	else
 		journal->j_errno = 0;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	return err;
 }
 
@@ -1793,10 +1793,10 @@ int jbd2_journal_clear_err(journal_t *journal)
  */
 void jbd2_journal_ack_err(journal_t *journal)
 {
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	if (journal->j_errno)
 		journal->j_flags |= JBD2_ACK_ERR;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 }
 
 int jbd2_journal_blocks_per_page(struct inode *inode)
@@ -2201,8 +2201,6 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
 void jbd2_journal_release_jbd_inode(journal_t *journal,
 				    struct jbd2_inode *jinode)
 {
-	int writeout = 0;
-
 	if (!journal)
 		return;
 restart:
@@ -2219,9 +2217,6 @@ restart:
 		goto restart;
 	}
 
-	/* Do we need to wait for data writeback? */
-	if (journal->j_committing_transaction == jinode->i_transaction)
-		writeout = 1;
 	if (jinode->i_transaction) {
 		list_del(&jinode->i_list);
 		jinode->i_transaction = NULL;
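The fs/jbd2/journal.c hunks above are a mechanical conversion of j_state_lock from a spinlock to an rwlock_t: paths that only sample journal state (jbd2_journal_errno(), the superblock snapshot in jbd2_journal_update_superblock()) now take read_lock(), while paths that mutate j_flags, j_head or j_tail take write_lock(). A minimal kernel-style sketch of the pattern, using a hypothetical demo_journal struct rather than anything in this patch:

	#include <linux/spinlock.h>	/* rwlock_t, read_lock(), write_lock() */

	struct demo_journal {
		rwlock_t	state_lock;	/* was a spinlock_t before the conversion */
		unsigned long	flags;
		int		errno;
	};

	/* Readers may now run concurrently with one another. */
	static int demo_errno(struct demo_journal *j)
	{
		int err;

		read_lock(&j->state_lock);
		err = j->errno;			/* consistent snapshot */
		read_unlock(&j->state_lock);
		return err;
	}

	/* Writers still exclude everybody, exactly as the spinlock did. */
	static void demo_set_flag(struct demo_journal *j, unsigned long flag)
	{
		write_lock(&j->state_lock);
		j->flags |= flag;
		write_unlock(&j->state_lock);
	}

The payoff is on read-mostly paths such as start_this_handle() in the fs/jbd2/transaction.c hunks below, where many tasks can hold the read side at once.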
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 049281b7cb89..2bc4d5f116f1 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -285,12 +285,10 @@ int jbd2_journal_recover(journal_t *journal)
 int jbd2_journal_skip_recovery(journal_t *journal)
 {
 	int			err;
-	journal_superblock_t *	sb;
 
 	struct recovery_info	info;
 
 	memset (&info, 0, sizeof(info));
-	sb = journal->j_superblock;
 
 	err = do_one_pass(journal, &info, PASS_SCAN);
 
@@ -299,7 +297,8 @@ int jbd2_journal_skip_recovery(journal_t *journal)
 		++journal->j_transaction_sequence;
 	} else {
 #ifdef CONFIG_JBD2_DEBUG
-		int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence);
+		int dropped = info.end_transaction -
+			be32_to_cpu(journal->j_superblock->s_sequence);
 #endif
 		jbd_debug(1,
 			  "JBD: ignoring %d transaction%s from the journal.\n",
@@ -365,11 +364,6 @@ static int do_one_pass(journal_t *journal,
 	int			tag_bytes = journal_tag_bytes(journal);
 	__u32			crc32_sum = ~0; /* Transactional Checksums */
 
-	/* Precompute the maximum metadata descriptors in a descriptor block */
-	int			MAX_BLOCKS_PER_DESC;
-	MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
-			       / tag_bytes);
-
 	/*
 	 * First thing is to establish what we expect to find in the log
 	 * (in terms of transaction IDs), and where (in terms of log
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index b8e0806681bb..d95cc9d0401d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -26,6 +26,8 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/hrtimer.h>
+#include <linux/backing-dev.h>
+#include <linux/module.h>
 
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
 
@@ -53,6 +55,9 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 	transaction->t_tid = journal->j_transaction_sequence++;
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	spin_lock_init(&transaction->t_handle_lock);
+	atomic_set(&transaction->t_updates, 0);
+	atomic_set(&transaction->t_outstanding_credits, 0);
+	atomic_set(&transaction->t_handle_count, 0);
 	INIT_LIST_HEAD(&transaction->t_inode_list);
 	INIT_LIST_HEAD(&transaction->t_private_list);
 
@@ -83,65 +88,75 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
  * transaction's buffer credits.
  */
 
-static int start_this_handle(journal_t *journal, handle_t *handle)
+static int start_this_handle(journal_t *journal, handle_t *handle,
+			     int gfp_mask)
 {
 	transaction_t *transaction;
 	int needed;
 	int nblocks = handle->h_buffer_credits;
 	transaction_t *new_transaction = NULL;
-	int ret = 0;
 	unsigned long ts = jiffies;
 
 	if (nblocks > journal->j_max_transaction_buffers) {
 		printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
 		       current->comm, nblocks,
 		       journal->j_max_transaction_buffers);
-		ret = -ENOSPC;
-		goto out;
+		return -ENOSPC;
 	}
 
 alloc_transaction:
 	if (!journal->j_running_transaction) {
-		new_transaction = kzalloc(sizeof(*new_transaction),
-						GFP_NOFS|__GFP_NOFAIL);
+		new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
 		if (!new_transaction) {
-			ret = -ENOMEM;
-			goto out;
+			/*
+			 * If __GFP_FS is not present, then we may be
+			 * being called from inside the fs writeback
+			 * layer, so we MUST NOT fail. Since
+			 * __GFP_NOFAIL is going away, we will arrange
+			 * to retry the allocation ourselves.
+			 */
+			if ((gfp_mask & __GFP_FS) == 0) {
+				congestion_wait(BLK_RW_ASYNC, HZ/50);
+				goto alloc_transaction;
+			}
+			return -ENOMEM;
 		}
 	}
 
 	jbd_debug(3, "New handle %p going live.\n", handle);
 
-repeat:
-
 	/*
 	 * We need to hold j_state_lock until t_updates has been incremented,
 	 * for proper journal barrier handling
 	 */
-	spin_lock(&journal->j_state_lock);
-repeat_locked:
+repeat:
+	read_lock(&journal->j_state_lock);
 	if (is_journal_aborted(journal) ||
 	    (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
-		spin_unlock(&journal->j_state_lock);
-		ret = -EROFS;
-		goto out;
+		read_unlock(&journal->j_state_lock);
+		kfree(new_transaction);
+		return -EROFS;
 	}
 
 	/* Wait on the journal's transaction barrier if necessary */
 	if (journal->j_barrier_count) {
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		wait_event(journal->j_wait_transaction_locked,
 				journal->j_barrier_count == 0);
 		goto repeat;
 	}
 
 	if (!journal->j_running_transaction) {
-		if (!new_transaction) {
-			spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
+		if (!new_transaction)
 			goto alloc_transaction;
+		write_lock(&journal->j_state_lock);
+		if (!journal->j_running_transaction) {
+			jbd2_get_transaction(journal, new_transaction);
+			new_transaction = NULL;
 		}
-		jbd2_get_transaction(journal, new_transaction);
-		new_transaction = NULL;
+		write_unlock(&journal->j_state_lock);
+		goto repeat;
 	}
 
 	transaction = journal->j_running_transaction;
@@ -155,7 +170,7 @@ repeat_locked:
 
 		prepare_to_wait(&journal->j_wait_transaction_locked,
 					&wait, TASK_UNINTERRUPTIBLE);
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		schedule();
 		finish_wait(&journal->j_wait_transaction_locked, &wait);
 		goto repeat;
@@ -166,8 +181,8 @@ repeat_locked:
 	 * buffers requested by this operation, we need to stall pending a log
 	 * checkpoint to free some more log space.
 	 */
-	spin_lock(&transaction->t_handle_lock);
-	needed = transaction->t_outstanding_credits + nblocks;
+	needed = atomic_add_return(nblocks,
+				   &transaction->t_outstanding_credits);
 
 	if (needed > journal->j_max_transaction_buffers) {
 		/*
@@ -178,11 +193,11 @@ repeat_locked:
 		DEFINE_WAIT(wait);
 
 		jbd_debug(2, "Handle %p starting new commit...\n", handle);
-		spin_unlock(&transaction->t_handle_lock);
+		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 		prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
 				TASK_UNINTERRUPTIBLE);
 		__jbd2_log_start_commit(journal, transaction->t_tid);
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 		schedule();
 		finish_wait(&journal->j_wait_transaction_locked, &wait);
 		goto repeat;
@@ -215,35 +230,48 @@ repeat_locked:
 	 */
 	if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
 		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
-		spin_unlock(&transaction->t_handle_lock);
-		__jbd2_log_wait_for_space(journal);
-		goto repeat_locked;
+		atomic_sub(nblocks, &transaction->t_outstanding_credits);
+		read_unlock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
+		if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
+			__jbd2_log_wait_for_space(journal);
+		write_unlock(&journal->j_state_lock);
+		goto repeat;
 	}
 
 	/* OK, account for the buffers that this operation expects to
-	 * use and add the handle to the running transaction. */
-
-	if (time_after(transaction->t_start, ts)) {
+	 * use and add the handle to the running transaction.
+	 *
+	 * In order for t_max_wait to be reliable, it must be
+	 * protected by a lock. But doing so will mean that
+	 * start_this_handle() can not be run in parallel on SMP
+	 * systems, which limits our scalability. So we only enable
+	 * it when debugging is enabled. We may want to use a
+	 * separate flag, eventually, so we can enable this
+	 * independently of debugging.
+	 */
+#ifdef CONFIG_JBD2_DEBUG
+	if (jbd2_journal_enable_debug &&
+	    time_after(transaction->t_start, ts)) {
 		ts = jbd2_time_diff(ts, transaction->t_start);
+		spin_lock(&transaction->t_handle_lock);
 		if (ts > transaction->t_max_wait)
 			transaction->t_max_wait = ts;
+		spin_unlock(&transaction->t_handle_lock);
 	}
-
+#endif
 	handle->h_transaction = transaction;
-	transaction->t_outstanding_credits += nblocks;
-	transaction->t_updates++;
-	transaction->t_handle_count++;
+	atomic_inc(&transaction->t_updates);
+	atomic_inc(&transaction->t_handle_count);
 	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
-		  handle, nblocks, transaction->t_outstanding_credits,
+		  handle, nblocks,
+		  atomic_read(&transaction->t_outstanding_credits),
 		  __jbd2_log_space_left(journal));
-	spin_unlock(&transaction->t_handle_lock);
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 
 	lock_map_acquire(&handle->h_lockdep_map);
-out:
-	if (unlikely(new_transaction))		/* It's usually NULL */
-		kfree(new_transaction);
-	return ret;
+	kfree(new_transaction);
+	return 0;
 }
 
 static struct lock_class_key jbd2_handle_key;
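The start_this_handle() rewrite above is the core of the change: the credit check that used to happen under t_handle_lock is now a speculative atomic_add_return() that is rolled back with atomic_sub() whenever the transaction turns out to be full or short of log space. Reduced to the reserve-then-back-out idiom (the demo_ name is illustrative, not from this patch):

	static int demo_reserve_credits(atomic_t *outstanding, int nblocks, int max)
	{
		/* Optimistically claim the credits, under no lock at all... */
		if (atomic_add_return(nblocks, outstanding) > max) {
			/* ...and hand them back if the claim overshot. */
			atomic_sub(nblocks, outstanding);
			return -EAGAIN;	/* caller sleeps and retries, as "goto repeat" does */
		}
		return 0;
	}

Because the add and the test are one atomic operation, two racing handles can never both conclude that the last credits were theirs.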
@@ -278,7 +306,7 @@ static handle_t *new_handle(int nblocks)
  *
  * Return a pointer to a newly allocated handle, or NULL on failure
  */
-handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
+handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
 {
 	handle_t *handle = journal_current_handle();
 	int err;
@@ -298,7 +326,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 
 	current->journal_info = handle;
 
-	err = start_this_handle(journal, handle);
+	err = start_this_handle(journal, handle, gfp_mask);
 	if (err < 0) {
 		jbd2_free_handle(handle);
 		current->journal_info = NULL;
@@ -308,6 +336,15 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 out:
 	return handle;
 }
+EXPORT_SYMBOL(jbd2__journal_start);
+
+
+handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
+{
+	return jbd2__journal_start(journal, nblocks, GFP_NOFS);
+}
+EXPORT_SYMBOL(jbd2_journal_start);
+
 
 /**
  * int jbd2_journal_extend() - extend buffer credits.
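jbd2__journal_start() exposes the allocation mask that start_this_handle() uses for the new transaction_t, while jbd2_journal_start() keeps the old implicit GFP_NOFS. A caller that must not recurse into the block layer could presumably use it like this (the surrounding error handling is an assumption, not code from this patch):

	handle_t *handle;

	/* GFP_NOIO: this path may be entered from the block I/O layer. */
	handle = jbd2__journal_start(journal, nblocks, GFP_NOIO);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

Note the retry logic in start_this_handle(): when the mask lacks __GFP_FS, a failed kzalloc() is retried after congestion_wait() rather than failing, standing in for the __GFP_NOFAIL flag the old code relied on.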
@@ -342,7 +379,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
 
 	result = 1;
 
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 
 	/* Don't extend a locked-down transaction! */
 	if (handle->h_transaction->t_state != T_RUNNING) {
@@ -352,7 +389,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
 	}
 
 	spin_lock(&transaction->t_handle_lock);
-	wanted = transaction->t_outstanding_credits + nblocks;
+	wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
 
 	if (wanted > journal->j_max_transaction_buffers) {
 		jbd_debug(3, "denied handle %p %d blocks: "
@@ -367,14 +404,14 @@ int jbd2_journal_extend(handle_t *handle, int nblocks)
 	}
 
 	handle->h_buffer_credits += nblocks;
-	transaction->t_outstanding_credits += nblocks;
+	atomic_add(nblocks, &transaction->t_outstanding_credits);
 	result = 0;
 
 	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
 unlock:
 	spin_unlock(&transaction->t_handle_lock);
 error_out:
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 out:
 	return result;
 }
@@ -394,8 +431,7 @@ out:
  * transaction capabable of guaranteeing the requested number of
  * credits.
  */
-
-int jbd2_journal_restart(handle_t *handle, int nblocks)
+int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
@@ -410,29 +446,35 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
 	 * First unlink the handle from its current transaction, and start the
 	 * commit on that.
 	 */
-	J_ASSERT(transaction->t_updates > 0);
+	J_ASSERT(atomic_read(&transaction->t_updates) > 0);
 	J_ASSERT(journal_current_handle() == handle);
 
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 	spin_lock(&transaction->t_handle_lock);
-	transaction->t_outstanding_credits -= handle->h_buffer_credits;
-	transaction->t_updates--;
-
-	if (!transaction->t_updates)
+	atomic_sub(handle->h_buffer_credits,
+		   &transaction->t_outstanding_credits);
+	if (atomic_dec_and_test(&transaction->t_updates))
 		wake_up(&journal->j_wait_updates);
 	spin_unlock(&transaction->t_handle_lock);
 
 	jbd_debug(2, "restarting handle %p\n", handle);
 	__jbd2_log_start_commit(journal, transaction->t_tid);
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 
 	lock_map_release(&handle->h_lockdep_map);
 	handle->h_buffer_credits = nblocks;
-	ret = start_this_handle(journal, handle);
+	ret = start_this_handle(journal, handle, gfp_mask);
 	return ret;
 }
+EXPORT_SYMBOL(jbd2__journal_restart);
 
 
+int jbd2_journal_restart(handle_t *handle, int nblocks)
+{
+	return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
+}
+EXPORT_SYMBOL(jbd2_journal_restart);
+
 /**
  * void jbd2_journal_lock_updates () - establish a transaction barrier.
  * @journal:  Journal to establish a barrier on.
@@ -447,7 +489,7 @@ void jbd2_journal_lock_updates(journal_t *journal)
 {
 	DEFINE_WAIT(wait);
 
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	++journal->j_barrier_count;
 
 	/* Wait until there are no running updates */
@@ -458,19 +500,19 @@ void jbd2_journal_lock_updates(journal_t *journal)
 			break;
 
 		spin_lock(&transaction->t_handle_lock);
-		if (!transaction->t_updates) {
+		if (!atomic_read(&transaction->t_updates)) {
 			spin_unlock(&transaction->t_handle_lock);
 			break;
 		}
 		prepare_to_wait(&journal->j_wait_updates, &wait,
 				TASK_UNINTERRUPTIBLE);
 		spin_unlock(&transaction->t_handle_lock);
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		schedule();
 		finish_wait(&journal->j_wait_updates, &wait);
-		spin_lock(&journal->j_state_lock);
+		write_lock(&journal->j_state_lock);
 	}
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 
 	/*
 	 * We have now established a barrier against other normal updates, but
@@ -494,9 +536,9 @@ void jbd2_journal_unlock_updates (journal_t *journal)
 	J_ASSERT(journal->j_barrier_count != 0);
 
 	mutex_unlock(&journal->j_barrier);
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	--journal->j_barrier_count;
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 	wake_up(&journal->j_wait_transaction_locked);
 }
 
@@ -1238,7 +1280,8 @@ int jbd2_journal_stop(handle_t *handle)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
-	int err;
+	int err, wait_for_commit = 0;
+	tid_t tid;
 	pid_t pid;
 
 	J_ASSERT(journal_current_handle() == handle);
@@ -1246,7 +1289,7 @@ int jbd2_journal_stop(handle_t *handle)
 	if (is_handle_aborted(handle))
 		err = -EIO;
 	else {
-		J_ASSERT(transaction->t_updates > 0);
+		J_ASSERT(atomic_read(&transaction->t_updates) > 0);
 		err = 0;
 	}
 
@@ -1291,9 +1334,9 @@ int jbd2_journal_stop(handle_t *handle)
 
 		journal->j_last_sync_writer = pid;
 
-		spin_lock(&journal->j_state_lock);
+		read_lock(&journal->j_state_lock);
 		commit_time = journal->j_average_commit_time;
-		spin_unlock(&journal->j_state_lock);
+		read_unlock(&journal->j_state_lock);
 
 		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
 						   transaction->t_start_time));
@@ -1314,14 +1357,8 @@ int jbd2_journal_stop(handle_t *handle)
 	if (handle->h_sync)
 		transaction->t_synchronous_commit = 1;
 	current->journal_info = NULL;
-	spin_lock(&transaction->t_handle_lock);
-	transaction->t_outstanding_credits -= handle->h_buffer_credits;
-	transaction->t_updates--;
-	if (!transaction->t_updates) {
-		wake_up(&journal->j_wait_updates);
-		if (journal->j_barrier_count)
-			wake_up(&journal->j_wait_transaction_locked);
-	}
+	atomic_sub(handle->h_buffer_credits,
+		   &transaction->t_outstanding_credits);
 
 	/*
 	 * If the handle is marked SYNC, we need to set another commit
@@ -1330,15 +1367,13 @@ int jbd2_journal_stop(handle_t *handle)
 	 * transaction is too old now.
 	 */
 	if (handle->h_sync ||
-			transaction->t_outstanding_credits >
-				journal->j_max_transaction_buffers ||
+	    (atomic_read(&transaction->t_outstanding_credits) >
+	     journal->j_max_transaction_buffers) ||
 	    time_after_eq(jiffies, transaction->t_expires)) {
 		/* Do this even for aborted journals: an abort still
 		 * completes the commit thread, it just doesn't write
 		 * anything to disk. */
-		tid_t tid = transaction->t_tid;
 
-		spin_unlock(&transaction->t_handle_lock);
 		jbd_debug(2, "transaction too old, requesting commit for "
 					"handle %p\n", handle);
 		/* This is non-blocking */
@@ -1349,11 +1384,25 @@ int jbd2_journal_stop(handle_t *handle)
 	 * to wait for the commit to complete.
 	 */
 		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
-			err = jbd2_log_wait_commit(journal, tid);
-	} else {
-		spin_unlock(&transaction->t_handle_lock);
+			wait_for_commit = 1;
 	}
 
+	/*
+	 * Once we drop t_updates, if it goes to zero the transaction
+	 * could start commiting on us and eventually disappear.  So
+	 * once we do this, we must not dereference transaction
+	 * pointer again.
+	 */
+	tid = transaction->t_tid;
+	if (atomic_dec_and_test(&transaction->t_updates)) {
+		wake_up(&journal->j_wait_updates);
+		if (journal->j_barrier_count)
+			wake_up(&journal->j_wait_transaction_locked);
+	}
+
+	if (wait_for_commit)
+		err = jbd2_log_wait_commit(journal, tid);
+
 	lock_map_release(&handle->h_lockdep_map);
 
 	jbd2_free_handle(handle);
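The comment added above is the subtle invariant of the new jbd2_journal_stop(): the moment atomic_dec_and_test(&transaction->t_updates) drops the last update, the commit thread may commit and free the transaction. The function therefore snapshots the tid first and touches only the local copy afterwards. The skeleton of that ordering, as a sketch:

	tid = transaction->t_tid;	/* snapshot while our update still pins it */
	if (atomic_dec_and_test(&transaction->t_updates))
		wake_up(&journal->j_wait_updates);
	/* 'transaction' may already be freed here; use only 'tid'. */
	if (wait_for_commit)
		err = jbd2_log_wait_commit(journal, tid);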
@@ -1719,7 +1768,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 		goto zap_buffer_unlocked;
 
 	/* OK, we have data buffer in journaled mode */
-	spin_lock(&journal->j_state_lock);
+	write_lock(&journal->j_state_lock);
 	jbd_lock_bh_state(bh);
 	spin_lock(&journal->j_list_lock);
 
@@ -1772,7 +1821,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 			jbd2_journal_put_journal_head(jh);
 			spin_unlock(&journal->j_list_lock);
 			jbd_unlock_bh_state(bh);
-			spin_unlock(&journal->j_state_lock);
+			write_unlock(&journal->j_state_lock);
 			return ret;
 		} else {
 			/* There is no currently-running transaction. So the
@@ -1786,7 +1835,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 				jbd2_journal_put_journal_head(jh);
 				spin_unlock(&journal->j_list_lock);
 				jbd_unlock_bh_state(bh);
-				spin_unlock(&journal->j_state_lock);
+				write_unlock(&journal->j_state_lock);
 				return ret;
 			} else {
 				/* The orphan record's transaction has
@@ -1810,7 +1859,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
 		jbd2_journal_put_journal_head(jh);
 		spin_unlock(&journal->j_list_lock);
 		jbd_unlock_bh_state(bh);
-		spin_unlock(&journal->j_state_lock);
+		write_unlock(&journal->j_state_lock);
 		return 0;
 	} else {
 		/* Good, the buffer belongs to the running transaction.
@@ -1829,7 +1878,7 @@ zap_buffer:
 zap_buffer_no_jh:
 	spin_unlock(&journal->j_list_lock);
 	jbd_unlock_bh_state(bh);
-	spin_unlock(&journal->j_state_lock);
+	write_unlock(&journal->j_state_lock);
 zap_buffer_unlocked:
 	clear_buffer_dirty(bh);
 	J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@ -2136,9 +2185,9 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal,
 	/* Locks are here just to force reading of recent values, it is
 	 * enough that the transaction was not committing before we started
 	 * a transaction adding the inode to orphan list */
-	spin_lock(&journal->j_state_lock);
+	read_lock(&journal->j_state_lock);
 	commit_trans = journal->j_committing_transaction;
-	spin_unlock(&journal->j_state_lock);
+	read_unlock(&journal->j_state_lock);
 	spin_lock(&journal->j_list_lock);
 	inode_trans = jinode->i_transaction;
 	spin_unlock(&journal->j_list_lock);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index a43d07e7b924..cc1bb33b59b8 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -61,8 +61,8 @@ config NFS_V3_ACL
 	  If unsure, say N.
 
 config NFS_V4
-	bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
-	depends on NFS_FS && EXPERIMENTAL
+	bool "NFS client support for NFS version 4"
+	depends on NFS_FS
 	select RPCSEC_GSS_KRB5
 	help
 	  This option enables support for version 4 of the NFS protocol
@@ -72,16 +72,16 @@ config NFS_V4
 	  space programs which can be found in the Linux nfs-utils package,
 	  available from http://linux-nfs.org/.
 
-	  If unsure, say N.
+	  If unsure, say Y.
 
 config NFS_V4_1
-	bool "NFS client support for NFSv4.1 (DEVELOPER ONLY)"
+	bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
 	depends on NFS_V4 && EXPERIMENTAL
 	help
 	  This option enables support for minor version 1 of the NFSv4 protocol
 	  (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client.
 
-	  Unless you're an NFS developer, say N.
+	  If unsure, say N.
 
 config ROOT_NFS
 	bool "Root file system on NFS"
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index a08770a7e857..930d10fecdaf 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -37,8 +37,8 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
 	if (inode == NULL)
 		goto out_putclient;
 	nfsi = NFS_I(inode);
-	down_read(&nfsi->rwsem);
-	delegation = nfsi->delegation;
+	rcu_read_lock();
+	delegation = rcu_dereference(nfsi->delegation);
 	if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0)
 		goto out_iput;
 	res->size = i_size_read(inode);
@@ -53,7 +53,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres *
 			args->bitmap[1];
 	res->status = 0;
 out_iput:
-	up_read(&nfsi->rwsem);
+	rcu_read_unlock();
 	iput(inode);
 out_putclient:
 	nfs_put_client(clp);
@@ -62,16 +62,6 @@ out:
 	return res->status;
 }
 
-static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *)
-{
-#if defined(CONFIG_NFS_V4_1)
-	if (clp->cl_minorversion > 0)
-		return nfs41_validate_delegation_stateid;
-#endif
-	return nfs4_validate_delegation_stateid;
-}
-
-
 __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 {
 	struct nfs_client *clp;
@@ -92,8 +82,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
 	inode = nfs_delegation_find_inode(clp, &args->fh);
 	if (inode != NULL) {
 		/* Set up a helper thread to actually return the delegation */
-		switch (nfs_async_inode_return_delegation(inode, &args->stateid,
-						nfs_validate_delegation_stateid(clp))) {
+		switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
 			case 0:
 				res = 0;
 				break;
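The callback_proc.c hunks replace the nfsi->rwsem reader with an RCU read-side section: the delegation pointer is RCU-published elsewhere in the NFS client, so a callback that only inspects it never blocks a task that is attaching or detaching the delegation. The shape of the read side, condensed from the hunk above:

	rcu_read_lock();
	delegation = rcu_dereference(nfsi->delegation);
	if (delegation != NULL && (delegation->type & FMODE_WRITE))
		res->size = i_size_read(inode);	/* read-only use while protected */
	rcu_read_unlock();
	/* 'delegation' must not be dereferenced past this point */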
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d25b5257b7a1..4e7df2adb212 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -150,6 +150,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	clp->cl_boot_time = CURRENT_TIME;
 	clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
 	clp->cl_minorversion = cl_init->minorversion;
+	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
 #endif
 	cred = rpc_lookup_machine_cred();
 	if (!IS_ERR(cred))
@@ -178,7 +179,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
 		clp->cl_session = NULL;
 	}
 
-	clp->cl_call_sync = _nfs4_call_sync;
+	clp->cl_mvops = nfs_v4_minor_ops[0];
 #endif /* CONFIG_NFS_V4_1 */
 }
 
@@ -188,7 +189,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp)
 static void nfs4_destroy_callback(struct nfs_client *clp)
 {
 	if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state))
-		nfs_callback_down(clp->cl_minorversion);
+		nfs_callback_down(clp->cl_mvops->minor_version);
 }
 
 static void nfs4_shutdown_client(struct nfs_client *clp)
@@ -1126,7 +1127,7 @@ static int nfs4_init_callback(struct nfs_client *clp)
 			return error;
 		}
 
-		error = nfs_callback_up(clp->cl_minorversion,
+		error = nfs_callback_up(clp->cl_mvops->minor_version,
 					clp->cl_rpcclient->cl_xprt);
 		if (error < 0) {
 			dprintk("%s: failed to start callback. Error = %d\n",
@@ -1143,10 +1144,8 @@ static int nfs4_init_callback(struct nfs_client *clp)
  */
 static int nfs4_init_client_minor_version(struct nfs_client *clp)
 {
-	clp->cl_call_sync = _nfs4_call_sync;
-
 #if defined(CONFIG_NFS_V4_1)
-	if (clp->cl_minorversion) {
+	if (clp->cl_mvops->minor_version) {
 		struct nfs4_session *session = NULL;
 		/*
 		 * Create the session and mark it expired.
@@ -1158,7 +1157,13 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
 			return -ENOMEM;
 
 		clp->cl_session = session;
-		clp->cl_call_sync = _nfs4_call_sync_session;
+		/*
+		 * The create session reply races with the server back
+		 * channel probe. Mark the client NFS_CS_SESSION_INITING
+		 * so that the client back channel can find the
+		 * nfs_client struct
+		 */
+		clp->cl_cons_state = NFS_CS_SESSION_INITING;
 	}
 #endif /* CONFIG_NFS_V4_1 */
 
@@ -1454,7 +1459,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
 				data->authflavor,
 				parent_server->client->cl_xprt->prot,
 				parent_server->client->cl_timeout,
-				parent_client->cl_minorversion);
+				parent_client->cl_mvops->minor_version);
 	if (error < 0)
 		goto error;
 
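client.c now routes every minor-version-specific decision through clp->cl_mvops, a per-minor-version ops table that replaces scattered fields such as cl_call_sync and the validate_stateid callback that used to be threaded through nfs_async_inode_return_delegation() (see delegation.c below). Judging only from the call sites visible in this diff, the table has at least this shape; the field list is an inference, not the authoritative definition:

	struct nfs4_minor_version_ops {
		u32	minor_version;
		int	(*validate_stateid)(struct nfs_delegation *delegation,
					    const nfs4_stateid *stateid);
		/* presumably also the sync-call hook and session setup ops */
	};

	/* indexed by the mount's minor version, e.g. nfs_v4_minor_ops[0] for NFSv4.0 */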
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 301634543974..b9c3c43cea1d 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -268,14 +268,6 @@ out:
 	return status;
 }
 
-/* Sync all data to disk upon delegation return */
-static void nfs_msync_inode(struct inode *inode)
-{
-	filemap_fdatawrite(inode->i_mapping);
-	nfs_wb_all(inode);
-	filemap_fdatawait(inode->i_mapping);
-}
-
 /*
  * Basic procedure for returning a delegation to the server
  */
@@ -367,7 +359,7 @@ int nfs_inode_return_delegation(struct inode *inode)
 		delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
 		spin_unlock(&clp->cl_lock);
 		if (delegation != NULL) {
-			nfs_msync_inode(inode);
+			nfs_wb_all(inode);
 			err = __nfs_inode_return_delegation(inode, delegation, 1);
 		}
 	}
@@ -471,9 +463,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
 /*
  * Asynchronous delegation recall!
  */
-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid,
-				      int (*validate_stateid)(struct nfs_delegation *delegation,
-							      const nfs4_stateid *stateid))
+int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
 {
 	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_delegation *delegation;
@@ -481,7 +471,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
 
-	if (!validate_stateid(delegation, stateid)) {
+	if (!clp->cl_mvops->validate_stateid(delegation, stateid)) {
 		rcu_read_unlock();
 		return -ENOENT;
 	}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 69e7b8140122..2026304bda19 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -34,9 +34,7 @@ enum {
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
 int nfs_inode_return_delegation(struct inode *inode);
-int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid,
-				      int (*validate_stateid)(struct nfs_delegation *delegation,
-							      const nfs4_stateid *stateid));
+int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
 void nfs_inode_return_delegation_noreclaim(struct inode *inode);
 
 struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 832e9e239324..29539ceeb745 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1652,16 +1652,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		}
 	}
 
-	/*
-	 * ... prune child dentries and writebacks if needed.
-	 */
-	if (atomic_read(&old_dentry->d_count) > 1) {
-		if (S_ISREG(old_inode->i_mode))
-			nfs_wb_all(old_inode);
-		shrink_dcache_parent(old_dentry);
-	}
 	nfs_inode_return_delegation(old_inode);
-
 	if (new_inode != NULL)
 		nfs_inode_return_delegation(new_inode);
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index ad4cd31d6050..064a80961677 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -69,6 +69,7 @@ struct nfs_direct_req {
 
 	/* I/O parameters */
 	struct nfs_open_context	*ctx;		/* file open context info */
+	struct nfs_lock_context	*l_ctx;		/* Lock context info */
 	struct kiocb *		iocb;		/* controlling i/o request */
 	struct inode *		inode;		/* target file of i/o */
 
@@ -160,6 +161,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
 	INIT_LIST_HEAD(&dreq->rewrite_list);
 	dreq->iocb = NULL;
 	dreq->ctx = NULL;
+	dreq->l_ctx = NULL;
 	spin_lock_init(&dreq->lock);
 	atomic_set(&dreq->io_count, 0);
 	dreq->count = 0;
@@ -173,6 +175,8 @@ static void nfs_direct_req_free(struct kref *kref)
 {
 	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
 
+	if (dreq->l_ctx != NULL)
+		nfs_put_lock_context(dreq->l_ctx);
 	if (dreq->ctx != NULL)
 		put_nfs_open_context(dreq->ctx);
 	kmem_cache_free(nfs_direct_cachep, dreq);
@@ -336,6 +340,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
 		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
+		data->args.lock_context = dreq->l_ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
 		data->args.pages = data->pagevec;
@@ -416,24 +421,28 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
 			       unsigned long nr_segs, loff_t pos)
 {
-	ssize_t result = 0;
+	ssize_t result = -ENOMEM;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct nfs_direct_req *dreq;
 
 	dreq = nfs_direct_req_alloc();
-	if (!dreq)
-		return -ENOMEM;
+	if (dreq == NULL)
+		goto out;
 
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
+	if (dreq->l_ctx == NULL)
+		goto out_release;
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
 	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
 	if (!result)
 		result = nfs_direct_wait(dreq);
+out_release:
 	nfs_direct_req_release(dreq);
-
+out:
 	return result;
 }
 
@@ -574,6 +583,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	data->args.offset = 0;
 	data->args.count = 0;
 	data->args.context = dreq->ctx;
+	data->args.lock_context = dreq->l_ctx;
 	data->res.count = 0;
 	data->res.fattr = &data->fattr;
 	data->res.verf = &data->verf;
@@ -761,6 +771,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
 		data->cred = msg.rpc_cred;
 		data->args.fh = NFS_FH(inode);
 		data->args.context = ctx;
+		data->args.lock_context = dreq->l_ctx;
 		data->args.offset = pos;
 		data->args.pgbase = pgbase;
 		data->args.pages = data->pagevec;
@@ -845,7 +856,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 			     unsigned long nr_segs, loff_t pos,
 			     size_t count)
 {
-	ssize_t result = 0;
+	ssize_t result = -ENOMEM;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	struct nfs_direct_req *dreq;
 	size_t wsize = NFS_SERVER(inode)->wsize;
@@ -853,7 +864,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 
 	dreq = nfs_direct_req_alloc();
 	if (!dreq)
-		return -ENOMEM;
+		goto out;
 	nfs_alloc_commit_data(dreq);
 
 	if (dreq->commit_data == NULL || count < wsize)
@@ -861,14 +872,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 
 	dreq->inode = inode;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
+	if (dreq->l_ctx != NULL)
+		goto out_release;
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
 	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
 	if (!result)
 		result = nfs_direct_wait(dreq);
+out_release:
 	nfs_direct_req_release(dreq);
-
+out:
 	return result;
 }
 
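Taken together, the direct.c hunks give every direct-I/O request a struct nfs_lock_context with a simple ownership rule: one reference is taken when the dreq is built, each read/write/commit RPC borrows it through args.lock_context, and the reference is dropped when the dreq is freed. In outline (a sketch condensed from the hunks, with the error paths elided):

	/* setup, once per dreq: */
	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);		/* one ref per dreq */
	/* per RPC: */
	data->args.lock_context = dreq->l_ctx;			/* borrowed, not ref'd */
	/* teardown, in nfs_direct_req_free(): */
	if (dreq->l_ctx != NULL)
		nfs_put_lock_context(dreq->l_ctx);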
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f036153d9f50..2d141a74ae82 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -203,37 +203,11 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
203} 203}
204 204
205/* 205/*
206 * Helper for nfs_file_flush() and nfs_file_fsync()
207 *
208 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
209 * disk, but it retrieves and clears ctx->error after synching, despite
210 * the two being set at the same time in nfs_context_set_write_error().
211 * This is because the former is used to notify the _next_ call to
212 * nfs_file_write() that a write error occured, and hence cause it to
213 * fall back to doing a synchronous write.
214 */
215static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
216{
217 int have_error, status;
218 int ret = 0;
219
220 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
221 status = nfs_wb_all(inode);
222 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
223 if (have_error)
224 ret = xchg(&ctx->error, 0);
225 if (!ret)
226 ret = status;
227 return ret;
228}
229
230/*
231 * Flush all dirty pages, and check for write errors. 206 * Flush all dirty pages, and check for write errors.
232 */ 207 */
233static int 208static int
234nfs_file_flush(struct file *file, fl_owner_t id) 209nfs_file_flush(struct file *file, fl_owner_t id)
235{ 210{
236 struct nfs_open_context *ctx = nfs_file_open_context(file);
237 struct dentry *dentry = file->f_path.dentry; 211 struct dentry *dentry = file->f_path.dentry;
238 struct inode *inode = dentry->d_inode; 212 struct inode *inode = dentry->d_inode;
239 213
@@ -246,7 +220,7 @@ nfs_file_flush(struct file *file, fl_owner_t id)
246 return 0; 220 return 0;
247 221
248 /* Flush writes to the server and return any errors */ 222 /* Flush writes to the server and return any errors */
249 return nfs_do_fsync(ctx, inode); 223 return vfs_fsync(file, 0);
250} 224}
251 225
252static ssize_t 226static ssize_t
@@ -321,6 +295,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
321 * Flush any dirty pages for this process, and check for write errors. 295 * Flush any dirty pages for this process, and check for write errors.
322 * The return status from this call provides a reliable indication of 296 * The return status from this call provides a reliable indication of
323 * whether any write errors occurred for this process. 297 * whether any write errors occurred for this process.
298 *
299 * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
300 * disk, but it retrieves and clears ctx->error after synching, despite
301 * the two being set at the same time in nfs_context_set_write_error().
302 * This is because the former is used to notify the _next_ call to
 303 * nfs_file_write() that a write error occurred, and hence cause it to
304 * fall back to doing a synchronous write.
324 */ 305 */
325static int 306static int
326nfs_file_fsync(struct file *file, int datasync) 307nfs_file_fsync(struct file *file, int datasync)
@@ -328,13 +309,23 @@ nfs_file_fsync(struct file *file, int datasync)
328 struct dentry *dentry = file->f_path.dentry; 309 struct dentry *dentry = file->f_path.dentry;
329 struct nfs_open_context *ctx = nfs_file_open_context(file); 310 struct nfs_open_context *ctx = nfs_file_open_context(file);
330 struct inode *inode = dentry->d_inode; 311 struct inode *inode = dentry->d_inode;
312 int have_error, status;
313 int ret = 0;
314
331 315
332 dprintk("NFS: fsync file(%s/%s) datasync %d\n", 316 dprintk("NFS: fsync file(%s/%s) datasync %d\n",
333 dentry->d_parent->d_name.name, dentry->d_name.name, 317 dentry->d_parent->d_name.name, dentry->d_name.name,
334 datasync); 318 datasync);
335 319
336 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 320 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
337 return nfs_do_fsync(ctx, inode); 321 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
322 status = nfs_commit_inode(inode, FLUSH_SYNC);
323 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
324 if (have_error)
325 ret = xchg(&ctx->error, 0);
326 if (!ret)
327 ret = status;
328 return ret;
338} 329}
339 330
340/* 331/*
@@ -648,7 +639,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
648 639
649 /* Return error values for O_DSYNC and IS_SYNC() */ 640 /* Return error values for O_DSYNC and IS_SYNC() */
650 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { 641 if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) {
651 int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); 642 int err = vfs_fsync(iocb->ki_filp, 0);
652 if (err < 0) 643 if (err < 0)
653 result = err; 644 result = err;
654 } 645 }
@@ -684,7 +675,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
684 written = ret; 675 written = ret;
685 676
686 if (ret >= 0 && nfs_need_sync_write(filp, inode)) { 677 if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
687 int err = nfs_do_fsync(nfs_file_open_context(filp), inode); 678 int err = vfs_fsync(filp, 0);
688 if (err < 0) 679 if (err < 0)
689 ret = err; 680 ret = err;
690 } 681 }
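nfs_file_fsync() now open-codes what nfs_do_fsync() used to do: clear the per-context error flag before flushing, re-test it afterwards, and consume the latched errno with xchg() so each error is reported exactly once. A user-space sketch of that latch using C11 atomics (the struct and helper names are stand-ins, not the kernel API):

#include <stdatomic.h>
#include <stdio.h>

struct ctx {
	atomic_int write_failed;	/* ~ NFS_CONTEXT_ERROR_WRITE */
	atomic_int error;		/* ~ ctx->error, the latched errno */
};

static void record_write_error(struct ctx *c, int err)
{
	atomic_store(&c->error, err);
	atomic_store(&c->write_failed, 1);
}

static int fsync_like(struct ctx *c)
{
	int status = 0;		/* stand-in for the flush/commit result */
	int ret = 0;
	int had_error = atomic_exchange(&c->write_failed, 0);

	/* ... flush dirty data here; failures re-set write_failed ... */
	had_error |= atomic_load(&c->write_failed);
	if (had_error)
		ret = atomic_exchange(&c->error, 0);	/* ~ xchg() */
	if (!ret)
		ret = status;
	return ret;
}

int main(void)
{
	struct ctx c = { 0 };

	record_write_error(&c, 5);	/* a previous async write failed */
	printf("first fsync  -> %d\n", fsync_like(&c));
	printf("second fsync -> %d\n", fsync_like(&c));	/* error consumed */
	return 0;
}

As in the kernel version, the flag is cleared before the flush so an error raised during the flush itself re-sets it and is still caught by the second test.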
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 099b3518feea..581d8f081e68 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -413,10 +413,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
413 return 0; 413 return 0;
414 414
415 /* Write all dirty data */ 415 /* Write all dirty data */
416 if (S_ISREG(inode->i_mode)) { 416 if (S_ISREG(inode->i_mode))
417 filemap_write_and_wait(inode->i_mapping);
418 nfs_wb_all(inode); 417 nfs_wb_all(inode);
419 }
420 418
421 fattr = nfs_alloc_fattr(); 419 fattr = nfs_alloc_fattr();
422 if (fattr == NULL) 420 if (fattr == NULL)
@@ -530,6 +528,68 @@ out:
530 return err; 528 return err;
531} 529}
532 530
531static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
532{
533 atomic_set(&l_ctx->count, 1);
534 l_ctx->lockowner = current->files;
535 l_ctx->pid = current->tgid;
536 INIT_LIST_HEAD(&l_ctx->list);
537}
538
539static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
540{
541 struct nfs_lock_context *pos;
542
543 list_for_each_entry(pos, &ctx->lock_context.list, list) {
544 if (pos->lockowner != current->files)
545 continue;
546 if (pos->pid != current->tgid)
547 continue;
548 atomic_inc(&pos->count);
549 return pos;
550 }
551 return NULL;
552}
553
554struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
555{
556 struct nfs_lock_context *res, *new = NULL;
557 struct inode *inode = ctx->path.dentry->d_inode;
558
559 spin_lock(&inode->i_lock);
560 res = __nfs_find_lock_context(ctx);
561 if (res == NULL) {
562 spin_unlock(&inode->i_lock);
563 new = kmalloc(sizeof(*new), GFP_KERNEL);
564 if (new == NULL)
565 return NULL;
566 nfs_init_lock_context(new);
567 spin_lock(&inode->i_lock);
568 res = __nfs_find_lock_context(ctx);
569 if (res == NULL) {
570 list_add_tail(&new->list, &ctx->lock_context.list);
571 new->open_context = ctx;
572 res = new;
573 new = NULL;
574 }
575 }
576 spin_unlock(&inode->i_lock);
577 kfree(new);
578 return res;
579}
580
581void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
582{
583 struct nfs_open_context *ctx = l_ctx->open_context;
584 struct inode *inode = ctx->path.dentry->d_inode;
585
586 if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock))
587 return;
588 list_del(&l_ctx->list);
589 spin_unlock(&inode->i_lock);
590 kfree(l_ctx);
591}
592
533/** 593/**
534 * nfs_close_context - Common close_context() routine NFSv2/v3 594 * nfs_close_context - Common close_context() routine NFSv2/v3
535 * @ctx: pointer to context 595 * @ctx: pointer to context
@@ -566,11 +626,11 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
566 path_get(&ctx->path); 626 path_get(&ctx->path);
567 ctx->cred = get_rpccred(cred); 627 ctx->cred = get_rpccred(cred);
568 ctx->state = NULL; 628 ctx->state = NULL;
569 ctx->lockowner = current->files;
570 ctx->flags = 0; 629 ctx->flags = 0;
571 ctx->error = 0; 630 ctx->error = 0;
572 ctx->dir_cookie = 0; 631 ctx->dir_cookie = 0;
573 atomic_set(&ctx->count, 1); 632 nfs_init_lock_context(&ctx->lock_context);
633 ctx->lock_context.open_context = ctx;
574 } 634 }
575 return ctx; 635 return ctx;
576} 636}
@@ -578,7 +638,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct
578struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) 638struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
579{ 639{
580 if (ctx != NULL) 640 if (ctx != NULL)
581 atomic_inc(&ctx->count); 641 atomic_inc(&ctx->lock_context.count);
582 return ctx; 642 return ctx;
583} 643}
584 644
@@ -586,7 +646,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
586{ 646{
587 struct inode *inode = ctx->path.dentry->d_inode; 647 struct inode *inode = ctx->path.dentry->d_inode;
588 648
589 if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) 649 if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock))
590 return; 650 return;
591 list_del(&ctx->list); 651 list_del(&ctx->list);
592 spin_unlock(&inode->i_lock); 652 spin_unlock(&inode->i_lock);
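nfs_get_lock_context() above allocates optimistically: search under inode->i_lock, drop the lock for kmalloc(), then re-take it and search again before inserting, freeing the spare copy if another thread won the race. A pthread sketch of the same drop-lock/allocate/re-check pattern (list layout and names hypothetical):

#include <pthread.h>
#include <stdlib.h>

struct entry {
	long key;
	int refcount;
	struct entry *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry *head;

/* Must be called with `lock` held; takes a reference on a hit. */
static struct entry *find_locked(long key)
{
	for (struct entry *e = head; e != NULL; e = e->next) {
		if (e->key != key)
			continue;
		e->refcount++;
		return e;
	}
	return NULL;
}

struct entry *get_entry(long key)
{
	struct entry *res, *new = NULL;

	pthread_mutex_lock(&lock);
	res = find_locked(key);
	if (res == NULL) {
		/* Allocate outside the lock, then look again: another
		 * thread may have inserted the entry in the meantime. */
		pthread_mutex_unlock(&lock);
		new = calloc(1, sizeof(*new));
		if (new == NULL)
			return NULL;
		new->key = key;
		new->refcount = 1;
		pthread_mutex_lock(&lock);
		res = find_locked(key);
		if (res == NULL) {
			new->next = head;
			head = new;
			res = new;
			new = NULL;
		}
	}
	pthread_mutex_unlock(&lock);
	free(new);	/* lost the race: discard the spare copy */
	return res;
}

The second lookup is what makes the unlocked allocation safe; without it, two racing callers could insert duplicate contexts for the same owner.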
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e70f44b9b3f4..4c2150d86714 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -370,10 +370,9 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
370 * Helper for restarting RPC calls in the possible presence of NFSv4.1 370 * Helper for restarting RPC calls in the possible presence of NFSv4.1
371 * sessions. 371 * sessions.
372 */ 372 */
373static inline void nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) 373static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp)
374{ 374{
375 if (nfs4_has_session(clp)) 375 if (nfs4_has_session(clp))
376 rpc_restart_call_prepare(task); 376 return rpc_restart_call_prepare(task);
377 else 377 return rpc_restart_call(task);
378 rpc_restart_call(task);
379} 378}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 81cf14257916..db8846a0e82e 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -233,7 +233,7 @@ nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs
233static int 233static int
234nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 234nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
235{ 235{
236 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 236 struct rpc_auth *auth = req->rq_cred->cr_auth;
237 unsigned int replen; 237 unsigned int replen;
238 u32 offset = (u32)args->offset; 238 u32 offset = (u32)args->offset;
239 u32 count = args->count; 239 u32 count = args->count;
@@ -393,8 +393,7 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *arg
393static int 393static int
394nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) 394nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args)
395{ 395{
396 struct rpc_task *task = req->rq_task; 396 struct rpc_auth *auth = req->rq_cred->cr_auth;
397 struct rpc_auth *auth = task->tk_msg.rpc_cred->cr_auth;
398 unsigned int replen; 397 unsigned int replen;
399 u32 count = args->count; 398 u32 count = args->count;
400 399
@@ -575,7 +574,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res)
575static int 574static int
576nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) 575nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args)
577{ 576{
578 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 577 struct rpc_auth *auth = req->rq_cred->cr_auth;
579 unsigned int replen; 578 unsigned int replen;
580 579
581 p = xdr_encode_fhandle(p, args->fh); 580 p = xdr_encode_fhandle(p, args->fh);
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 75dcfc7da365..9769704f8ce6 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -330,7 +330,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *arg
330static int 330static int
331nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) 331nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args)
332{ 332{
333 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 333 struct rpc_auth *auth = req->rq_cred->cr_auth;
334 unsigned int replen; 334 unsigned int replen;
335 u32 count = args->count; 335 u32 count = args->count;
336 336
@@ -471,7 +471,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args)
471static int 471static int
472nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) 472nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args)
473{ 473{
474 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 474 struct rpc_auth *auth = req->rq_cred->cr_auth;
475 unsigned int replen; 475 unsigned int replen;
476 u32 count = args->count; 476 u32 count = args->count;
477 477
@@ -675,7 +675,7 @@ static int
675nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, 675nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p,
676 struct nfs3_getaclargs *args) 676 struct nfs3_getaclargs *args)
677{ 677{
678 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 678 struct rpc_auth *auth = req->rq_cred->cr_auth;
679 unsigned int replen; 679 unsigned int replen;
680 680
681 p = xdr_encode_fhandle(p, args->fh); 681 p = xdr_encode_fhandle(p, args->fh);
@@ -802,7 +802,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res)
802static int 802static int
803nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) 803nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args)
804{ 804{
805 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 805 struct rpc_auth *auth = req->rq_cred->cr_auth;
806 unsigned int replen; 806 unsigned int replen;
807 807
808 p = xdr_encode_fhandle(p, args->fh); 808 p = xdr_encode_fhandle(p, args->fh);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index c538c6106e16..311e15cc8af0 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -45,10 +45,29 @@ enum nfs4_client_state {
45 NFS4CLNT_RECLAIM_NOGRACE, 45 NFS4CLNT_RECLAIM_NOGRACE,
46 NFS4CLNT_DELEGRETURN, 46 NFS4CLNT_DELEGRETURN,
47 NFS4CLNT_SESSION_RESET, 47 NFS4CLNT_SESSION_RESET,
48 NFS4CLNT_SESSION_DRAINING,
49 NFS4CLNT_RECALL_SLOT, 48 NFS4CLNT_RECALL_SLOT,
50}; 49};
51 50
51enum nfs4_session_state {
52 NFS4_SESSION_INITING,
53 NFS4_SESSION_DRAINING,
54};
55
56struct nfs4_minor_version_ops {
57 u32 minor_version;
58
59 int (*call_sync)(struct nfs_server *server,
60 struct rpc_message *msg,
61 struct nfs4_sequence_args *args,
62 struct nfs4_sequence_res *res,
63 int cache_reply);
64 int (*validate_stateid)(struct nfs_delegation *,
65 const nfs4_stateid *);
66 const struct nfs4_state_recovery_ops *reboot_recovery_ops;
67 const struct nfs4_state_recovery_ops *nograce_recovery_ops;
68 const struct nfs4_state_maintenance_ops *state_renewal_ops;
69};
70
52/* 71/*
53 * struct rpc_sequence ensures that RPC calls are sent in the exact 72 * struct rpc_sequence ensures that RPC calls are sent in the exact
54 * order that they appear on the list. 73 * order that they appear on the list.
@@ -89,7 +108,6 @@ struct nfs_unique_id {
89 */ 108 */
90struct nfs4_state_owner { 109struct nfs4_state_owner {
91 struct nfs_unique_id so_owner_id; 110 struct nfs_unique_id so_owner_id;
92 struct nfs_client *so_client;
93 struct nfs_server *so_server; 111 struct nfs_server *so_server;
94 struct rb_node so_client_node; 112 struct rb_node so_client_node;
95 113
@@ -99,7 +117,6 @@ struct nfs4_state_owner {
99 atomic_t so_count; 117 atomic_t so_count;
100 unsigned long so_flags; 118 unsigned long so_flags;
101 struct list_head so_states; 119 struct list_head so_states;
102 struct list_head so_delegations;
103 struct nfs_seqid_counter so_seqid; 120 struct nfs_seqid_counter so_seqid;
104 struct rpc_sequence so_sequence; 121 struct rpc_sequence so_sequence;
105}; 122};
@@ -125,10 +142,20 @@ enum {
125 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) 142 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
126 */ 143 */
127 144
145struct nfs4_lock_owner {
146 unsigned int lo_type;
147#define NFS4_ANY_LOCK_TYPE (0U)
148#define NFS4_FLOCK_LOCK_TYPE (1U << 0)
149#define NFS4_POSIX_LOCK_TYPE (1U << 1)
150 union {
151 fl_owner_t posix_owner;
152 pid_t flock_owner;
153 } lo_u;
154};
155
128struct nfs4_lock_state { 156struct nfs4_lock_state {
129 struct list_head ls_locks; /* Other lock stateids */ 157 struct list_head ls_locks; /* Other lock stateids */
130 struct nfs4_state * ls_state; /* Pointer to open state */ 158 struct nfs4_state * ls_state; /* Pointer to open state */
131 fl_owner_t ls_owner; /* POSIX lock owner */
132#define NFS_LOCK_INITIALIZED 1 159#define NFS_LOCK_INITIALIZED 1
133 int ls_flags; 160 int ls_flags;
134 struct nfs_seqid_counter ls_seqid; 161 struct nfs_seqid_counter ls_seqid;
@@ -136,6 +163,7 @@ struct nfs4_lock_state {
136 struct nfs_unique_id ls_id; 163 struct nfs_unique_id ls_id;
137 nfs4_stateid ls_stateid; 164 nfs4_stateid ls_stateid;
138 atomic_t ls_count; 165 atomic_t ls_count;
166 struct nfs4_lock_owner ls_owner;
139}; 167};
140 168
141/* bits for nfs4_state->flags */ 169/* bits for nfs4_state->flags */
@@ -219,11 +247,15 @@ extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nam
219extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); 247extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
220extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, 248extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
221 struct nfs4_fs_locations *fs_locations, struct page *page); 249 struct nfs4_fs_locations *fs_locations, struct page *page);
250extern void nfs4_release_lockowner(const struct nfs4_lock_state *);
222 251
223extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[];
224extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[];
225#if defined(CONFIG_NFS_V4_1) 252#if defined(CONFIG_NFS_V4_1)
226extern int nfs4_setup_sequence(struct nfs_client *clp, 253static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
254{
255 return server->nfs_client->cl_session;
256}
257
258extern int nfs4_setup_sequence(const struct nfs_server *server,
227 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 259 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
228 int cache_reply, struct rpc_task *task); 260 int cache_reply, struct rpc_task *task);
229extern void nfs4_destroy_session(struct nfs4_session *session); 261extern void nfs4_destroy_session(struct nfs4_session *session);
@@ -234,7 +266,12 @@ extern int nfs4_init_session(struct nfs_server *server);
234extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 266extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
235 struct nfs_fsinfo *fsinfo); 267 struct nfs_fsinfo *fsinfo);
236#else /* CONFIG_NFS_v4_1 */ 268#else /* CONFIG_NFS_v4_1 */
237static inline int nfs4_setup_sequence(struct nfs_client *clp, 269static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
270{
271 return NULL;
272}
273
274static inline int nfs4_setup_sequence(const struct nfs_server *server,
238 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 275 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
239 int cache_reply, struct rpc_task *task) 276 int cache_reply, struct rpc_task *task)
240{ 277{
@@ -247,7 +284,7 @@ static inline int nfs4_init_session(struct nfs_server *server)
247} 284}
248#endif /* CONFIG_NFS_V4_1 */ 285#endif /* CONFIG_NFS_V4_1 */
249 286
250extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; 287extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
251 288
252extern const u32 nfs4_fattr_bitmap[2]; 289extern const u32 nfs4_fattr_bitmap[2];
253extern const u32 nfs4_statfs_bitmap[2]; 290extern const u32 nfs4_statfs_bitmap[2];
@@ -284,7 +321,7 @@ extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
284extern void nfs41_handle_recall_slot(struct nfs_client *clp); 321extern void nfs41_handle_recall_slot(struct nfs_client *clp);
285extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 322extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
286extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); 323extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
287extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); 324extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t);
288 325
289extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); 326extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask);
290extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); 327extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
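The new struct nfs4_minor_version_ops gathers what used to live in parallel arrays (nfs4_reboot_recovery_ops[], nfs4_nograce_recovery_ops[], nfs4_state_renewal_ops[]) into one ops table chosen per client. A compact sketch of the pattern (the operations and their signatures here are invented for illustration):

#include <stdio.h>

struct minor_version_ops {
	unsigned int minor_version;
	int (*call_sync)(const char *op);
};

static int v40_call_sync(const char *op)
{
	printf("v4.0: plain sync call: %s\n", op);
	return 0;
}

static int v41_call_sync(const char *op)
{
	printf("v4.1: sequenced sync call: %s\n", op);
	return 0;
}

static const struct minor_version_ops minor_40 = {
	.minor_version	= 0,
	.call_sync	= v40_call_sync,
};

static const struct minor_version_ops minor_41 = {
	.minor_version	= 1,
	.call_sync	= v41_call_sync,
};

/* One table indexed by minor version, like nfs_v4_minor_ops[]. */
static const struct minor_version_ops *minor_ops[] = {
	[0] = &minor_40,
	[1] = &minor_41,
};

int main(void)
{
	/* A client binds to one entry at mount time and never
	 * branches on the minor version again. */
	return minor_ops[1]->call_sync("GETATTR");
}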
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 70015dd60a98..7ffbb98ddec3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -303,15 +303,19 @@ do_state_recovery:
303} 303}
304 304
305 305
306static void renew_lease(const struct nfs_server *server, unsigned long timestamp) 306static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp)
307{ 307{
308 struct nfs_client *clp = server->nfs_client;
309 spin_lock(&clp->cl_lock); 308 spin_lock(&clp->cl_lock);
310 if (time_before(clp->cl_last_renewal,timestamp)) 309 if (time_before(clp->cl_last_renewal,timestamp))
311 clp->cl_last_renewal = timestamp; 310 clp->cl_last_renewal = timestamp;
312 spin_unlock(&clp->cl_lock); 311 spin_unlock(&clp->cl_lock);
313} 312}
314 313
314static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
315{
316 do_renew_lease(server->nfs_client, timestamp);
317}
318
315#if defined(CONFIG_NFS_V4_1) 319#if defined(CONFIG_NFS_V4_1)
316 320
317/* 321/*
@@ -356,7 +360,7 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
356{ 360{
357 struct rpc_task *task; 361 struct rpc_task *task;
358 362
359 if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { 363 if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
360 task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); 364 task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq);
361 if (task) 365 if (task)
362 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); 366 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
@@ -370,12 +374,11 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
370 complete(&ses->complete); 374 complete(&ses->complete);
371} 375}
372 376
373static void nfs41_sequence_free_slot(const struct nfs_client *clp, 377static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
374 struct nfs4_sequence_res *res)
375{ 378{
376 struct nfs4_slot_table *tbl; 379 struct nfs4_slot_table *tbl;
377 380
378 tbl = &clp->cl_session->fc_slot_table; 381 tbl = &res->sr_session->fc_slot_table;
379 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { 382 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) {
380 /* just wake up the next guy waiting since 383 /* just wake up the next guy waiting since
381 * we may have not consumed a slot after all */ 384 * we may have not consumed a slot after all */
@@ -385,18 +388,17 @@ static void nfs41_sequence_free_slot(const struct nfs_client *clp,
385 388
386 spin_lock(&tbl->slot_tbl_lock); 389 spin_lock(&tbl->slot_tbl_lock);
387 nfs4_free_slot(tbl, res->sr_slotid); 390 nfs4_free_slot(tbl, res->sr_slotid);
388 nfs41_check_drain_session_complete(clp->cl_session); 391 nfs41_check_drain_session_complete(res->sr_session);
389 spin_unlock(&tbl->slot_tbl_lock); 392 spin_unlock(&tbl->slot_tbl_lock);
390 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 393 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
391} 394}
392 395
393static void nfs41_sequence_done(struct nfs_client *clp, 396static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
394 struct nfs4_sequence_res *res,
395 int rpc_status)
396{ 397{
397 unsigned long timestamp; 398 unsigned long timestamp;
398 struct nfs4_slot_table *tbl; 399 struct nfs4_slot_table *tbl;
399 struct nfs4_slot *slot; 400 struct nfs4_slot *slot;
401 struct nfs_client *clp;
400 402
401 /* 403 /*
402 * sr_status remains 1 if an RPC level error occurred. The server 404 * sr_status remains 1 if an RPC level error occurred. The server
@@ -411,25 +413,51 @@ static void nfs41_sequence_done(struct nfs_client *clp,
411 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) 413 if (res->sr_slotid == NFS4_MAX_SLOT_TABLE)
412 goto out; 414 goto out;
413 415
416 tbl = &res->sr_session->fc_slot_table;
417 slot = tbl->slots + res->sr_slotid;
418
414 /* Check the SEQUENCE operation status */ 419 /* Check the SEQUENCE operation status */
415 if (res->sr_status == 0) { 420 switch (res->sr_status) {
416 tbl = &clp->cl_session->fc_slot_table; 421 case 0:
417 slot = tbl->slots + res->sr_slotid;
418 /* Update the slot's sequence and clientid lease timer */ 422 /* Update the slot's sequence and clientid lease timer */
419 ++slot->seq_nr; 423 ++slot->seq_nr;
420 timestamp = res->sr_renewal_time; 424 timestamp = res->sr_renewal_time;
421 spin_lock(&clp->cl_lock); 425 clp = res->sr_session->clp;
422 if (time_before(clp->cl_last_renewal, timestamp)) 426 do_renew_lease(clp, timestamp);
423 clp->cl_last_renewal = timestamp;
424 spin_unlock(&clp->cl_lock);
425 /* Check sequence flags */ 427 /* Check sequence flags */
426 if (atomic_read(&clp->cl_count) > 1) 428 if (atomic_read(&clp->cl_count) > 1)
427 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); 429 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
430 break;
431 case -NFS4ERR_DELAY:
432 /* The server detected a resend of the RPC call and
433 * returned NFS4ERR_DELAY as per Section 2.10.6.2
434 * of RFC5661.
435 */
436 dprintk("%s: slot=%d seq=%d: Operation in progress\n",
437 __func__, res->sr_slotid, slot->seq_nr);
438 goto out_retry;
439 default:
440 /* Just update the slot sequence no. */
441 ++slot->seq_nr;
428 } 442 }
429out: 443out:
430 /* The session may be reset by one of the error handlers. */ 444 /* The session may be reset by one of the error handlers. */
431 dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); 445 dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
432 nfs41_sequence_free_slot(clp, res); 446 nfs41_sequence_free_slot(res);
447 return 1;
448out_retry:
449 if (!rpc_restart_call(task))
450 goto out;
451 rpc_delay(task, NFS4_POLL_RETRY_MAX);
452 return 0;
453}
454
455static int nfs4_sequence_done(struct rpc_task *task,
456 struct nfs4_sequence_res *res)
457{
458 if (res->sr_session == NULL)
459 return 1;
460 return nfs41_sequence_done(task, res);
433} 461}
434 462
435/* 463/*
@@ -480,12 +508,11 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
480 if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) 508 if (res->sr_slotid != NFS4_MAX_SLOT_TABLE)
481 return 0; 509 return 0;
482 510
483 memset(res, 0, sizeof(*res));
484 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 511 res->sr_slotid = NFS4_MAX_SLOT_TABLE;
485 tbl = &session->fc_slot_table; 512 tbl = &session->fc_slot_table;
486 513
487 spin_lock(&tbl->slot_tbl_lock); 514 spin_lock(&tbl->slot_tbl_lock);
488 if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && 515 if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
489 !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { 516 !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
490 /* 517 /*
491 * The state manager will wait until the slot table is empty. 518 * The state manager will wait until the slot table is empty.
@@ -525,6 +552,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
525 res->sr_session = session; 552 res->sr_session = session;
526 res->sr_slotid = slotid; 553 res->sr_slotid = slotid;
527 res->sr_renewal_time = jiffies; 554 res->sr_renewal_time = jiffies;
555 res->sr_status_flags = 0;
528 /* 556 /*
529 * sr_status is only set in decode_sequence, and so will remain 557 * sr_status is only set in decode_sequence, and so will remain
530 * set to 1 if an rpc level failure occurs. 558 * set to 1 if an rpc level failure occurs.
@@ -533,33 +561,33 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
533 return 0; 561 return 0;
534} 562}
535 563
536int nfs4_setup_sequence(struct nfs_client *clp, 564int nfs4_setup_sequence(const struct nfs_server *server,
537 struct nfs4_sequence_args *args, 565 struct nfs4_sequence_args *args,
538 struct nfs4_sequence_res *res, 566 struct nfs4_sequence_res *res,
539 int cache_reply, 567 int cache_reply,
540 struct rpc_task *task) 568 struct rpc_task *task)
541{ 569{
570 struct nfs4_session *session = nfs4_get_session(server);
542 int ret = 0; 571 int ret = 0;
543 572
573 if (session == NULL) {
574 args->sa_session = NULL;
575 res->sr_session = NULL;
576 goto out;
577 }
578
544 dprintk("--> %s clp %p session %p sr_slotid %d\n", 579 dprintk("--> %s clp %p session %p sr_slotid %d\n",
545 __func__, clp, clp->cl_session, res->sr_slotid); 580 __func__, session->clp, session, res->sr_slotid);
546 581
547 if (!nfs4_has_session(clp)) 582 ret = nfs41_setup_sequence(session, args, res, cache_reply,
548 goto out;
549 ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply,
550 task); 583 task);
551 if (ret && ret != -EAGAIN) {
552 /* terminate rpc task */
553 task->tk_status = ret;
554 task->tk_action = NULL;
555 }
556out: 584out:
557 dprintk("<-- %s status=%d\n", __func__, ret); 585 dprintk("<-- %s status=%d\n", __func__, ret);
558 return ret; 586 return ret;
559} 587}
560 588
561struct nfs41_call_sync_data { 589struct nfs41_call_sync_data {
562 struct nfs_client *clp; 590 const struct nfs_server *seq_server;
563 struct nfs4_sequence_args *seq_args; 591 struct nfs4_sequence_args *seq_args;
564 struct nfs4_sequence_res *seq_res; 592 struct nfs4_sequence_res *seq_res;
565 int cache_reply; 593 int cache_reply;
@@ -569,9 +597,9 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
569{ 597{
570 struct nfs41_call_sync_data *data = calldata; 598 struct nfs41_call_sync_data *data = calldata;
571 599
572 dprintk("--> %s data->clp->cl_session %p\n", __func__, 600 dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server);
573 data->clp->cl_session); 601
574 if (nfs4_setup_sequence(data->clp, data->seq_args, 602 if (nfs4_setup_sequence(data->seq_server, data->seq_args,
575 data->seq_res, data->cache_reply, task)) 603 data->seq_res, data->cache_reply, task))
576 return; 604 return;
577 rpc_call_start(task); 605 rpc_call_start(task);
@@ -587,7 +615,7 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
587{ 615{
588 struct nfs41_call_sync_data *data = calldata; 616 struct nfs41_call_sync_data *data = calldata;
589 617
590 nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); 618 nfs41_sequence_done(task, data->seq_res);
591} 619}
592 620
593struct rpc_call_ops nfs41_call_sync_ops = { 621struct rpc_call_ops nfs41_call_sync_ops = {
@@ -600,8 +628,7 @@ struct rpc_call_ops nfs41_call_priv_sync_ops = {
600 .rpc_call_done = nfs41_call_sync_done, 628 .rpc_call_done = nfs41_call_sync_done,
601}; 629};
602 630
603static int nfs4_call_sync_sequence(struct nfs_client *clp, 631static int nfs4_call_sync_sequence(struct nfs_server *server,
604 struct rpc_clnt *clnt,
605 struct rpc_message *msg, 632 struct rpc_message *msg,
606 struct nfs4_sequence_args *args, 633 struct nfs4_sequence_args *args,
607 struct nfs4_sequence_res *res, 634 struct nfs4_sequence_res *res,
@@ -611,13 +638,13 @@ static int nfs4_call_sync_sequence(struct nfs_client *clp,
611 int ret; 638 int ret;
612 struct rpc_task *task; 639 struct rpc_task *task;
613 struct nfs41_call_sync_data data = { 640 struct nfs41_call_sync_data data = {
614 .clp = clp, 641 .seq_server = server,
615 .seq_args = args, 642 .seq_args = args,
616 .seq_res = res, 643 .seq_res = res,
617 .cache_reply = cache_reply, 644 .cache_reply = cache_reply,
618 }; 645 };
619 struct rpc_task_setup task_setup = { 646 struct rpc_task_setup task_setup = {
620 .rpc_client = clnt, 647 .rpc_client = server->client,
621 .rpc_message = msg, 648 .rpc_message = msg,
622 .callback_ops = &nfs41_call_sync_ops, 649 .callback_ops = &nfs41_call_sync_ops,
623 .callback_data = &data 650 .callback_data = &data
@@ -642,10 +669,15 @@ int _nfs4_call_sync_session(struct nfs_server *server,
642 struct nfs4_sequence_res *res, 669 struct nfs4_sequence_res *res,
643 int cache_reply) 670 int cache_reply)
644{ 671{
645 return nfs4_call_sync_sequence(server->nfs_client, server->client, 672 return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0);
646 msg, args, res, cache_reply, 0);
647} 673}
648 674
675#else
676static int nfs4_sequence_done(struct rpc_task *task,
677 struct nfs4_sequence_res *res)
678{
679 return 1;
680}
649#endif /* CONFIG_NFS_V4_1 */ 681#endif /* CONFIG_NFS_V4_1 */
650 682
651int _nfs4_call_sync(struct nfs_server *server, 683int _nfs4_call_sync(struct nfs_server *server,
@@ -659,18 +691,9 @@ int _nfs4_call_sync(struct nfs_server *server,
659} 691}
660 692
661#define nfs4_call_sync(server, msg, args, res, cache_reply) \ 693#define nfs4_call_sync(server, msg, args, res, cache_reply) \
662 (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ 694 (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \
663 &(res)->seq_res, (cache_reply)) 695 &(res)->seq_res, (cache_reply))
664 696
665static void nfs4_sequence_done(const struct nfs_server *server,
666 struct nfs4_sequence_res *res, int rpc_status)
667{
668#ifdef CONFIG_NFS_V4_1
669 if (nfs4_has_session(server->nfs_client))
670 nfs41_sequence_done(server->nfs_client, res, rpc_status);
671#endif /* CONFIG_NFS_V4_1 */
672}
673
674static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) 697static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
675{ 698{
676 struct nfs_inode *nfsi = NFS_I(dir); 699 struct nfs_inode *nfsi = NFS_I(dir);
@@ -745,19 +768,14 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
745 p->o_arg.server = server; 768 p->o_arg.server = server;
746 p->o_arg.bitmask = server->attr_bitmask; 769 p->o_arg.bitmask = server->attr_bitmask;
747 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; 770 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
748 if (flags & O_EXCL) { 771 if (flags & O_CREAT) {
749 if (nfs4_has_persistent_session(server->nfs_client)) { 772 u32 *s;
750 /* GUARDED */ 773
751 p->o_arg.u.attrs = &p->attrs;
752 memcpy(&p->attrs, attrs, sizeof(p->attrs));
753 } else { /* EXCLUSIVE4_1 */
754 u32 *s = (u32 *) p->o_arg.u.verifier.data;
755 s[0] = jiffies;
756 s[1] = current->pid;
757 }
758 } else if (flags & O_CREAT) {
759 p->o_arg.u.attrs = &p->attrs; 774 p->o_arg.u.attrs = &p->attrs;
760 memcpy(&p->attrs, attrs, sizeof(p->attrs)); 775 memcpy(&p->attrs, attrs, sizeof(p->attrs));
776 s = (u32 *) p->o_arg.u.verifier.data;
777 s[0] = jiffies;
778 s[1] = current->pid;
761 } 779 }
762 p->c_arg.fh = &p->o_res.fh; 780 p->c_arg.fh = &p->o_res.fh;
763 p->c_arg.stateid = &p->o_res.stateid; 781 p->c_arg.stateid = &p->o_res.stateid;
@@ -1255,8 +1273,6 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
1255 struct nfs4_opendata *data = calldata; 1273 struct nfs4_opendata *data = calldata;
1256 1274
1257 data->rpc_status = task->tk_status; 1275 data->rpc_status = task->tk_status;
1258 if (RPC_ASSASSINATED(task))
1259 return;
1260 if (data->rpc_status == 0) { 1276 if (data->rpc_status == 0) {
1261 memcpy(data->o_res.stateid.data, data->c_res.stateid.data, 1277 memcpy(data->o_res.stateid.data, data->c_res.stateid.data,
1262 sizeof(data->o_res.stateid.data)); 1278 sizeof(data->o_res.stateid.data));
@@ -1356,13 +1372,13 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
1356 } 1372 }
1357 /* Update sequence id. */ 1373 /* Update sequence id. */
1358 data->o_arg.id = sp->so_owner_id.id; 1374 data->o_arg.id = sp->so_owner_id.id;
1359 data->o_arg.clientid = sp->so_client->cl_clientid; 1375 data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
1360 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { 1376 if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
1361 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; 1377 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
1362 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); 1378 nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
1363 } 1379 }
1364 data->timestamp = jiffies; 1380 data->timestamp = jiffies;
1365 if (nfs4_setup_sequence(data->o_arg.server->nfs_client, 1381 if (nfs4_setup_sequence(data->o_arg.server,
1366 &data->o_arg.seq_args, 1382 &data->o_arg.seq_args,
1367 &data->o_res.seq_res, 1, task)) 1383 &data->o_res.seq_res, 1, task))
1368 return; 1384 return;
@@ -1385,11 +1401,9 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
1385 1401
1386 data->rpc_status = task->tk_status; 1402 data->rpc_status = task->tk_status;
1387 1403
1388 nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, 1404 if (!nfs4_sequence_done(task, &data->o_res.seq_res))
1389 task->tk_status);
1390
1391 if (RPC_ASSASSINATED(task))
1392 return; 1405 return;
1406
1393 if (task->tk_status == 0) { 1407 if (task->tk_status == 0) {
1394 switch (data->o_res.f_attr->mode & S_IFMT) { 1408 switch (data->o_res.f_attr->mode & S_IFMT) {
1395 case S_IFREG: 1409 case S_IFREG:
@@ -1773,7 +1787,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
1773 if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { 1787 if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
1774 /* Use that stateid */ 1788 /* Use that stateid */
1775 } else if (state != NULL) { 1789 } else if (state != NULL) {
1776 nfs4_copy_stateid(&arg.stateid, state, current->files); 1790 nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid);
1777 } else 1791 } else
1778 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); 1792 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
1779 1793
@@ -1838,8 +1852,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
1838 struct nfs4_state *state = calldata->state; 1852 struct nfs4_state *state = calldata->state;
1839 struct nfs_server *server = NFS_SERVER(calldata->inode); 1853 struct nfs_server *server = NFS_SERVER(calldata->inode);
1840 1854
1841 nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status); 1855 if (!nfs4_sequence_done(task, &calldata->res.seq_res))
1842 if (RPC_ASSASSINATED(task))
1843 return; 1856 return;
1844 /* hmm. we are done with the inode, and in the process of freeing 1857 /* hmm. we are done with the inode, and in the process of freeing
1845 * the state_owner. we keep this around to process errors 1858 * the state_owner. we keep this around to process errors
@@ -1903,7 +1916,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
1903 1916
1904 nfs_fattr_init(calldata->res.fattr); 1917 nfs_fattr_init(calldata->res.fattr);
1905 calldata->timestamp = jiffies; 1918 calldata->timestamp = jiffies;
1906 if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, 1919 if (nfs4_setup_sequence(NFS_SERVER(calldata->inode),
1907 &calldata->arg.seq_args, &calldata->res.seq_res, 1920 &calldata->arg.seq_args, &calldata->res.seq_res,
1908 1, task)) 1921 1, task))
1909 return; 1922 return;
@@ -2648,7 +2661,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
2648{ 2661{
2649 struct nfs_removeres *res = task->tk_msg.rpc_resp; 2662 struct nfs_removeres *res = task->tk_msg.rpc_resp;
2650 2663
2651 nfs4_sequence_done(res->server, &res->seq_res, task->tk_status); 2664 if (!nfs4_sequence_done(task, &res->seq_res))
2665 return 0;
2652 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 2666 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
2653 return 0; 2667 return 0;
2654 update_changeattr(dir, &res->cinfo); 2668 update_changeattr(dir, &res->cinfo);
@@ -3093,7 +3107,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3093 3107
3094 dprintk("--> %s\n", __func__); 3108 dprintk("--> %s\n", __func__);
3095 3109
3096 nfs4_sequence_done(server, &data->res.seq_res, task->tk_status); 3110 if (!nfs4_sequence_done(task, &data->res.seq_res))
3111 return -EAGAIN;
3097 3112
3098 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 3113 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
3099 nfs_restart_rpc(task, server->nfs_client); 3114 nfs_restart_rpc(task, server->nfs_client);
@@ -3116,8 +3131,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3116{ 3131{
3117 struct inode *inode = data->inode; 3132 struct inode *inode = data->inode;
3118 3133
3119 nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, 3134 if (!nfs4_sequence_done(task, &data->res.seq_res))
3120 task->tk_status); 3135 return -EAGAIN;
3121 3136
3122 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 3137 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
3123 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3138 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
@@ -3145,8 +3160,9 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
3145{ 3160{
3146 struct inode *inode = data->inode; 3161 struct inode *inode = data->inode;
3147 3162
3148 nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, 3163 if (!nfs4_sequence_done(task, &data->res.seq_res))
3149 task->tk_status); 3164 return -EAGAIN;
3165
3150 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { 3166 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
3151 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3167 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
3152 return -EAGAIN; 3168 return -EAGAIN;
@@ -3196,10 +3212,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
3196 nfs4_schedule_state_recovery(clp); 3212 nfs4_schedule_state_recovery(clp);
3197 return; 3213 return;
3198 } 3214 }
3199 spin_lock(&clp->cl_lock); 3215 do_renew_lease(clp, timestamp);
3200 if (time_before(clp->cl_last_renewal,timestamp))
3201 clp->cl_last_renewal = timestamp;
3202 spin_unlock(&clp->cl_lock);
3203} 3216}
3204 3217
3205static const struct rpc_call_ops nfs4_renew_ops = { 3218static const struct rpc_call_ops nfs4_renew_ops = {
@@ -3240,10 +3253,7 @@ int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
3240 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); 3253 status = rpc_call_sync(clp->cl_rpcclient, &msg, 0);
3241 if (status < 0) 3254 if (status < 0)
3242 return status; 3255 return status;
3243 spin_lock(&clp->cl_lock); 3256 do_renew_lease(clp, now);
3244 if (time_before(clp->cl_last_renewal,now))
3245 clp->cl_last_renewal = now;
3246 spin_unlock(&clp->cl_lock);
3247 return 0; 3257 return 0;
3248} 3258}
3249 3259
@@ -3464,9 +3474,11 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
3464} 3474}
3465 3475
3466static int 3476static int
3467_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state) 3477nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
3468{ 3478{
3469 if (!clp || task->tk_status >= 0) 3479 struct nfs_client *clp = server->nfs_client;
3480
3481 if (task->tk_status >= 0)
3470 return 0; 3482 return 0;
3471 switch(task->tk_status) { 3483 switch(task->tk_status) {
3472 case -NFS4ERR_ADMIN_REVOKED: 3484 case -NFS4ERR_ADMIN_REVOKED:
@@ -3498,8 +3510,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3498 return -EAGAIN; 3510 return -EAGAIN;
3499#endif /* CONFIG_NFS_V4_1 */ 3511#endif /* CONFIG_NFS_V4_1 */
3500 case -NFS4ERR_DELAY: 3512 case -NFS4ERR_DELAY:
3501 if (server) 3513 nfs_inc_server_stats(server, NFSIOS_DELAY);
3502 nfs_inc_server_stats(server, NFSIOS_DELAY);
3503 case -NFS4ERR_GRACE: 3514 case -NFS4ERR_GRACE:
3504 case -EKEYEXPIRED: 3515 case -EKEYEXPIRED:
3505 rpc_delay(task, NFS4_POLL_RETRY_MAX); 3516 rpc_delay(task, NFS4_POLL_RETRY_MAX);
@@ -3520,12 +3531,6 @@ do_state_recovery:
3520 return -EAGAIN; 3531 return -EAGAIN;
3521} 3532}
3522 3533
3523static int
3524nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
3525{
3526 return _nfs4_async_handle_error(task, server, server->nfs_client, state);
3527}
3528
3529int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, 3534int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
3530 unsigned short port, struct rpc_cred *cred, 3535 unsigned short port, struct rpc_cred *cred,
3531 struct nfs4_setclientid_res *res) 3536 struct nfs4_setclientid_res *res)
@@ -3641,8 +3646,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
3641{ 3646{
3642 struct nfs4_delegreturndata *data = calldata; 3647 struct nfs4_delegreturndata *data = calldata;
3643 3648
3644 nfs4_sequence_done(data->res.server, &data->res.seq_res, 3649 if (!nfs4_sequence_done(task, &data->res.seq_res))
3645 task->tk_status); 3650 return;
3646 3651
3647 switch (task->tk_status) { 3652 switch (task->tk_status) {
3648 case -NFS4ERR_STALE_STATEID: 3653 case -NFS4ERR_STALE_STATEID:
@@ -3672,7 +3677,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
3672 3677
3673 d_data = (struct nfs4_delegreturndata *)data; 3678 d_data = (struct nfs4_delegreturndata *)data;
3674 3679
3675 if (nfs4_setup_sequence(d_data->res.server->nfs_client, 3680 if (nfs4_setup_sequence(d_data->res.server,
3676 &d_data->args.seq_args, 3681 &d_data->args.seq_args,
3677 &d_data->res.seq_res, 1, task)) 3682 &d_data->res.seq_res, 1, task))
3678 return; 3683 return;
@@ -3892,9 +3897,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
3892{ 3897{
3893 struct nfs4_unlockdata *calldata = data; 3898 struct nfs4_unlockdata *calldata = data;
3894 3899
3895 nfs4_sequence_done(calldata->server, &calldata->res.seq_res, 3900 if (!nfs4_sequence_done(task, &calldata->res.seq_res))
3896 task->tk_status);
3897 if (RPC_ASSASSINATED(task))
3898 return; 3901 return;
3899 switch (task->tk_status) { 3902 switch (task->tk_status) {
3900 case 0: 3903 case 0:
@@ -3927,7 +3930,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
3927 return; 3930 return;
3928 } 3931 }
3929 calldata->timestamp = jiffies; 3932 calldata->timestamp = jiffies;
3930 if (nfs4_setup_sequence(calldata->server->nfs_client, 3933 if (nfs4_setup_sequence(calldata->server,
3931 &calldata->arg.seq_args, 3934 &calldata->arg.seq_args,
3932 &calldata->res.seq_res, 1, task)) 3935 &calldata->res.seq_res, 1, task))
3933 return; 3936 return;
@@ -4082,7 +4085,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
4082 } else 4085 } else
4083 data->arg.new_lock_owner = 0; 4086 data->arg.new_lock_owner = 0;
4084 data->timestamp = jiffies; 4087 data->timestamp = jiffies;
4085 if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args, 4088 if (nfs4_setup_sequence(data->server,
4089 &data->arg.seq_args,
4086 &data->res.seq_res, 1, task)) 4090 &data->res.seq_res, 1, task))
4087 return; 4091 return;
4088 rpc_call_start(task); 4092 rpc_call_start(task);
@@ -4101,12 +4105,10 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
4101 4105
4102 dprintk("%s: begin!\n", __func__); 4106 dprintk("%s: begin!\n", __func__);
4103 4107
4104 nfs4_sequence_done(data->server, &data->res.seq_res, 4108 if (!nfs4_sequence_done(task, &data->res.seq_res))
4105 task->tk_status); 4109 return;
4106 4110
4107 data->rpc_status = task->tk_status; 4111 data->rpc_status = task->tk_status;
4108 if (RPC_ASSASSINATED(task))
4109 goto out;
4110 if (data->arg.new_lock_owner != 0) { 4112 if (data->arg.new_lock_owner != 0) {
4111 if (data->rpc_status == 0) 4113 if (data->rpc_status == 0)
4112 nfs_confirm_seqid(&data->lsp->ls_seqid, 0); 4114 nfs_confirm_seqid(&data->lsp->ls_seqid, 0);
@@ -4424,6 +4426,34 @@ out:
4424 return err; 4426 return err;
4425} 4427}
4426 4428
4429static void nfs4_release_lockowner_release(void *calldata)
4430{
4431 kfree(calldata);
4432}
4433
4434const struct rpc_call_ops nfs4_release_lockowner_ops = {
4435 .rpc_release = nfs4_release_lockowner_release,
4436};
4437
4438void nfs4_release_lockowner(const struct nfs4_lock_state *lsp)
4439{
4440 struct nfs_server *server = lsp->ls_state->owner->so_server;
4441 struct nfs_release_lockowner_args *args;
4442 struct rpc_message msg = {
4443 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER],
4444 };
4445
4446 if (server->nfs_client->cl_mvops->minor_version != 0)
4447 return;
4448 args = kmalloc(sizeof(*args), GFP_NOFS);
4449 if (!args)
4450 return;
4451 args->lock_owner.clientid = server->nfs_client->cl_clientid;
4452 args->lock_owner.id = lsp->ls_id.id;
4453 msg.rpc_argp = args;
4454 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args);
4455}
4456
4427#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" 4457#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
4428 4458
4429int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, 4459int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf,
@@ -4611,7 +4641,8 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata)
4611 (struct nfs4_get_lease_time_data *)calldata; 4641 (struct nfs4_get_lease_time_data *)calldata;
4612 4642
4613 dprintk("--> %s\n", __func__); 4643 dprintk("--> %s\n", __func__);
4614 nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status); 4644 if (!nfs41_sequence_done(task, &data->res->lr_seq_res))
4645 return;
4615 switch (task->tk_status) { 4646 switch (task->tk_status) {
4616 case -NFS4ERR_DELAY: 4647 case -NFS4ERR_DELAY:
4617 case -NFS4ERR_GRACE: 4648 case -NFS4ERR_GRACE:
@@ -4805,13 +4836,6 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4805 if (!session) 4836 if (!session)
4806 return NULL; 4837 return NULL;
4807 4838
4808 /*
4809 * The create session reply races with the server back
4810 * channel probe. Mark the client NFS_CS_SESSION_INITING
4811 * so that the client back channel can find the
4812 * nfs_client struct
4813 */
4814 clp->cl_cons_state = NFS_CS_SESSION_INITING;
4815 init_completion(&session->complete); 4839 init_completion(&session->complete);
4816 4840
4817 tbl = &session->fc_slot_table; 4841 tbl = &session->fc_slot_table;
@@ -4824,6 +4848,8 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
4824 spin_lock_init(&tbl->slot_tbl_lock); 4848 spin_lock_init(&tbl->slot_tbl_lock);
4825 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); 4849 rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table");
4826 4850
4851 session->session_state = 1<<NFS4_SESSION_INITING;
4852
4827 session->clp = clp; 4853 session->clp = clp;
4828 return session; 4854 return session;
4829} 4855}
@@ -5040,6 +5066,10 @@ int nfs4_init_session(struct nfs_server *server)
5040 if (!nfs4_has_session(clp)) 5066 if (!nfs4_has_session(clp))
5041 return 0; 5067 return 0;
5042 5068
5069 session = clp->cl_session;
5070 if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
5071 return 0;
5072
5043 rsize = server->rsize; 5073 rsize = server->rsize;
5044 if (rsize == 0) 5074 if (rsize == 0)
5045 rsize = NFS_MAX_FILE_IO_SIZE; 5075 rsize = NFS_MAX_FILE_IO_SIZE;
@@ -5047,7 +5077,6 @@ int nfs4_init_session(struct nfs_server *server)
5047 if (wsize == 0) 5077 if (wsize == 0)
5048 wsize = NFS_MAX_FILE_IO_SIZE; 5078 wsize = NFS_MAX_FILE_IO_SIZE;
5049 5079
5050 session = clp->cl_session;
5051 session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; 5080 session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
5052 session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; 5081 session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
5053 5082
@@ -5060,69 +5089,70 @@ int nfs4_init_session(struct nfs_server *server)
5060/* 5089/*
5061 * Renew the cl_session lease. 5090 * Renew the cl_session lease.
5062 */ 5091 */
5063static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) 5092struct nfs4_sequence_data {
5064{ 5093 struct nfs_client *clp;
5065 struct nfs4_sequence_args args; 5094 struct nfs4_sequence_args args;
5066 struct nfs4_sequence_res res; 5095 struct nfs4_sequence_res res;
5067 5096};
5068 struct rpc_message msg = {
5069 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
5070 .rpc_argp = &args,
5071 .rpc_resp = &res,
5072 .rpc_cred = cred,
5073 };
5074
5075 args.sa_cache_this = 0;
5076
5077 return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args,
5078 &res, args.sa_cache_this, 1);
5079}
5080 5097
5081static void nfs41_sequence_release(void *data) 5098static void nfs41_sequence_release(void *data)
5082{ 5099{
5083 struct nfs_client *clp = (struct nfs_client *)data; 5100 struct nfs4_sequence_data *calldata = data;
5101 struct nfs_client *clp = calldata->clp;
5084 5102
5085 if (atomic_read(&clp->cl_count) > 1) 5103 if (atomic_read(&clp->cl_count) > 1)
5086 nfs4_schedule_state_renewal(clp); 5104 nfs4_schedule_state_renewal(clp);
5087 nfs_put_client(clp); 5105 nfs_put_client(clp);
5106 kfree(calldata);
5107}
5108
5109static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp)
5110{
5111 switch(task->tk_status) {
5112 case -NFS4ERR_DELAY:
5113 case -EKEYEXPIRED:
5114 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5115 return -EAGAIN;
5116 default:
5117 nfs4_schedule_state_recovery(clp);
5118 }
5119 return 0;
5088} 5120}
5089 5121
5090static void nfs41_sequence_call_done(struct rpc_task *task, void *data) 5122static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
5091{ 5123{
5092 struct nfs_client *clp = (struct nfs_client *)data; 5124 struct nfs4_sequence_data *calldata = data;
5125 struct nfs_client *clp = calldata->clp;
5093 5126
5094 nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status); 5127 if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp))
5128 return;
5095 5129
5096 if (task->tk_status < 0) { 5130 if (task->tk_status < 0) {
5097 dprintk("%s ERROR %d\n", __func__, task->tk_status); 5131 dprintk("%s ERROR %d\n", __func__, task->tk_status);
5098 if (atomic_read(&clp->cl_count) == 1) 5132 if (atomic_read(&clp->cl_count) == 1)
5099 goto out; 5133 goto out;
5100 5134
5101 if (_nfs4_async_handle_error(task, NULL, clp, NULL) 5135 if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) {
5102 == -EAGAIN) { 5136 rpc_restart_call_prepare(task);
5103 nfs_restart_rpc(task, clp);
5104 return; 5137 return;
5105 } 5138 }
5106 } 5139 }
5107 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); 5140 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
5108out: 5141out:
5109 kfree(task->tk_msg.rpc_argp);
5110 kfree(task->tk_msg.rpc_resp);
5111
5112 dprintk("<-- %s\n", __func__); 5142 dprintk("<-- %s\n", __func__);
5113} 5143}
5114 5144
5115static void nfs41_sequence_prepare(struct rpc_task *task, void *data) 5145static void nfs41_sequence_prepare(struct rpc_task *task, void *data)
5116{ 5146{
5117 struct nfs_client *clp; 5147 struct nfs4_sequence_data *calldata = data;
5148 struct nfs_client *clp = calldata->clp;
5118 struct nfs4_sequence_args *args; 5149 struct nfs4_sequence_args *args;
5119 struct nfs4_sequence_res *res; 5150 struct nfs4_sequence_res *res;
5120 5151
5121 clp = (struct nfs_client *)data;
5122 args = task->tk_msg.rpc_argp; 5152 args = task->tk_msg.rpc_argp;
5123 res = task->tk_msg.rpc_resp; 5153 res = task->tk_msg.rpc_resp;
5124 5154
5125 if (nfs4_setup_sequence(clp, args, res, 0, task)) 5155 if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task))
5126 return; 5156 return;
5127 rpc_call_start(task); 5157 rpc_call_start(task);
5128} 5158}
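The SEQUENCE rework just above builds both call flavours from one constructor: _nfs41_proc_sequence() returns the running task, the async wrapper merely drops its reference, and the sync wrapper waits for completion and harvests tk_status. A pthread sketch of that shape (types hypothetical; compile with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct task {
	int status;
	int detached;	/* the task body frees itself when done */
};

static void *task_body(void *arg)
{
	struct task *t = arg;

	t->status = 0;		/* pretend the SEQUENCE call succeeded */
	if (t->detached)
		free(t);	/* like ->rpc_release() on a fired task */
	return NULL;
}

/* Async flavour: start the task and return without the result. */
static int proc_async(void)
{
	pthread_attr_t attr;
	pthread_t tid;
	struct task *t = calloc(1, sizeof(*t));

	if (t == NULL)
		return -1;
	t->detached = 1;
	pthread_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
	if (pthread_create(&tid, &attr, task_body, t)) {
		pthread_attr_destroy(&attr);
		free(t);
		return -1;
	}
	pthread_attr_destroy(&attr);
	return 0;
}

/* Sync flavour: same constructor shape, but wait and return the status. */
static int proc_sync(void)
{
	pthread_t tid;
	struct task t = { .status = -1, .detached = 0 };

	if (pthread_create(&tid, NULL, task_body, &t))
		return -1;
	pthread_join(tid, NULL);
	return t.status;
}

int main(void)
{
	printf("async start=%d, sync status=%d\n", proc_async(), proc_sync());
	pthread_exit(NULL);	/* let any detached task finish */
}

Sharing the constructor removes the duplicated argument setup the old nfs4_proc_sequence() carried, which is most of what the hunk above deletes.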
@@ -5133,32 +5163,67 @@ static const struct rpc_call_ops nfs41_sequence_ops = {
5133 .rpc_release = nfs41_sequence_release, 5163 .rpc_release = nfs41_sequence_release,
5134}; 5164};
5135 5165
5136static int nfs41_proc_async_sequence(struct nfs_client *clp, 5166static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
5137 struct rpc_cred *cred)
5138{ 5167{
5139 struct nfs4_sequence_args *args; 5168 struct nfs4_sequence_data *calldata;
5140 struct nfs4_sequence_res *res;
5141 struct rpc_message msg = { 5169 struct rpc_message msg = {
5142 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], 5170 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE],
5143 .rpc_cred = cred, 5171 .rpc_cred = cred,
5144 }; 5172 };
5173 struct rpc_task_setup task_setup_data = {
5174 .rpc_client = clp->cl_rpcclient,
5175 .rpc_message = &msg,
5176 .callback_ops = &nfs41_sequence_ops,
5177 .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT,
5178 };
5145 5179
5146 if (!atomic_inc_not_zero(&clp->cl_count)) 5180 if (!atomic_inc_not_zero(&clp->cl_count))
5147 return -EIO; 5181 return ERR_PTR(-EIO);
5148 args = kzalloc(sizeof(*args), GFP_NOFS); 5182 calldata = kmalloc(sizeof(*calldata), GFP_NOFS);
5149 res = kzalloc(sizeof(*res), GFP_NOFS); 5183 if (calldata == NULL) {
5150 if (!args || !res) {
5151 kfree(args);
5152 kfree(res);
5153 nfs_put_client(clp); 5184 nfs_put_client(clp);
5154 return -ENOMEM; 5185 return ERR_PTR(-ENOMEM);
5155 } 5186 }
5156 res->sr_slotid = NFS4_MAX_SLOT_TABLE; 5187 calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE;
5157 msg.rpc_argp = args; 5188 msg.rpc_argp = &calldata->args;
5158 msg.rpc_resp = res; 5189 msg.rpc_resp = &calldata->res;
5190 calldata->clp = clp;
5191 task_setup_data.callback_data = calldata;
5159 5192
5160 return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, 5193 return rpc_run_task(&task_setup_data);
5161 &nfs41_sequence_ops, (void *)clp); 5194}
5195
5196static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred)
5197{
5198 struct rpc_task *task;
5199 int ret = 0;
5200
5201 task = _nfs41_proc_sequence(clp, cred);
5202 if (IS_ERR(task))
5203 ret = PTR_ERR(task);
5204 else
5205 rpc_put_task(task);
5206 dprintk("<-- %s status=%d\n", __func__, ret);
5207 return ret;
5208}
5209
5210static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
5211{
5212 struct rpc_task *task;
5213 int ret;
5214
5215 task = _nfs41_proc_sequence(clp, cred);
5216 if (IS_ERR(task)) {
5217 ret = PTR_ERR(task);
5218 goto out;
5219 }
5220 ret = rpc_wait_for_completion_task(task);
5221 if (!ret)
5222 ret = task->tk_status;
5223 rpc_put_task(task);
5224out:
5225 dprintk("<-- %s status=%d\n", __func__, ret);
5226 return ret;
5162} 5227}
5163 5228
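Both entry points share _nfs41_proc_sequence() and differ only in what they do with the returned task. A hypothetical consolidation (not part of the patch) that shows the shape of the split:

	static int nfs41_sequence_call(struct nfs_client *clp,
				       struct rpc_cred *cred, bool wait)
	{
		struct rpc_task *task = _nfs41_proc_sequence(clp, cred);
		int ret = 0;

		if (IS_ERR(task))
			return PTR_ERR(task);
		if (wait) {			/* nfs4_proc_sequence() */
			ret = rpc_wait_for_completion_task(task);
			if (!ret)
				ret = task->tk_status;
		}
		rpc_put_task(task);		/* async task keeps running */
		return ret;
	}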
5164struct nfs4_reclaim_complete_data { 5229struct nfs4_reclaim_complete_data {
@@ -5172,13 +5237,31 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
5172 struct nfs4_reclaim_complete_data *calldata = data; 5237 struct nfs4_reclaim_complete_data *calldata = data;
5173 5238
5174 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); 5239 rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
5175 if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, 5240 if (nfs41_setup_sequence(calldata->clp->cl_session,
5241 &calldata->arg.seq_args,
5176 &calldata->res.seq_res, 0, task)) 5242 &calldata->res.seq_res, 0, task))
5177 return; 5243 return;
5178 5244
5179 rpc_call_start(task); 5245 rpc_call_start(task);
5180} 5246}
5181 5247
5248static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp)
5249{
5250	switch (task->tk_status) {
5251 case 0:
5252 case -NFS4ERR_COMPLETE_ALREADY:
5253 case -NFS4ERR_WRONG_CRED: /* What to do here? */
5254 break;
5255 case -NFS4ERR_DELAY:
5256 case -EKEYEXPIRED:
5257 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5258 return -EAGAIN;
5259 default:
5260 nfs4_schedule_state_recovery(clp);
5261 }
5262 return 0;
5263}
5264
5182static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) 5265static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
5183{ 5266{
5184 struct nfs4_reclaim_complete_data *calldata = data; 5267 struct nfs4_reclaim_complete_data *calldata = data;
@@ -5186,32 +5269,13 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data)
5186 struct nfs4_sequence_res *res = &calldata->res.seq_res; 5269 struct nfs4_sequence_res *res = &calldata->res.seq_res;
5187 5270
5188 dprintk("--> %s\n", __func__); 5271 dprintk("--> %s\n", __func__);
5189 nfs41_sequence_done(clp, res, task->tk_status); 5272 if (!nfs41_sequence_done(task, res))
5190 switch (task->tk_status) { 5273 return;
5191 case 0:
5192 case -NFS4ERR_COMPLETE_ALREADY:
5193 break;
5194 case -NFS4ERR_BADSESSION:
5195 case -NFS4ERR_DEADSESSION:
5196 /*
5197 * Handle the session error, but do not retry the operation, as
5198 * we have no way of telling whether the clientid had to be
5199 * reset before we got our reply. If reset, a new wave of
5200 * reclaim operations will follow, containing their own reclaim
5201 * complete. We don't want our retry to get in the way of
5202 * recovery by incorrectly indicating to the server that we're
5203 * done reclaiming state since the process had to be restarted.
5204 */
5205 _nfs4_async_handle_error(task, NULL, clp, NULL);
5206 break;
5207 default:
5208 if (_nfs4_async_handle_error(
5209 task, NULL, clp, NULL) == -EAGAIN) {
5210 rpc_restart_call_prepare(task);
5211 return;
5212 }
5213 }
5214 5274
5275 if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) {
5276 rpc_restart_call_prepare(task);
5277 return;
5278 }
5215 dprintk("<-- %s\n", __func__); 5279 dprintk("<-- %s\n", __func__);
5216} 5280}
5217 5281
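The same gate appears in every converted v4.1 done callback: nfs41_sequence_done() now takes the task and the sequence result, and a zero return tells the caller that the sequence layer still owns the call (for instance because it is being resubmitted; the exact reason is an assumption here), so the handler must return without consuming the result:

	/* recurring pattern after this change */
	if (!nfs41_sequence_done(task, res))
		return;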
@@ -5325,28 +5389,30 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
5325}; 5389};
5326#endif 5390#endif
5327 5391
5328/* 5392static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
5329 * Per minor version reboot and network partition recovery ops 5393 .minor_version = 0,
5330 */ 5394 .call_sync = _nfs4_call_sync,
5331 5395 .validate_stateid = nfs4_validate_delegation_stateid,
5332struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = { 5396 .reboot_recovery_ops = &nfs40_reboot_recovery_ops,
5333 &nfs40_reboot_recovery_ops, 5397 .nograce_recovery_ops = &nfs40_nograce_recovery_ops,
5334#if defined(CONFIG_NFS_V4_1) 5398 .state_renewal_ops = &nfs40_state_renewal_ops,
5335 &nfs41_reboot_recovery_ops,
5336#endif
5337}; 5399};
5338 5400
5339struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = {
5340 &nfs40_nograce_recovery_ops,
5341#if defined(CONFIG_NFS_V4_1) 5401#if defined(CONFIG_NFS_V4_1)
5342 &nfs41_nograce_recovery_ops, 5402static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
5343#endif 5403 .minor_version = 1,
5404 .call_sync = _nfs4_call_sync_session,
5405 .validate_stateid = nfs41_validate_delegation_stateid,
5406 .reboot_recovery_ops = &nfs41_reboot_recovery_ops,
5407 .nograce_recovery_ops = &nfs41_nograce_recovery_ops,
5408 .state_renewal_ops = &nfs41_state_renewal_ops,
5344}; 5409};
5410#endif
5345 5411
5346struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = { 5412const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
5347 &nfs40_state_renewal_ops, 5413 [0] = &nfs_v4_0_minor_ops,
5348#if defined(CONFIG_NFS_V4_1) 5414#if defined(CONFIG_NFS_V4_1)
5349 &nfs41_state_renewal_ops, 5415 [1] = &nfs_v4_1_minor_ops,
5350#endif 5416#endif
5351}; 5417};
5352 5418
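Callers stop indexing the three per-minorversion arrays and instead chase a single cl_mvops pointer, as the nfs4renewd.c hunk below illustrates:

	/* before */
	ops = nfs4_state_renewal_ops[clp->cl_minorversion];
	/* after */
	ops = clp->cl_mvops->state_renewal_ops;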
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index d87f10327b72..72b6c580af13 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -54,14 +54,14 @@
54void 54void
55nfs4_renew_state(struct work_struct *work) 55nfs4_renew_state(struct work_struct *work)
56{ 56{
57 struct nfs4_state_maintenance_ops *ops; 57 const struct nfs4_state_maintenance_ops *ops;
58 struct nfs_client *clp = 58 struct nfs_client *clp =
59 container_of(work, struct nfs_client, cl_renewd.work); 59 container_of(work, struct nfs_client, cl_renewd.work);
60 struct rpc_cred *cred; 60 struct rpc_cred *cred;
61 long lease; 61 long lease;
62 unsigned long last, now; 62 unsigned long last, now;
63 63
64 ops = nfs4_state_renewal_ops[clp->cl_minorversion]; 64 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 65 dprintk("%s: start\n", __func__);
66 /* Are there any active superblocks? */ 66 /* Are there any active superblocks? */
67 if (list_empty(&clp->cl_superblocks)) 67 if (list_empty(&clp->cl_superblocks))
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 34acf5926fdc..3e2f19b04c06 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -145,7 +145,9 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
145 struct nfs4_session *ses = clp->cl_session; 145 struct nfs4_session *ses = clp->cl_session;
146 int max_slots; 146 int max_slots;
147 147
148 if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { 148 if (ses == NULL)
149 return;
150 if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) {
149 spin_lock(&ses->fc_slot_table.slot_tbl_lock); 151 spin_lock(&ses->fc_slot_table.slot_tbl_lock);
150 max_slots = ses->fc_slot_table.max_slots; 152 max_slots = ses->fc_slot_table.max_slots;
151 while (max_slots--) { 153 while (max_slots--) {
@@ -167,7 +169,7 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
167 struct nfs4_slot_table *tbl = &ses->fc_slot_table; 169 struct nfs4_slot_table *tbl = &ses->fc_slot_table;
168 170
169 spin_lock(&tbl->slot_tbl_lock); 171 spin_lock(&tbl->slot_tbl_lock);
170 set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state); 172 set_bit(NFS4_SESSION_DRAINING, &ses->session_state);
171 if (tbl->highest_used_slotid != -1) { 173 if (tbl->highest_used_slotid != -1) {
172 INIT_COMPLETION(ses->complete); 174 INIT_COMPLETION(ses->complete);
173 spin_unlock(&tbl->slot_tbl_lock); 175 spin_unlock(&tbl->slot_tbl_lock);
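The draining flag moves from the per-client state word to the per-session one, which in turn requires a NULL check, since a client without a session (e.g. a v4.0 mount; the rationale is inferred, not stated in the patch) has nothing to drain:

	/* before: flag on the client */
	test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state);

	/* after: flag on the session, which may be absent */
	if (ses == NULL)
		return;
	test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state);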
@@ -371,7 +373,6 @@ nfs4_alloc_state_owner(void)
371 return NULL; 373 return NULL;
372 spin_lock_init(&sp->so_lock); 374 spin_lock_init(&sp->so_lock);
373 INIT_LIST_HEAD(&sp->so_states); 375 INIT_LIST_HEAD(&sp->so_states);
374 INIT_LIST_HEAD(&sp->so_delegations);
375 rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); 376 rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue");
376 sp->so_seqid.sequence = &sp->so_sequence; 377 sp->so_seqid.sequence = &sp->so_sequence;
377 spin_lock_init(&sp->so_sequence.lock); 378 spin_lock_init(&sp->so_sequence.lock);
@@ -384,7 +385,7 @@ static void
384nfs4_drop_state_owner(struct nfs4_state_owner *sp) 385nfs4_drop_state_owner(struct nfs4_state_owner *sp)
385{ 386{
386 if (!RB_EMPTY_NODE(&sp->so_client_node)) { 387 if (!RB_EMPTY_NODE(&sp->so_client_node)) {
387 struct nfs_client *clp = sp->so_client; 388 struct nfs_client *clp = sp->so_server->nfs_client;
388 389
389 spin_lock(&clp->cl_lock); 390 spin_lock(&clp->cl_lock);
390 rb_erase(&sp->so_client_node, &clp->cl_state_owners); 391 rb_erase(&sp->so_client_node, &clp->cl_state_owners);
@@ -406,7 +407,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
406 new = nfs4_alloc_state_owner(); 407 new = nfs4_alloc_state_owner();
407 if (new == NULL) 408 if (new == NULL)
408 return NULL; 409 return NULL;
409 new->so_client = clp;
410 new->so_server = server; 410 new->so_server = server;
411 new->so_cred = cred; 411 new->so_cred = cred;
412 spin_lock(&clp->cl_lock); 412 spin_lock(&clp->cl_lock);
@@ -423,7 +423,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
423 423
424void nfs4_put_state_owner(struct nfs4_state_owner *sp) 424void nfs4_put_state_owner(struct nfs4_state_owner *sp)
425{ 425{
426 struct nfs_client *clp = sp->so_client; 426 struct nfs_client *clp = sp->so_server->nfs_client;
427 struct rpc_cred *cred = sp->so_cred; 427 struct rpc_cred *cred = sp->so_cred;
428 428
429 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) 429 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
@@ -602,12 +602,21 @@ void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
602 * that is compatible with current->files 602 * that is compatible with current->files
603 */ 603 */
604static struct nfs4_lock_state * 604static struct nfs4_lock_state *
605__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) 605__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
606{ 606{
607 struct nfs4_lock_state *pos; 607 struct nfs4_lock_state *pos;
608 list_for_each_entry(pos, &state->lock_states, ls_locks) { 608 list_for_each_entry(pos, &state->lock_states, ls_locks) {
609 if (pos->ls_owner != fl_owner) 609 if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type)
610 continue; 610 continue;
611 switch (pos->ls_owner.lo_type) {
612 case NFS4_POSIX_LOCK_TYPE:
613 if (pos->ls_owner.lo_u.posix_owner != fl_owner)
614 continue;
615 break;
616 case NFS4_FLOCK_LOCK_TYPE:
617 if (pos->ls_owner.lo_u.flock_owner != fl_pid)
618 continue;
619 }
611 atomic_inc(&pos->ls_count); 620 atomic_inc(&pos->ls_count);
612 return pos; 621 return pos;
613 } 622 }
@@ -619,10 +628,10 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
619 * exists, return an uninitialized one. 628 * exists, return an uninitialized one.
620 * 629 *
621 */ 630 */
622static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) 631static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type)
623{ 632{
624 struct nfs4_lock_state *lsp; 633 struct nfs4_lock_state *lsp;
625 struct nfs_client *clp = state->owner->so_client; 634 struct nfs_client *clp = state->owner->so_server->nfs_client;
626 635
627 lsp = kzalloc(sizeof(*lsp), GFP_NOFS); 636 lsp = kzalloc(sizeof(*lsp), GFP_NOFS);
628 if (lsp == NULL) 637 if (lsp == NULL)
@@ -633,7 +642,18 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
633 lsp->ls_seqid.sequence = &lsp->ls_sequence; 642 lsp->ls_seqid.sequence = &lsp->ls_sequence;
634 atomic_set(&lsp->ls_count, 1); 643 atomic_set(&lsp->ls_count, 1);
635 lsp->ls_state = state; 644 lsp->ls_state = state;
636 lsp->ls_owner = fl_owner; 645 lsp->ls_owner.lo_type = type;
646 switch (lsp->ls_owner.lo_type) {
647 case NFS4_FLOCK_LOCK_TYPE:
648 lsp->ls_owner.lo_u.flock_owner = fl_pid;
649 break;
650 case NFS4_POSIX_LOCK_TYPE:
651 lsp->ls_owner.lo_u.posix_owner = fl_owner;
652 break;
653 default:
654 kfree(lsp);
655 return NULL;
656 }
637 spin_lock(&clp->cl_lock); 657 spin_lock(&clp->cl_lock);
638 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); 658 nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64);
639 spin_unlock(&clp->cl_lock); 659 spin_unlock(&clp->cl_lock);
@@ -643,7 +663,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
643 663
644static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) 664static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
645{ 665{
646 struct nfs_client *clp = lsp->ls_state->owner->so_client; 666 struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client;
647 667
648 spin_lock(&clp->cl_lock); 668 spin_lock(&clp->cl_lock);
649 nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); 669 nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id);
@@ -657,13 +677,13 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
657 * exists, return an uninitialized one. 677 * exists, return an uninitialized one.
658 * 678 *
659 */ 679 */
660static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) 680static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type)
661{ 681{
662 struct nfs4_lock_state *lsp, *new = NULL; 682 struct nfs4_lock_state *lsp, *new = NULL;
663 683
664 for(;;) { 684 for(;;) {
665 spin_lock(&state->state_lock); 685 spin_lock(&state->state_lock);
666 lsp = __nfs4_find_lock_state(state, owner); 686 lsp = __nfs4_find_lock_state(state, owner, pid, type);
667 if (lsp != NULL) 687 if (lsp != NULL)
668 break; 688 break;
669 if (new != NULL) { 689 if (new != NULL) {
@@ -674,7 +694,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
674 break; 694 break;
675 } 695 }
676 spin_unlock(&state->state_lock); 696 spin_unlock(&state->state_lock);
677 new = nfs4_alloc_lock_state(state, owner); 697 new = nfs4_alloc_lock_state(state, owner, pid, type);
678 if (new == NULL) 698 if (new == NULL)
679 return NULL; 699 return NULL;
680 } 700 }
@@ -701,6 +721,8 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
701 if (list_empty(&state->lock_states)) 721 if (list_empty(&state->lock_states))
702 clear_bit(LK_STATE_IN_USE, &state->flags); 722 clear_bit(LK_STATE_IN_USE, &state->flags);
703 spin_unlock(&state->state_lock); 723 spin_unlock(&state->state_lock);
724 if (lsp->ls_flags & NFS_LOCK_INITIALIZED)
725 nfs4_release_lockowner(lsp);
704 nfs4_free_lock_state(lsp); 726 nfs4_free_lock_state(lsp);
705} 727}
706 728
@@ -728,7 +750,12 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
728 750
729 if (fl->fl_ops != NULL) 751 if (fl->fl_ops != NULL)
730 return 0; 752 return 0;
731 lsp = nfs4_get_lock_state(state, fl->fl_owner); 753 if (fl->fl_flags & FL_POSIX)
754 lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
755 else if (fl->fl_flags & FL_FLOCK)
756 lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE);
757 else
758 return -EINVAL;
732 if (lsp == NULL) 759 if (lsp == NULL)
733 return -ENOMEM; 760 return -ENOMEM;
734 fl->fl_u.nfs4_fl.owner = lsp; 761 fl->fl_u.nfs4_fl.owner = lsp;
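Lock state is now keyed by a small discriminated union rather than a bare fl_owner_t, so POSIX locks (owned by a struct files pointer) and flock()-style locks (owned by a pid) can coexist on the same open state. A sketch of the owner type implied by the accesses above (it is declared elsewhere in the patch; the struct tag and layout are assumptions):

	struct nfs4_lockowner {
		unsigned int lo_type;	/* NFS4_POSIX_LOCK_TYPE or
					 * NFS4_FLOCK_LOCK_TYPE */
		union {
			fl_owner_t posix_owner;	/* matched against fl_owner */
			pid_t flock_owner;	/* matched against fl_pid */
		} lo_u;
	};

NFS4_ANY_LOCK_TYPE, used by nfs4_copy_stateid() below, skips the type check so stateid lookup matches either flavour.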
@@ -740,7 +767,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
740 * Byte-range lock aware utility to initialize the stateid of read/write 767 * Byte-range lock aware utility to initialize the stateid of read/write
741 * requests. 768 * requests.
742 */ 769 */
743void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) 770void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid)
744{ 771{
745 struct nfs4_lock_state *lsp; 772 struct nfs4_lock_state *lsp;
746 int seq; 773 int seq;
@@ -753,7 +780,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f
753 return; 780 return;
754 781
755 spin_lock(&state->state_lock); 782 spin_lock(&state->state_lock);
756 lsp = __nfs4_find_lock_state(state, fl_owner); 783 lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE);
757 if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) 784 if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
758 memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); 785 memcpy(dst, &lsp->ls_stateid, sizeof(*dst));
759 spin_unlock(&state->state_lock); 786 spin_unlock(&state->state_lock);
@@ -1041,11 +1068,11 @@ restart:
1041 case -NFS4ERR_BAD_STATEID: 1068 case -NFS4ERR_BAD_STATEID:
1042 case -NFS4ERR_RECLAIM_BAD: 1069 case -NFS4ERR_RECLAIM_BAD:
1043 case -NFS4ERR_RECLAIM_CONFLICT: 1070 case -NFS4ERR_RECLAIM_CONFLICT:
1044 nfs4_state_mark_reclaim_nograce(sp->so_client, state); 1071 nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
1045 break; 1072 break;
1046 case -NFS4ERR_EXPIRED: 1073 case -NFS4ERR_EXPIRED:
1047 case -NFS4ERR_NO_GRACE: 1074 case -NFS4ERR_NO_GRACE:
1048 nfs4_state_mark_reclaim_nograce(sp->so_client, state); 1075 nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state);
1049 case -NFS4ERR_STALE_CLIENTID: 1076 case -NFS4ERR_STALE_CLIENTID:
1050 case -NFS4ERR_BADSESSION: 1077 case -NFS4ERR_BADSESSION:
1051 case -NFS4ERR_BADSLOT: 1078 case -NFS4ERR_BADSLOT:
@@ -1120,8 +1147,7 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
1120 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) 1147 if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
1121 return; 1148 return;
1122 1149
1123 nfs4_reclaim_complete(clp, 1150 nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops);
1124 nfs4_reboot_recovery_ops[clp->cl_minorversion]);
1125 1151
1126 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { 1152 for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
1127 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); 1153 sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
@@ -1211,8 +1237,8 @@ restart:
1211static int nfs4_check_lease(struct nfs_client *clp) 1237static int nfs4_check_lease(struct nfs_client *clp)
1212{ 1238{
1213 struct rpc_cred *cred; 1239 struct rpc_cred *cred;
1214 struct nfs4_state_maintenance_ops *ops = 1240 const struct nfs4_state_maintenance_ops *ops =
1215 nfs4_state_renewal_ops[clp->cl_minorversion]; 1241 clp->cl_mvops->state_renewal_ops;
1216 int status = -NFS4ERR_EXPIRED; 1242 int status = -NFS4ERR_EXPIRED;
1217 1243
1218 /* Is the client already known to have an expired lease? */ 1244 /* Is the client already known to have an expired lease? */
@@ -1235,8 +1261,8 @@ out:
1235static int nfs4_reclaim_lease(struct nfs_client *clp) 1261static int nfs4_reclaim_lease(struct nfs_client *clp)
1236{ 1262{
1237 struct rpc_cred *cred; 1263 struct rpc_cred *cred;
1238 struct nfs4_state_recovery_ops *ops = 1264 const struct nfs4_state_recovery_ops *ops =
1239 nfs4_reboot_recovery_ops[clp->cl_minorversion]; 1265 clp->cl_mvops->reboot_recovery_ops;
1240 int status = -ENOENT; 1266 int status = -ENOENT;
1241 1267
1242 cred = ops->get_clid_cred(clp); 1268 cred = ops->get_clid_cred(clp);
@@ -1444,7 +1470,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
1444 /* First recover reboot state... */ 1470 /* First recover reboot state... */
1445 if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { 1471 if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
1446 status = nfs4_do_reclaim(clp, 1472 status = nfs4_do_reclaim(clp,
1447 nfs4_reboot_recovery_ops[clp->cl_minorversion]); 1473 clp->cl_mvops->reboot_recovery_ops);
1448 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || 1474 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
1449 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) 1475 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
1450 continue; 1476 continue;
@@ -1458,7 +1484,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
1458 /* Now recover expired state... */ 1484 /* Now recover expired state... */
1459 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { 1485 if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
1460 status = nfs4_do_reclaim(clp, 1486 status = nfs4_do_reclaim(clp,
1461 nfs4_nograce_recovery_ops[clp->cl_minorversion]); 1487 clp->cl_mvops->nograce_recovery_ops);
1462 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || 1488 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
1463 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || 1489 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
1464 test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) 1490 test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 65c8dae4b267..08ef91291132 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -202,14 +202,17 @@ static int nfs4_stat_to_errno(int);
202#define encode_link_maxsz (op_encode_hdr_maxsz + \ 202#define encode_link_maxsz (op_encode_hdr_maxsz + \
203 nfs4_name_maxsz) 203 nfs4_name_maxsz)
204#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) 204#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
205#define encode_lockowner_maxsz (7)
205#define encode_lock_maxsz (op_encode_hdr_maxsz + \ 206#define encode_lock_maxsz (op_encode_hdr_maxsz + \
206 7 + \ 207 7 + \
207 1 + encode_stateid_maxsz + 8) 208 1 + encode_stateid_maxsz + 1 + \
209 encode_lockowner_maxsz)
208#define decode_lock_denied_maxsz \ 210#define decode_lock_denied_maxsz \
209 (8 + decode_lockowner_maxsz) 211 (8 + decode_lockowner_maxsz)
210#define decode_lock_maxsz (op_decode_hdr_maxsz + \ 212#define decode_lock_maxsz (op_decode_hdr_maxsz + \
211 decode_lock_denied_maxsz) 213 decode_lock_denied_maxsz)
212#define encode_lockt_maxsz (op_encode_hdr_maxsz + 12) 214#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \
215 encode_lockowner_maxsz)
213#define decode_lockt_maxsz (op_decode_hdr_maxsz + \ 216#define decode_lockt_maxsz (op_decode_hdr_maxsz + \
214 decode_lock_denied_maxsz) 217 decode_lock_denied_maxsz)
215#define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ 218#define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \
@@ -217,6 +220,11 @@ static int nfs4_stat_to_errno(int);
217 4) 220 4)
218#define decode_locku_maxsz (op_decode_hdr_maxsz + \ 221#define decode_locku_maxsz (op_decode_hdr_maxsz + \
219 decode_stateid_maxsz) 222 decode_stateid_maxsz)
223#define encode_release_lockowner_maxsz \
224 (op_encode_hdr_maxsz + \
225 encode_lockowner_maxsz)
226#define decode_release_lockowner_maxsz \
227 (op_decode_hdr_maxsz)
220#define encode_access_maxsz (op_encode_hdr_maxsz + 1) 228#define encode_access_maxsz (op_encode_hdr_maxsz + 1)
221#define decode_access_maxsz (op_decode_hdr_maxsz + 2) 229#define decode_access_maxsz (op_decode_hdr_maxsz + 2)
222#define encode_symlink_maxsz (op_encode_hdr_maxsz + \ 230#define encode_symlink_maxsz (op_encode_hdr_maxsz + \
@@ -471,6 +479,12 @@ static int nfs4_stat_to_errno(int);
471 decode_sequence_maxsz + \ 479 decode_sequence_maxsz + \
472 decode_putfh_maxsz + \ 480 decode_putfh_maxsz + \
473 decode_locku_maxsz) 481 decode_locku_maxsz)
482#define NFS4_enc_release_lockowner_sz \
483 (compound_encode_hdr_maxsz + \
484 encode_lockowner_maxsz)
485#define NFS4_dec_release_lockowner_sz \
486 (compound_decode_hdr_maxsz + \
487 decode_lockowner_maxsz)
474#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ 488#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \
475 encode_sequence_maxsz + \ 489 encode_sequence_maxsz + \
476 encode_putfh_maxsz + \ 490 encode_putfh_maxsz + \
@@ -744,7 +758,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
744 struct compound_hdr *hdr) 758 struct compound_hdr *hdr)
745{ 759{
746 __be32 *p; 760 __be32 *p;
747 struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; 761 struct rpc_auth *auth = req->rq_cred->cr_auth;
748 762
749 /* initialize running count of expected bytes in reply. 763 /* initialize running count of expected bytes in reply.
750 * NOTE: the replied tag SHOULD be the same as the one sent, 764
@@ -1042,6 +1056,17 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
1042 return fl->fl_end - fl->fl_start + 1; 1056 return fl->fl_end - fl->fl_start + 1;
1043} 1057}
1044 1058
1059static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner)
1060{
1061 __be32 *p;
1062
1063 p = reserve_space(xdr, 28);
1064 p = xdr_encode_hyper(p, lowner->clientid);
1065 *p++ = cpu_to_be32(16);
1066 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1067 xdr_encode_hyper(p, lowner->id);
1068}
1069
1045/* 1070/*
1046 * opcode,type,reclaim,offset,length,new_lock_owner = 32 1071 * opcode,type,reclaim,offset,length,new_lock_owner = 32
1047 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 1072 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
@@ -1058,14 +1083,11 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
1058 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1083 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1059 *p = cpu_to_be32(args->new_lock_owner); 1084 *p = cpu_to_be32(args->new_lock_owner);
1060 if (args->new_lock_owner){ 1085 if (args->new_lock_owner){
1061 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); 1086 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
1062 *p++ = cpu_to_be32(args->open_seqid->sequence->counter); 1087 *p++ = cpu_to_be32(args->open_seqid->sequence->counter);
1063 p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); 1088 p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
1064 *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); 1089 *p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
1065 p = xdr_encode_hyper(p, args->lock_owner.clientid); 1090 encode_lockowner(xdr, &args->lock_owner);
1066 *p++ = cpu_to_be32(16);
1067 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1068 xdr_encode_hyper(p, args->lock_owner.id);
1069 } 1091 }
1070 else { 1092 else {
1071 p = reserve_space(xdr, NFS4_STATEID_SIZE+4); 1093 p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
@@ -1080,15 +1102,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
1080{ 1102{
1081 __be32 *p; 1103 __be32 *p;
1082 1104
1083 p = reserve_space(xdr, 52); 1105 p = reserve_space(xdr, 24);
1084 *p++ = cpu_to_be32(OP_LOCKT); 1106 *p++ = cpu_to_be32(OP_LOCKT);
1085 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); 1107 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
1086 p = xdr_encode_hyper(p, args->fl->fl_start); 1108 p = xdr_encode_hyper(p, args->fl->fl_start);
1087 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); 1109 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1088 p = xdr_encode_hyper(p, args->lock_owner.clientid); 1110 encode_lockowner(xdr, &args->lock_owner);
1089 *p++ = cpu_to_be32(16);
1090 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1091 xdr_encode_hyper(p, args->lock_owner.id);
1092 hdr->nops++; 1111 hdr->nops++;
1093 hdr->replen += decode_lockt_maxsz; 1112 hdr->replen += decode_lockt_maxsz;
1094} 1113}
@@ -1108,6 +1127,17 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
1108 hdr->replen += decode_locku_maxsz; 1127 hdr->replen += decode_locku_maxsz;
1109} 1128}
1110 1129
1130static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr)
1131{
1132 __be32 *p;
1133
1134 p = reserve_space(xdr, 4);
1135 *p = cpu_to_be32(OP_RELEASE_LOCKOWNER);
1136 encode_lockowner(xdr, lowner);
1137 hdr->nops++;
1138 hdr->replen += decode_release_lockowner_maxsz;
1139}
1140
1111static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) 1141static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1112{ 1142{
1113 int len = name->len; 1143 int len = name->len;
@@ -1172,7 +1202,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
1172 break; 1202 break;
1173 default: 1203 default:
1174 clp = arg->server->nfs_client; 1204 clp = arg->server->nfs_client;
1175 if (clp->cl_minorversion > 0) { 1205 if (clp->cl_mvops->minor_version > 0) {
1176 if (nfs4_has_persistent_session(clp)) { 1206 if (nfs4_has_persistent_session(clp)) {
1177 *p = cpu_to_be32(NFS4_CREATE_GUARDED); 1207 *p = cpu_to_be32(NFS4_CREATE_GUARDED);
1178 encode_attrs(xdr, arg->u.attrs, arg->server); 1208 encode_attrs(xdr, arg->u.attrs, arg->server);
@@ -1324,14 +1354,14 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1324 hdr->replen += decode_putrootfh_maxsz; 1354 hdr->replen += decode_putrootfh_maxsz;
1325} 1355}
1326 1356
1327static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) 1357static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx)
1328{ 1358{
1329 nfs4_stateid stateid; 1359 nfs4_stateid stateid;
1330 __be32 *p; 1360 __be32 *p;
1331 1361
1332 p = reserve_space(xdr, NFS4_STATEID_SIZE); 1362 p = reserve_space(xdr, NFS4_STATEID_SIZE);
1333 if (ctx->state != NULL) { 1363 if (ctx->state != NULL) {
1334 nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); 1364 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
1335 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); 1365 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
1336 } else 1366 } else
1337 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 1367 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1344,7 +1374,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1344 p = reserve_space(xdr, 4); 1374 p = reserve_space(xdr, 4);
1345 *p = cpu_to_be32(OP_READ); 1375 *p = cpu_to_be32(OP_READ);
1346 1376
1347 encode_stateid(xdr, args->context); 1377 encode_stateid(xdr, args->context, args->lock_context);
1348 1378
1349 p = reserve_space(xdr, 12); 1379 p = reserve_space(xdr, 12);
1350 p = xdr_encode_hyper(p, args->offset); 1380 p = xdr_encode_hyper(p, args->offset);
@@ -1523,7 +1553,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1523 p = reserve_space(xdr, 4); 1553 p = reserve_space(xdr, 4);
1524 *p = cpu_to_be32(OP_WRITE); 1554 *p = cpu_to_be32(OP_WRITE);
1525 1555
1526 encode_stateid(xdr, args->context); 1556 encode_stateid(xdr, args->context, args->lock_context);
1527 1557
1528 p = reserve_space(xdr, 16); 1558 p = reserve_space(xdr, 16);
1529 p = xdr_encode_hyper(p, args->offset); 1559 p = xdr_encode_hyper(p, args->offset);
@@ -1704,7 +1734,7 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args)
1704{ 1734{
1705#if defined(CONFIG_NFS_V4_1) 1735#if defined(CONFIG_NFS_V4_1)
1706 if (args->sa_session) 1736 if (args->sa_session)
1707 return args->sa_session->clp->cl_minorversion; 1737 return args->sa_session->clp->cl_mvops->minor_version;
1708#endif /* CONFIG_NFS_V4_1 */ 1738#endif /* CONFIG_NFS_V4_1 */
1709 return 0; 1739 return 0;
1710} 1740}
@@ -2048,6 +2078,20 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
2048 return 0; 2078 return 0;
2049} 2079}
2050 2080
2081static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args)
2082{
2083 struct xdr_stream xdr;
2084 struct compound_hdr hdr = {
2085 .minorversion = 0,
2086 };
2087
2088 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
2089 encode_compound_hdr(&xdr, req, &hdr);
2090 encode_release_lockowner(&xdr, &args->lock_owner, &hdr);
2091 encode_nops(&hdr);
2092 return 0;
2093}
2094
2051/* 2095/*
2052 * Encode a READLINK request 2096 * Encode a READLINK request
2053 */ 2097 */
@@ -2395,7 +2439,7 @@ static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p,
2395{ 2439{
2396 struct xdr_stream xdr; 2440 struct xdr_stream xdr;
2397 struct compound_hdr hdr = { 2441 struct compound_hdr hdr = {
2398 .minorversion = args->client->cl_minorversion, 2442 .minorversion = args->client->cl_mvops->minor_version,
2399 }; 2443 };
2400 2444
2401 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2445 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -2413,7 +2457,7 @@ static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p,
2413{ 2457{
2414 struct xdr_stream xdr; 2458 struct xdr_stream xdr;
2415 struct compound_hdr hdr = { 2459 struct compound_hdr hdr = {
2416 .minorversion = args->client->cl_minorversion, 2460 .minorversion = args->client->cl_mvops->minor_version,
2417 }; 2461 };
2418 2462
2419 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2463 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -2431,7 +2475,7 @@ static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p,
2431{ 2475{
2432 struct xdr_stream xdr; 2476 struct xdr_stream xdr;
2433 struct compound_hdr hdr = { 2477 struct compound_hdr hdr = {
2434 .minorversion = session->clp->cl_minorversion, 2478 .minorversion = session->clp->cl_mvops->minor_version,
2435 }; 2479 };
2436 2480
2437 xdr_init_encode(&xdr, &req->rq_snd_buf, p); 2481 xdr_init_encode(&xdr, &req->rq_snd_buf, p);
@@ -3973,6 +4017,11 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
3973 return status; 4017 return status;
3974} 4018}
3975 4019
4020static int decode_release_lockowner(struct xdr_stream *xdr)
4021{
4022 return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER);
4023}
4024
3976static int decode_lookup(struct xdr_stream *xdr) 4025static int decode_lookup(struct xdr_stream *xdr)
3977{ 4026{
3978 return decode_op_hdr(xdr, OP_LOOKUP); 4027 return decode_op_hdr(xdr, OP_LOOKUP);
@@ -5259,6 +5308,19 @@ out:
5259 return status; 5308 return status;
5260} 5309}
5261 5310
5311static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy)
5312{
5313 struct xdr_stream xdr;
5314 struct compound_hdr hdr;
5315 int status;
5316
5317 xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
5318 status = decode_compound_hdr(&xdr, &hdr);
5319 if (!status)
5320 status = decode_release_lockowner(&xdr);
5321 return status;
5322}
5323
5262/* 5324/*
5263 * Decode READLINK response 5325 * Decode READLINK response
5264 */ 5326 */
@@ -5866,6 +5928,7 @@ struct rpc_procinfo nfs4_procedures[] = {
5866 PROC(GETACL, enc_getacl, dec_getacl), 5928 PROC(GETACL, enc_getacl, dec_getacl),
5867 PROC(SETACL, enc_setacl, dec_setacl), 5929 PROC(SETACL, enc_setacl, dec_setacl),
5868 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), 5930 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
5931 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
5869#if defined(CONFIG_NFS_V4_1) 5932#if defined(CONFIG_NFS_V4_1)
5870 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), 5933 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
5871 PROC(CREATE_SESSION, enc_create_session, dec_create_session), 5934 PROC(CREATE_SESSION, enc_create_session, dec_create_session),
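Taken together with the nfs4state.c hunk that calls nfs4_release_lockowner() from nfs4_put_lock_state(), the client can now tell the server to discard a lockowner once its last lock is gone. The request is a minorversion-0 compound carrying the single new op:

	/*
	 * COMPOUND {
	 *	RELEASE_LOCKOWNER(lock_owner4)	- v4.0 only; no SEQUENCE,
	 *					  consistent with the
	 *					  .minorversion = 0 above
	 * }
	 */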
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a3654e57b589..919490232e17 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -79,6 +79,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
79 req->wb_pgbase = offset; 79 req->wb_pgbase = offset;
80 req->wb_bytes = count; 80 req->wb_bytes = count;
81 req->wb_context = get_nfs_open_context(ctx); 81 req->wb_context = get_nfs_open_context(ctx);
82 req->wb_lock_context = nfs_get_lock_context(ctx);
82 kref_init(&req->wb_kref); 83 kref_init(&req->wb_kref);
83 return req; 84 return req;
84} 85}
@@ -141,11 +142,16 @@ void nfs_clear_request(struct nfs_page *req)
141{ 142{
142 struct page *page = req->wb_page; 143 struct page *page = req->wb_page;
143 struct nfs_open_context *ctx = req->wb_context; 144 struct nfs_open_context *ctx = req->wb_context;
145 struct nfs_lock_context *l_ctx = req->wb_lock_context;
144 146
145 if (page != NULL) { 147 if (page != NULL) {
146 page_cache_release(page); 148 page_cache_release(page);
147 req->wb_page = NULL; 149 req->wb_page = NULL;
148 } 150 }
151 if (l_ctx != NULL) {
152 nfs_put_lock_context(l_ctx);
153 req->wb_lock_context = NULL;
154 }
149 if (ctx != NULL) { 155 if (ctx != NULL) {
150 put_nfs_open_context(ctx); 156 put_nfs_open_context(ctx);
151 req->wb_context = NULL; 157 req->wb_context = NULL;
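Each nfs_page now pins a lock context for its lifetime; the put here balances the get in nfs_create_request() above, and the I/O setup paths below pass the pointer through without taking an extra reference:

	/* Pairing introduced by this series (sketch):
	 *   nfs_create_request():      req->wb_lock_context = nfs_get_lock_context(ctx);
	 *   nfs_read/write_rpcsetup(): data->args.lock_context = req->wb_lock_context;
	 *   nfs_clear_request():       nfs_put_lock_context(req->wb_lock_context);
	 */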
@@ -235,7 +241,7 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
235{ 241{
236 if (req->wb_context->cred != prev->wb_context->cred) 242 if (req->wb_context->cred != prev->wb_context->cred)
237 return 0; 243 return 0;
238 if (req->wb_context->lockowner != prev->wb_context->lockowner) 244 if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
239 return 0; 245 return 0;
240 if (req->wb_context->state != prev->wb_context->state) 246 if (req->wb_context->state != prev->wb_context->state)
241 return 0; 247 return 0;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 6e2b06e6ca79..87adc2744246 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -190,6 +190,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
190 data->args.pages = data->pagevec; 190 data->args.pages = data->pagevec;
191 data->args.count = count; 191 data->args.count = count;
192 data->args.context = get_nfs_open_context(req->wb_context); 192 data->args.context = get_nfs_open_context(req->wb_context);
193 data->args.lock_context = req->wb_lock_context;
193 194
194 data->res.fattr = &data->fattr; 195 data->res.fattr = &data->fattr;
195 data->res.count = count; 196 data->res.count = count;
@@ -410,7 +411,7 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata)
410{ 411{
411 struct nfs_read_data *data = calldata; 412 struct nfs_read_data *data = calldata;
412 413
413 if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, 414 if (nfs4_setup_sequence(NFS_SERVER(data->inode),
414 &data->args.seq_args, &data->res.seq_res, 415 &data->args.seq_args, &data->res.seq_res,
415 0, task)) 416 0, task))
416 return; 417 return;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f9df16de4a56..f1ae39f6cb02 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -546,6 +546,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss,
546{ 546{
547 struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address; 547 struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address;
548 548
549 if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE)
550 return;
551
549 switch (sap->sa_family) { 552 switch (sap->sa_family) {
550 case AF_INET: { 553 case AF_INET: {
551 struct sockaddr_in *sin = (struct sockaddr_in *)sap; 554 struct sockaddr_in *sin = (struct sockaddr_in *)sap;
@@ -1780,6 +1783,7 @@ static int nfs_validate_mount_data(void *options,
1780 * can deal with. 1783 * can deal with.
1781 */ 1784 */
1782 args->flags = data->flags & NFS_MOUNT_FLAGMASK; 1785 args->flags = data->flags & NFS_MOUNT_FLAGMASK;
1786 args->flags |= NFS_MOUNT_LEGACY_INTERFACE;
1783 args->rsize = data->rsize; 1787 args->rsize = data->rsize;
1784 args->wsize = data->wsize; 1788 args->wsize = data->wsize;
1785 args->timeo = data->timeo; 1789 args->timeo = data->timeo;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index a2242af6a17d..2f84adaad427 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
110 struct nfs_unlinkdata *data = calldata; 110 struct nfs_unlinkdata *data = calldata;
111 struct nfs_server *server = NFS_SERVER(data->dir); 111 struct nfs_server *server = NFS_SERVER(data->dir);
112 112
113 if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, 113 if (nfs4_setup_sequence(server, &data->args.seq_args,
114 &data->res.seq_res, 1, task)) 114 &data->res.seq_res, 1, task))
115 return; 115 return;
116 rpc_call_start(task); 116 rpc_call_start(task);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9f81bdd91c55..874972d9427c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -700,7 +700,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
700 req = nfs_page_find_request(page); 700 req = nfs_page_find_request(page);
701 if (req == NULL) 701 if (req == NULL)
702 return 0; 702 return 0;
703 do_flush = req->wb_page != page || req->wb_context != ctx; 703 do_flush = req->wb_page != page || req->wb_context != ctx ||
704 req->wb_lock_context->lockowner != current->files ||
705 req->wb_lock_context->pid != current->tgid;
704 nfs_release_request(req); 706 nfs_release_request(req);
705 if (!do_flush) 707 if (!do_flush)
706 return 0; 708 return 0;
@@ -824,6 +826,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
824 data->args.pages = data->pagevec; 826 data->args.pages = data->pagevec;
825 data->args.count = count; 827 data->args.count = count;
826 data->args.context = get_nfs_open_context(req->wb_context); 828 data->args.context = get_nfs_open_context(req->wb_context);
829 data->args.lock_context = req->wb_lock_context;
827 data->args.stable = NFS_UNSTABLE; 830 data->args.stable = NFS_UNSTABLE;
828 if (how & FLUSH_STABLE) { 831 if (how & FLUSH_STABLE) {
829 data->args.stable = NFS_DATA_SYNC; 832 data->args.stable = NFS_DATA_SYNC;
@@ -1047,9 +1050,9 @@ out:
1047void nfs_write_prepare(struct rpc_task *task, void *calldata) 1050void nfs_write_prepare(struct rpc_task *task, void *calldata)
1048{ 1051{
1049 struct nfs_write_data *data = calldata; 1052 struct nfs_write_data *data = calldata;
1050 struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client;
1051 1053
1052 if (nfs4_setup_sequence(clp, &data->args.seq_args, 1054 if (nfs4_setup_sequence(NFS_SERVER(data->inode),
1055 &data->args.seq_args,
1053 &data->res.seq_res, 1, task)) 1056 &data->res.seq_res, 1, task))
1054 return; 1057 return;
1055 rpc_call_start(task); 1058 rpc_call_start(task);
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 3d68f45a37b9..5b7e3021e06b 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -168,7 +168,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
168 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); 168 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
169 169
170 fh_copy(&resp->fh, &argp->fh); 170 fh_copy(&resp->fh, &argp->fh);
171 nfserr = nfsd_read(rqstp, &resp->fh, NULL, 171 nfserr = nfsd_read(rqstp, &resp->fh,
172 argp->offset, 172 argp->offset,
173 rqstp->rq_vec, argp->vlen, 173 rqstp->rq_vec, argp->vlen,
174 &resp->count); 174 &resp->count);
@@ -271,7 +271,7 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
271 fh_init(&resp->fh, NFS3_FHSIZE); 271 fh_init(&resp->fh, NFS3_FHSIZE);
272 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, 272 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
273 &argp->attrs, S_IFDIR, 0, &resp->fh); 273 &argp->attrs, S_IFDIR, 0, &resp->fh);
274 274 fh_unlock(&resp->dirfh);
275 RETURN_STATUS(nfserr); 275 RETURN_STATUS(nfserr);
276} 276}
277 277
@@ -327,7 +327,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp,
327 type = nfs3_ftypes[argp->ftype]; 327 type = nfs3_ftypes[argp->ftype];
328 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, 328 nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
329 &argp->attrs, type, rdev, &resp->fh); 329 &argp->attrs, type, rdev, &resp->fh);
330 330 fh_unlock(&resp->dirfh);
331 RETURN_STATUS(nfserr); 331 RETURN_STATUS(nfserr);
332} 332}
333 333
@@ -348,6 +348,7 @@ nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
348 /* Unlink. -S_IFDIR means file must not be a directory */ 348 /* Unlink. -S_IFDIR means file must not be a directory */
349 fh_copy(&resp->fh, &argp->fh); 349 fh_copy(&resp->fh, &argp->fh);
350 nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len); 350 nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len);
351 fh_unlock(&resp->fh);
351 RETURN_STATUS(nfserr); 352 RETURN_STATUS(nfserr);
352} 353}
353 354
@@ -367,6 +368,7 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp,
367 368
368 fh_copy(&resp->fh, &argp->fh); 369 fh_copy(&resp->fh, &argp->fh);
369 nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len); 370 nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len);
371 fh_unlock(&resp->fh);
370 RETURN_STATUS(nfserr); 372 RETURN_STATUS(nfserr);
371} 373}
372 374
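All four additions in this file follow one pattern: the create/unlink helpers can return with the parent directory filehandle still locked, and the procedure now drops that lock before encoding the reply (the locking contract is inferred from the calls; this diff does not spell it out):

	/*
	 *	nfserr = nfsd_create(rqstp, &resp->dirfh, ...);
	 *	fh_unlock(&resp->dirfh);	- release the parent lock
	 *	RETURN_STATUS(nfserr);
	 */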
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index eb78e7e22077..988cbb3a19b6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -143,8 +143,6 @@ struct nfs4_cb_compound_hdr {
143 u32 minorversion; 143 u32 minorversion;
144 /* res */ 144 /* res */
145 int status; 145 int status;
146 u32 taglen;
147 char *tag;
148}; 146};
149 147
150static struct { 148static struct {
@@ -205,6 +203,16 @@ nfs_cb_stat_to_errno(int stat)
205 */ 203 */
206 204
207static void 205static void
206encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
207{
208 __be32 *p;
209
210 RESERVE_SPACE(sizeof(stateid_t));
211 WRITE32(sid->si_generation);
212 WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
213}
214
215static void
208encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) 216encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr)
209{ 217{
210 __be32 * p; 218 __be32 * p;
@@ -229,10 +237,10 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp,
229 __be32 *p; 237 __be32 *p;
230 int len = dp->dl_fh.fh_size; 238 int len = dp->dl_fh.fh_size;
231 239
232 RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); 240 RESERVE_SPACE(4);
233 WRITE32(OP_CB_RECALL); 241 WRITE32(OP_CB_RECALL);
234 WRITE32(dp->dl_stateid.si_generation); 242 encode_stateid(xdr, &dp->dl_stateid);
235 WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); 243 RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2));
236 WRITE32(0); /* truncate optimization not implemented */ 244 WRITE32(0); /* truncate optimization not implemented */
237 WRITE32(len); 245 WRITE32(len);
238 WRITEMEM(&dp->dl_fh.fh_base, len); 246 WRITEMEM(&dp->dl_fh.fh_base, len);
@@ -293,13 +301,14 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p,
293static int 301static int
294decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ 302decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){
295 __be32 *p; 303 __be32 *p;
304 u32 taglen;
296 305
297 READ_BUF(8); 306 READ_BUF(8);
298 READ32(hdr->status); 307 READ32(hdr->status);
299 READ32(hdr->taglen); 308 /* We've got no use for the tag; ignore it: */
300 READ_BUF(hdr->taglen + 4); 309 READ32(taglen);
301 hdr->tag = (char *)p; 310 READ_BUF(taglen + 4);
302 p += XDR_QUADLEN(hdr->taglen); 311 p += XDR_QUADLEN(taglen);
303 READ32(hdr->nops); 312 READ32(hdr->nops);
304 return 0; 313 return 0;
305} 314}
@@ -667,28 +676,28 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
667 } 676 }
668 677
669 switch (task->tk_status) { 678 switch (task->tk_status) {
670 case -EIO: 679 case 0:
680 return;
681 case -EBADHANDLE:
682 case -NFS4ERR_BAD_STATEID:
683 /* Race: client probably got cb_recall
684 * before open reply granting delegation */
685 break;
686 default:
671 /* Network partition? */ 687 /* Network partition? */
672 atomic_set(&clp->cl_cb_set, 0); 688 atomic_set(&clp->cl_cb_set, 0);
673 warn_no_callback_path(clp, task->tk_status); 689 warn_no_callback_path(clp, task->tk_status);
674 if (current_rpc_client != task->tk_client) { 690 if (current_rpc_client != task->tk_client) {
675 /* queue a callback on the new connection: */ 691 /* queue a callback on the new connection: */
692 atomic_inc(&dp->dl_count);
676 nfsd4_cb_recall(dp); 693 nfsd4_cb_recall(dp);
677 return; 694 return;
678 } 695 }
679 case -EBADHANDLE:
680 case -NFS4ERR_BAD_STATEID:
681 /* Race: client probably got cb_recall
682 * before open reply granting delegation */
683 break;
684 default:
685 /* success, or error we can't handle */
686 return;
687 } 696 }
688 if (dp->dl_retries--) { 697 if (dp->dl_retries--) {
689 rpc_delay(task, 2*HZ); 698 rpc_delay(task, 2*HZ);
690 task->tk_status = 0; 699 task->tk_status = 0;
691 rpc_restart_call(task); 700 rpc_restart_call_prepare(task);
692 return; 701 return;
693 } else { 702 } else {
694 atomic_set(&clp->cl_cb_set, 0); 703 atomic_set(&clp->cl_cb_set, 0);
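The reordered switch now reads top-down: success returns at once; a bad handle or stateid is the benign race called out in the comment and breaks to the retry logic; anything else is treated as a dead callback path, marked down, and either requeued on a newer rpc client (the added atomic_inc(&dp->dl_count) keeps the delegation alive across that requeue) or left to fall through to the same retry logic. Schematically:

	switch (task->tk_status) {
	case 0:
		return;				/* nothing to do */
	case -EBADHANDLE:
	case -NFS4ERR_BAD_STATEID:
		break;				/* benign race: retry below */
	default:
		atomic_set(&clp->cl_cb_set, 0);	/* callback path is down */
		/* requeue on a new connection, or fall through to retry */
	}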
@@ -752,18 +761,16 @@ static void _nfsd4_cb_recall(struct nfs4_delegation *dp)
752 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], 761 .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL],
753 .rpc_cred = callback_cred 762 .rpc_cred = callback_cred
754 }; 763 };
755 int status;
756 764
757 if (clnt == NULL) 765 if (clnt == NULL) {
766 nfs4_put_delegation(dp);
758 return; /* Client is shutting down; give up. */ 767 return; /* Client is shutting down; give up. */
768 }
759 769
760 args->args_op = dp; 770 args->args_op = dp;
761 msg.rpc_argp = args; 771 msg.rpc_argp = args;
762 dp->dl_retries = 1; 772 dp->dl_retries = 1;
763 status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, 773 rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp);
764 &nfsd4_cb_recall_ops, dp);
765 if (status)
766 nfs4_put_delegation(dp);
767} 774}
768 775
769void nfsd4_do_callback_rpc(struct work_struct *w) 776void nfsd4_do_callback_rpc(struct work_struct *w)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4a2734758778..2e7357104cfd 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -51,7 +51,6 @@ static time_t boot_time;
51static u32 current_ownerid = 1; 51static u32 current_ownerid = 1;
52static u32 current_fileid = 1; 52static u32 current_fileid = 1;
53static u32 current_delegid = 1; 53static u32 current_delegid = 1;
54static u32 nfs4_init;
55static stateid_t zerostateid; /* bits all 0 */ 54static stateid_t zerostateid; /* bits all 0 */
56static stateid_t onestateid; /* bits all 1 */ 55static stateid_t onestateid; /* bits all 1 */
57static u64 current_sessionid = 1; 56static u64 current_sessionid = 1;
@@ -163,6 +162,46 @@ static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
163static struct list_head file_hashtbl[FILE_HASH_SIZE]; 162static struct list_head file_hashtbl[FILE_HASH_SIZE];
164static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; 163static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
165 164
165static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
166{
167 BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
168 atomic_inc(&fp->fi_access[oflag]);
169}
170
171static void nfs4_file_get_access(struct nfs4_file *fp, int oflag)
172{
173 if (oflag == O_RDWR) {
174 __nfs4_file_get_access(fp, O_RDONLY);
175 __nfs4_file_get_access(fp, O_WRONLY);
176 } else
177 __nfs4_file_get_access(fp, oflag);
178}
179
180static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag)
181{
182 if (fp->fi_fds[oflag]) {
183 fput(fp->fi_fds[oflag]);
184 fp->fi_fds[oflag] = NULL;
185 }
186}
187
188static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
189{
190 if (atomic_dec_and_test(&fp->fi_access[oflag])) {
191 nfs4_file_put_fd(fp, O_RDWR);
192 nfs4_file_put_fd(fp, oflag);
193 }
194}
195
196static void nfs4_file_put_access(struct nfs4_file *fp, int oflag)
197{
198 if (oflag == O_RDWR) {
199 __nfs4_file_put_access(fp, O_RDONLY);
200 __nfs4_file_put_access(fp, O_WRONLY);
201 } else
202 __nfs4_file_put_access(fp, oflag);
203}
204
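These helpers refcount read and write access to an nfs4_file separately; O_RDWR is bookkept as one reference on each counter, and dropping the last reference for a mode also releases the cached O_RDWR struct file. A hypothetical user:

	nfs4_file_get_access(fp, O_RDWR);	/* bumps fi_access[O_RDONLY]
						 * and fi_access[O_WRONLY] */
	nfs4_file_put_access(fp, O_RDWR);	/* drops both; a counter
						 * reaching zero fput()s the
						 * relevant fi_fds[] entries */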
166static struct nfs4_delegation * 205static struct nfs4_delegation *
167alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) 206alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
168{ 207{
@@ -171,6 +210,13 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
171 struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn; 210 struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn;
172 211
173 dprintk("NFSD alloc_init_deleg\n"); 212 dprintk("NFSD alloc_init_deleg\n");
213 /*
214 * Major work on the lease subsystem (for example, to support
215 * callbacks on stat) will be required before we can support
216 * write delegations properly.
217 */
218 if (type != NFS4_OPEN_DELEGATE_READ)
219 return NULL;
174 if (fp->fi_had_conflict) 220 if (fp->fi_had_conflict)
175 return NULL; 221 return NULL;
176 if (num_delegations > max_delegations) 222 if (num_delegations > max_delegations)
@@ -185,9 +231,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
185 dp->dl_client = clp; 231 dp->dl_client = clp;
186 get_nfs4_file(fp); 232 get_nfs4_file(fp);
187 dp->dl_file = fp; 233 dp->dl_file = fp;
234 nfs4_file_get_access(fp, O_RDONLY);
188 dp->dl_flock = NULL; 235 dp->dl_flock = NULL;
189 get_file(stp->st_vfs_file);
190 dp->dl_vfs_file = stp->st_vfs_file;
191 dp->dl_type = type; 236 dp->dl_type = type;
192 dp->dl_ident = cb->cb_ident; 237 dp->dl_ident = cb->cb_ident;
193 dp->dl_stateid.si_boot = boot_time; 238 dp->dl_stateid.si_boot = boot_time;
@@ -222,15 +267,12 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
222static void 267static void
223nfs4_close_delegation(struct nfs4_delegation *dp) 268nfs4_close_delegation(struct nfs4_delegation *dp)
224{ 269{
225 struct file *filp = dp->dl_vfs_file; 270 struct file *filp = find_readable_file(dp->dl_file);
226 271
227 dprintk("NFSD: close_delegation dp %p\n",dp); 272 dprintk("NFSD: close_delegation dp %p\n",dp);
228 dp->dl_vfs_file = NULL;
229 /* The following nfsd_close may not actually close the file,
230 * but we want to remove the lease in any case. */
231 if (dp->dl_flock) 273 if (dp->dl_flock)
232 vfs_setlease(filp, F_UNLCK, &dp->dl_flock); 274 vfs_setlease(filp, F_UNLCK, &dp->dl_flock);
233 nfsd_close(filp); 275 nfs4_file_put_access(dp->dl_file, O_RDONLY);
234} 276}
235 277
236/* Called under the state lock. */ 278/* Called under the state lock. */
@@ -302,8 +344,12 @@ static void free_generic_stateid(struct nfs4_stateid *stp)
302 344
303static void release_lock_stateid(struct nfs4_stateid *stp) 345static void release_lock_stateid(struct nfs4_stateid *stp)
304{ 346{
347 struct file *file;
348
305 unhash_generic_stateid(stp); 349 unhash_generic_stateid(stp);
306 locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner); 350 file = find_any_file(stp->st_file);
351 if (file)
352 locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
307 free_generic_stateid(stp); 353 free_generic_stateid(stp);
308} 354}
309 355
@@ -341,11 +387,85 @@ release_stateid_lockowners(struct nfs4_stateid *open_stp)
341 } 387 }
342} 388}
343 389
390/*
391 * We store the NONE, READ, WRITE, and BOTH bits separately in the
392 * st_{access,deny}_bmap field of the stateid, in order to track not
393 * only what share bits are currently in force, but also what
394 * combinations of share bits previous opens have used. This allows us
395 * to enforce the recommendation of rfc 3530 14.2.19 that the server
396 * return an error if the client attempt to downgrade to a combination
397 * of share bits not explicable by closing some of its previous opens.
398 *
399 * XXX: This enforcement is actually incomplete, since we don't keep
400 * track of access/deny bit combinations; so, e.g., we allow:
401 *
402 * OPEN allow read, deny write
403 * OPEN allow both, deny none
404 * DOWNGRADE allow read, deny none
405 *
406 * which we should reject.
407 */
408static void
409set_access(unsigned int *access, unsigned long bmap) {
410 int i;
411
412 *access = 0;
413 for (i = 1; i < 4; i++) {
414 if (test_bit(i, &bmap))
415 *access |= i;
416 }
417}
418
419static void
420set_deny(unsigned int *deny, unsigned long bmap) {
421 int i;
422
423 *deny = 0;
424 for (i = 0; i < 4; i++) {
425 if (test_bit(i, &bmap))
426 *deny |= i;
427 }
428}
429
430static int
431test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
432 unsigned int access, deny;
433
434 set_access(&access, stp->st_access_bmap);
435 set_deny(&deny, stp->st_deny_bmap);
436 if ((access & open->op_share_deny) || (deny & open->op_share_access))
437 return 0;
438 return 1;
439}
440
441static int nfs4_access_to_omode(u32 access)
442{
443 switch (access) {
444 case NFS4_SHARE_ACCESS_READ:
445 return O_RDONLY;
446 case NFS4_SHARE_ACCESS_WRITE:
447 return O_WRONLY;
448 case NFS4_SHARE_ACCESS_BOTH:
449 return O_RDWR;
450 }
451 BUG();
452}
453
454static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp)
455{
456 unsigned int access;
457
458 set_access(&access, stp->st_access_bmap);
459 return nfs4_access_to_omode(access);
460}
461
344static void release_open_stateid(struct nfs4_stateid *stp) 462static void release_open_stateid(struct nfs4_stateid *stp)
345{ 463{
464 int oflag = nfs4_access_bmap_to_omode(stp);
465
346 unhash_generic_stateid(stp); 466 unhash_generic_stateid(stp);
347 release_stateid_lockowners(stp); 467 release_stateid_lockowners(stp);
348 nfsd_close(stp->st_vfs_file); 468 nfs4_file_put_access(stp->st_file, oflag);
349 free_generic_stateid(stp); 469 free_generic_stateid(stp);
350} 470}
351 471
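The set_access()/set_deny() helpers moved above rely on the RFC 3530 numbering of the share constants: NFS4_SHARE_ACCESS_READ, _WRITE and _BOTH are 1, 2 and 3, so each OPEN sets the bmap bit whose index equals its access value, and OR-ing the indices of the set bits back together recovers the union of modes currently in force. A minimal standalone sketch of that round trip (illustration only, not part of the patch):

#include <assert.h>
#include <stdio.h>

#define NFS4_SHARE_ACCESS_READ  1
#define NFS4_SHARE_ACCESS_WRITE 2
#define NFS4_SHARE_ACCESS_BOTH  3

/* Same loop as set_access() above: the bit index doubles as the
 * access value, so OR-ing the set indices yields the union. */
static unsigned int bmap_to_access(unsigned long bmap)
{
	unsigned int access = 0;
	int i;

	for (i = 1; i < 4; i++)
		if (bmap & (1UL << i))
			access |= i;
	return access;
}

int main(void)
{
	unsigned long bmap = 0;

	bmap |= 1UL << NFS4_SHARE_ACCESS_READ;	/* first OPEN: read */
	assert(bmap_to_access(bmap) == NFS4_SHARE_ACCESS_READ);

	bmap |= 1UL << NFS4_SHARE_ACCESS_BOTH;	/* second OPEN: read/write */
	assert(bmap_to_access(bmap) == NFS4_SHARE_ACCESS_BOTH);

	printf("access modes in force: %u\n", bmap_to_access(bmap));
	return 0;
}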
@@ -457,7 +577,7 @@ static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
457 spin_unlock(&nfsd_drc_lock); 577 spin_unlock(&nfsd_drc_lock);
458 578
459 if (fchan->maxreqs == 0) 579 if (fchan->maxreqs == 0)
460 return nfserr_serverfault; 580 return nfserr_jukebox;
461 581
462 fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ; 582 fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ;
463 return 0; 583 return 0;
@@ -542,7 +662,7 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
542 BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot) 662 BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot)
543 + sizeof(struct nfsd4_session) > PAGE_SIZE); 663 + sizeof(struct nfsd4_session) > PAGE_SIZE);
544 664
545 status = nfserr_serverfault; 665 status = nfserr_jukebox;
546 /* allocate struct nfsd4_session and slot table pointers in one piece */ 666 /* allocate struct nfsd4_session and slot table pointers in one piece */
547 slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *); 667 slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *);
548 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); 668 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
@@ -591,10 +711,8 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
591 711
592 dump_sessionid(__func__, sessionid); 712 dump_sessionid(__func__, sessionid);
593 idx = hash_sessionid(sessionid); 713 idx = hash_sessionid(sessionid);
594 dprintk("%s: idx is %d\n", __func__, idx);
595 /* Search in the appropriate list */ 714 /* Search in the appropriate list */
596 list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) { 715 list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
597 dump_sessionid("list traversal", &elem->se_sessionid);
598 if (!memcmp(elem->se_sessionid.data, sessionid->data, 716 if (!memcmp(elem->se_sessionid.data, sessionid->data,
599 NFS4_MAX_SESSIONID_LEN)) { 717 NFS4_MAX_SESSIONID_LEN)) {
600 return elem; 718 return elem;
@@ -714,7 +832,6 @@ release_session_client(struct nfsd4_session *session)
714 } else 832 } else
715 renew_client_locked(clp); 833 renew_client_locked(clp);
716 spin_unlock(&client_lock); 834 spin_unlock(&client_lock);
717 nfsd4_put_session(session);
718} 835}
719 836
720/* must be called under the client_lock */ 837/* must be called under the client_lock */
@@ -1220,7 +1337,7 @@ out_new:
1220 /* Normal case */ 1337 /* Normal case */
1221 new = create_client(exid->clname, dname, rqstp, &verf); 1338 new = create_client(exid->clname, dname, rqstp, &verf);
1222 if (new == NULL) { 1339 if (new == NULL) {
1223 status = nfserr_serverfault; 1340 status = nfserr_jukebox;
1224 goto out; 1341 goto out;
1225 } 1342 }
1226 1343
@@ -1760,6 +1877,8 @@ alloc_init_file(struct inode *ino)
1760 fp->fi_inode = igrab(ino); 1877 fp->fi_inode = igrab(ino);
1761 fp->fi_id = current_fileid++; 1878 fp->fi_id = current_fileid++;
1762 fp->fi_had_conflict = false; 1879 fp->fi_had_conflict = false;
1880 memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
1881 memset(fp->fi_access, 0, sizeof(fp->fi_access));
1763 spin_lock(&recall_lock); 1882 spin_lock(&recall_lock);
1764 list_add(&fp->fi_hash, &file_hashtbl[hashval]); 1883 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
1765 spin_unlock(&recall_lock); 1884 spin_unlock(&recall_lock);
@@ -1971,57 +2090,6 @@ static inline int deny_valid(u32 x)
1971} 2090}
1972 2091
1973/* 2092/*
1974 * We store the NONE, READ, WRITE, and BOTH bits separately in the
1975 * st_{access,deny}_bmap field of the stateid, in order to track not
1976 * only what share bits are currently in force, but also what
1977 * combinations of share bits previous opens have used. This allows us
1978 * to enforce the recommendation of rfc 3530 14.2.19 that the server
1979 * return an error if the client attempts to downgrade to a combination
1980 * of share bits not explicable by closing some of its previous opens.
1981 *
1982 * XXX: This enforcement is actually incomplete, since we don't keep
1983 * track of access/deny bit combinations; so, e.g., we allow:
1984 *
1985 * OPEN allow read, deny write
1986 * OPEN allow both, deny none
1987 * DOWNGRADE allow read, deny none
1988 *
1989 * which we should reject.
1990 */
1991static void
1992set_access(unsigned int *access, unsigned long bmap) {
1993 int i;
1994
1995 *access = 0;
1996 for (i = 1; i < 4; i++) {
1997 if (test_bit(i, &bmap))
1998 *access |= i;
1999 }
2000}
2001
2002static void
2003set_deny(unsigned int *deny, unsigned long bmap) {
2004 int i;
2005
2006 *deny = 0;
2007 for (i = 0; i < 4; i++) {
2008 if (test_bit(i, &bmap))
2009 *deny |= i;
2010 }
2011}
2012
2013static int
2014test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
2015 unsigned int access, deny;
2016
2017 set_access(&access, stp->st_access_bmap);
2018 set_deny(&deny, stp->st_deny_bmap);
2019 if ((access & open->op_share_deny) || (deny & open->op_share_access))
2020 return 0;
2021 return 1;
2022}
2023
2024/*
2025 * Called to check deny when READ with all zero stateid or 2093 * Called to check deny when READ with all zero stateid or
2026 * WRITE with all zero or all one stateid 2094 * WRITE with all zero or all one stateid
2027 */ 2095 */
@@ -2052,14 +2120,12 @@ out:
2052} 2120}
2053 2121
2054static inline void 2122static inline void
2055nfs4_file_downgrade(struct file *filp, unsigned int share_access) 2123nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
2056{ 2124{
2057 if (share_access & NFS4_SHARE_ACCESS_WRITE) { 2125 if (share_access & NFS4_SHARE_ACCESS_WRITE)
2058 drop_file_write_access(filp); 2126 nfs4_file_put_access(fp, O_WRONLY);
2059 spin_lock(&filp->f_lock); 2127 if (share_access & NFS4_SHARE_ACCESS_READ)
2060 filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE; 2128 nfs4_file_put_access(fp, O_RDONLY);
2061 spin_unlock(&filp->f_lock);
2062 }
2063} 2129}
2064 2130
2065/* 2131/*
@@ -2255,6 +2321,13 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
2255 return NULL; 2321 return NULL;
2256} 2322}
2257 2323
 2324static int share_access_to_flags(u32 share_access)
2325{
2326 share_access &= ~NFS4_SHARE_WANT_MASK;
2327
2328 return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
2329}
2330
2258static __be32 2331static __be32
2259nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, 2332nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
2260 struct nfs4_delegation **dp) 2333 struct nfs4_delegation **dp)
@@ -2265,8 +2338,7 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open,
2265 *dp = find_delegation_file(fp, &open->op_delegate_stateid); 2338 *dp = find_delegation_file(fp, &open->op_delegate_stateid);
2266 if (*dp == NULL) 2339 if (*dp == NULL)
2267 goto out; 2340 goto out;
2268 flags = open->op_share_access == NFS4_SHARE_ACCESS_READ ? 2341 flags = share_access_to_flags(open->op_share_access);
2269 RD_STATE : WR_STATE;
2270 status = nfs4_check_delegmode(*dp, flags); 2342 status = nfs4_check_delegmode(*dp, flags);
2271 if (status) 2343 if (status)
2272 *dp = NULL; 2344 *dp = NULL;
@@ -2308,30 +2380,53 @@ nfs4_alloc_stateid(void)
2308 return kmem_cache_alloc(stateid_slab, GFP_KERNEL); 2380 return kmem_cache_alloc(stateid_slab, GFP_KERNEL);
2309} 2381}
2310 2382
2383static inline int nfs4_access_to_access(u32 nfs4_access)
2384{
2385 int flags = 0;
2386
2387 if (nfs4_access & NFS4_SHARE_ACCESS_READ)
2388 flags |= NFSD_MAY_READ;
2389 if (nfs4_access & NFS4_SHARE_ACCESS_WRITE)
2390 flags |= NFSD_MAY_WRITE;
2391 return flags;
2392}
2393
 2394static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
 2395		struct svc_fh *cur_fh, u32 nfs4_access)
2396{
2397 __be32 status;
2398 int oflag = nfs4_access_to_omode(nfs4_access);
2399 int access = nfs4_access_to_access(nfs4_access);
2400
2401 if (!fp->fi_fds[oflag]) {
2402 status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
2403 &fp->fi_fds[oflag]);
2404 if (status == nfserr_dropit)
2405 status = nfserr_jukebox;
2406 if (status)
2407 return status;
2408 }
2409 nfs4_file_get_access(fp, oflag);
2410
2411 return nfs_ok;
2412}
2413
2311static __be32 2414static __be32
2312nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, 2415nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp,
2313 struct nfs4_delegation *dp, 2416 struct nfs4_file *fp, struct svc_fh *cur_fh,
2314 struct svc_fh *cur_fh, int flags) 2417 struct nfsd4_open *open)
2315{ 2418{
2316 struct nfs4_stateid *stp; 2419 struct nfs4_stateid *stp;
2420 __be32 status;
2317 2421
2318 stp = nfs4_alloc_stateid(); 2422 stp = nfs4_alloc_stateid();
2319 if (stp == NULL) 2423 if (stp == NULL)
2320 return nfserr_resource; 2424 return nfserr_resource;
2321 2425
2322 if (dp) { 2426 status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open->op_share_access);
2323 get_file(dp->dl_vfs_file); 2427 if (status) {
2324 stp->st_vfs_file = dp->dl_vfs_file; 2428 kmem_cache_free(stateid_slab, stp);
2325 } else { 2429 return status;
2326 __be32 status;
2327 status = nfsd_open(rqstp, cur_fh, S_IFREG, flags,
2328 &stp->st_vfs_file);
2329 if (status) {
2330 if (status == nfserr_dropit)
2331 status = nfserr_jukebox;
2332 kmem_cache_free(stateid_slab, stp);
2333 return status;
2334 }
2335 } 2430 }
2336 *stpp = stp; 2431 *stpp = stp;
2337 return 0; 2432 return 0;
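nfs4_get_vfs_file() above opens a struct file at most once per open mode and takes one access reference per successful grant; the bodies of nfs4_file_get_access() and nfs4_file_put_access() are not shown in these hunks, so the standalone sketch below simply assumes they bump and drop a per-mode counter, releasing the cached file on the last drop (hypothetical stand-ins, not kernel code):

#include <stdio.h>

enum { RDONLY, WRONLY, RDWR, NMODES };	/* mirrors the fi_fds[3] slots */

static int fds[NMODES];		/* 0 = not opened yet (like fi_fds) */
static int refs[NMODES];	/* per-mode grants (like fi_access) */

static int open_by_mode(int oflag)
{
	return 100 + oflag;	/* hypothetical opener; pretend it succeeds */
}

static int get_access(int oflag)
{
	if (!fds[oflag])		/* first request: open lazily */
		fds[oflag] = open_by_mode(oflag);
	refs[oflag]++;			/* models nfs4_file_get_access() */
	return 0;
}

static void put_access(int oflag)
{
	if (--refs[oflag] == 0) {	/* assumed nfs4_file_put_access() */
		printf("last ref: closing mode %d (fd %d)\n", oflag, fds[oflag]);
		fds[oflag] = 0;
	}
}

int main(void)
{
	get_access(RDWR);	/* OPEN for read/write: opens the file */
	get_access(RDWR);	/* second OPEN: reuses the cached file */
	put_access(RDWR);
	put_access(RDWR);	/* last reference: the file is dropped */
	return 0;
}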
@@ -2353,35 +2448,30 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
2353} 2448}
2354 2449
2355static __be32 2450static __be32
2356nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) 2451nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
2357{ 2452{
2358 struct file *filp = stp->st_vfs_file; 2453 u32 op_share_access, new_access;
2359 struct inode *inode = filp->f_path.dentry->d_inode;
2360 unsigned int share_access, new_writer;
2361 __be32 status; 2454 __be32 status;
2362 2455
2363 set_access(&share_access, stp->st_access_bmap); 2456 set_access(&new_access, stp->st_access_bmap);
2364 new_writer = (~share_access) & open->op_share_access 2457 new_access = (~new_access) & open->op_share_access & ~NFS4_SHARE_WANT_MASK;
2365 & NFS4_SHARE_ACCESS_WRITE; 2458
2366 2459 if (new_access) {
2367 if (new_writer) { 2460 status = nfs4_get_vfs_file(rqstp, fp, cur_fh, new_access);
2368 int err = get_write_access(inode); 2461 if (status)
2369 if (err) 2462 return status;
2370 return nfserrno(err);
2371 err = mnt_want_write(cur_fh->fh_export->ex_path.mnt);
2372 if (err)
2373 return nfserrno(err);
2374 file_take_write(filp);
2375 } 2463 }
2376 status = nfsd4_truncate(rqstp, cur_fh, open); 2464 status = nfsd4_truncate(rqstp, cur_fh, open);
2377 if (status) { 2465 if (status) {
2378 if (new_writer) 2466 if (new_access) {
2379 put_write_access(inode); 2467 int oflag = nfs4_access_to_omode(new_access);
2468 nfs4_file_put_access(fp, oflag);
2469 }
2380 return status; 2470 return status;
2381 } 2471 }
2382 /* remember the open */ 2472 /* remember the open */
2383 filp->f_mode |= open->op_share_access; 2473 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
2384 __set_bit(open->op_share_access, &stp->st_access_bmap); 2474 __set_bit(op_share_access, &stp->st_access_bmap);
2385 __set_bit(open->op_share_deny, &stp->st_deny_bmap); 2475 __set_bit(open->op_share_deny, &stp->st_deny_bmap);
2386 2476
2387 return nfs_ok; 2477 return nfs_ok;
@@ -2444,13 +2534,14 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2444 fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; 2534 fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
2445 fl.fl_end = OFFSET_MAX; 2535 fl.fl_end = OFFSET_MAX;
2446 fl.fl_owner = (fl_owner_t)dp; 2536 fl.fl_owner = (fl_owner_t)dp;
2447 fl.fl_file = stp->st_vfs_file; 2537 fl.fl_file = find_readable_file(stp->st_file);
2538 BUG_ON(!fl.fl_file);
2448 fl.fl_pid = current->tgid; 2539 fl.fl_pid = current->tgid;
2449 2540
2450 /* vfs_setlease checks to see if delegation should be handed out. 2541 /* vfs_setlease checks to see if delegation should be handed out.
2451 * the lock_manager callbacks fl_mylease and fl_change are used 2542 * the lock_manager callbacks fl_mylease and fl_change are used
2452 */ 2543 */
2453 if ((status = vfs_setlease(stp->st_vfs_file, fl.fl_type, &flp))) { 2544 if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) {
2454 dprintk("NFSD: setlease failed [%d], no delegation\n", status); 2545 dprintk("NFSD: setlease failed [%d], no delegation\n", status);
2455 unhash_delegation(dp); 2546 unhash_delegation(dp);
2456 flag = NFS4_OPEN_DELEGATE_NONE; 2547 flag = NFS4_OPEN_DELEGATE_NONE;
@@ -2514,18 +2605,12 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
2514 */ 2605 */
2515 if (stp) { 2606 if (stp) {
2516 /* Stateid was found, this is an OPEN upgrade */ 2607 /* Stateid was found, this is an OPEN upgrade */
2517 status = nfs4_upgrade_open(rqstp, current_fh, stp, open); 2608 status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
2518 if (status) 2609 if (status)
2519 goto out; 2610 goto out;
2520 update_stateid(&stp->st_stateid); 2611 update_stateid(&stp->st_stateid);
2521 } else { 2612 } else {
2522 /* Stateid was not found, this is a new OPEN */ 2613 status = nfs4_new_open(rqstp, &stp, fp, current_fh, open);
2523 int flags = 0;
2524 if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
2525 flags |= NFSD_MAY_READ;
2526 if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
2527 flags |= NFSD_MAY_WRITE;
2528 status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags);
2529 if (status) 2614 if (status)
2530 goto out; 2615 goto out;
2531 init_stateid(stp, fp, open); 2616 init_stateid(stp, fp, open);
@@ -2727,7 +2812,7 @@ search_close_lru(u32 st_id, int flags)
2727static inline int 2812static inline int
2728nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp) 2813nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp)
2729{ 2814{
2730 return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode; 2815 return fhp->fh_dentry->d_inode != stp->st_file->fi_inode;
2731} 2816}
2732 2817
2733static int 2818static int
@@ -2760,6 +2845,9 @@ __be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags)
2760{ 2845{
2761 __be32 status = nfserr_openmode; 2846 __be32 status = nfserr_openmode;
2762 2847
2848 /* For lock stateid's, we test the parent open, not the lock: */
2849 if (stp->st_openstp)
2850 stp = stp->st_openstp;
2763 if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap))) 2851 if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap)))
2764 goto out; 2852 goto out;
2765 if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap))) 2853 if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap)))
@@ -2872,7 +2960,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2872 goto out; 2960 goto out;
2873 renew_client(dp->dl_client); 2961 renew_client(dp->dl_client);
2874 if (filpp) 2962 if (filpp)
2875 *filpp = dp->dl_vfs_file; 2963 *filpp = find_readable_file(dp->dl_file);
2964 BUG_ON(!*filpp);
2876 } else { /* open or lock stateid */ 2965 } else { /* open or lock stateid */
2877 stp = find_stateid(stateid, flags); 2966 stp = find_stateid(stateid, flags);
2878 if (!stp) 2967 if (!stp)
@@ -2889,8 +2978,13 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2889 if (status) 2978 if (status)
2890 goto out; 2979 goto out;
2891 renew_client(stp->st_stateowner->so_client); 2980 renew_client(stp->st_stateowner->so_client);
2892 if (filpp) 2981 if (filpp) {
2893 *filpp = stp->st_vfs_file; 2982 if (flags & RD_STATE)
2983 *filpp = find_readable_file(stp->st_file);
2984 else
2985 *filpp = find_writeable_file(stp->st_file);
2986 BUG_ON(!*filpp); /* assured by check_openmode */
2987 }
2894 } 2988 }
2895 status = nfs_ok; 2989 status = nfs_ok;
2896out: 2990out:
@@ -3126,8 +3220,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
3126 goto out; 3220 goto out;
3127 } 3221 }
3128 set_access(&share_access, stp->st_access_bmap); 3222 set_access(&share_access, stp->st_access_bmap);
3129 nfs4_file_downgrade(stp->st_vfs_file, 3223 nfs4_file_downgrade(stp->st_file, share_access & ~od->od_share_access);
3130 share_access & ~od->od_share_access);
3131 3224
3132 reset_union_bmap_access(od->od_share_access, &stp->st_access_bmap); 3225 reset_union_bmap_access(od->od_share_access, &stp->st_access_bmap);
3133 reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); 3226 reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap);
@@ -3346,11 +3439,9 @@ static inline void
3346nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) 3439nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
3347{ 3440{
3348 struct nfs4_stateowner *sop; 3441 struct nfs4_stateowner *sop;
3349 unsigned int hval;
3350 3442
3351 if (fl->fl_lmops == &nfsd_posix_mng_ops) { 3443 if (fl->fl_lmops == &nfsd_posix_mng_ops) {
3352 sop = (struct nfs4_stateowner *) fl->fl_owner; 3444 sop = (struct nfs4_stateowner *) fl->fl_owner;
3353 hval = lockownerid_hashval(sop->so_id);
3354 kref_get(&sop->so_ref); 3445 kref_get(&sop->so_ref);
3355 deny->ld_sop = sop; 3446 deny->ld_sop = sop;
3356 deny->ld_clientid = sop->so_client->cl_clientid; 3447 deny->ld_clientid = sop->so_client->cl_clientid;
@@ -3446,8 +3537,6 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
3446 stp->st_stateid.si_stateownerid = sop->so_id; 3537 stp->st_stateid.si_stateownerid = sop->so_id;
3447 stp->st_stateid.si_fileid = fp->fi_id; 3538 stp->st_stateid.si_fileid = fp->fi_id;
3448 stp->st_stateid.si_generation = 0; 3539 stp->st_stateid.si_generation = 0;
3449 stp->st_vfs_file = open_stp->st_vfs_file; /* FIXME refcount?? */
3450 stp->st_access_bmap = open_stp->st_access_bmap;
3451 stp->st_deny_bmap = open_stp->st_deny_bmap; 3540 stp->st_deny_bmap = open_stp->st_deny_bmap;
3452 stp->st_openstp = open_stp; 3541 stp->st_openstp = open_stp;
3453 3542
@@ -3547,7 +3636,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3547 lock_sop = lock->lk_replay_owner; 3636 lock_sop = lock->lk_replay_owner;
3548 } 3637 }
3549 /* lock->lk_replay_owner and lock_stp have been created or found */ 3638 /* lock->lk_replay_owner and lock_stp have been created or found */
3550 filp = lock_stp->st_vfs_file;
3551 3639
3552 status = nfserr_grace; 3640 status = nfserr_grace;
3553 if (locks_in_grace() && !lock->lk_reclaim) 3641 if (locks_in_grace() && !lock->lk_reclaim)
@@ -3560,11 +3648,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3560 switch (lock->lk_type) { 3648 switch (lock->lk_type) {
3561 case NFS4_READ_LT: 3649 case NFS4_READ_LT:
3562 case NFS4_READW_LT: 3650 case NFS4_READW_LT:
3651 filp = find_readable_file(lock_stp->st_file);
3563 file_lock.fl_type = F_RDLCK; 3652 file_lock.fl_type = F_RDLCK;
3564 cmd = F_SETLK; 3653 cmd = F_SETLK;
3565 break; 3654 break;
3566 case NFS4_WRITE_LT: 3655 case NFS4_WRITE_LT:
3567 case NFS4_WRITEW_LT: 3656 case NFS4_WRITEW_LT:
3657 filp = find_writeable_file(lock_stp->st_file);
3568 file_lock.fl_type = F_WRLCK; 3658 file_lock.fl_type = F_WRLCK;
3569 cmd = F_SETLK; 3659 cmd = F_SETLK;
3570 break; 3660 break;
@@ -3572,6 +3662,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3572 status = nfserr_inval; 3662 status = nfserr_inval;
3573 goto out; 3663 goto out;
3574 } 3664 }
3665 if (!filp) {
3666 status = nfserr_openmode;
3667 goto out;
3668 }
3575 file_lock.fl_owner = (fl_owner_t)lock_sop; 3669 file_lock.fl_owner = (fl_owner_t)lock_sop;
3576 file_lock.fl_pid = current->tgid; 3670 file_lock.fl_pid = current->tgid;
3577 file_lock.fl_file = filp; 3671 file_lock.fl_file = filp;
@@ -3740,7 +3834,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3740 &locku->lu_stateowner, &stp, NULL))) 3834 &locku->lu_stateowner, &stp, NULL)))
3741 goto out; 3835 goto out;
3742 3836
3743 filp = stp->st_vfs_file; 3837 filp = find_any_file(stp->st_file);
3838 if (!filp) {
3839 status = nfserr_lock_range;
3840 goto out;
3841 }
3744 BUG_ON(!filp); 3842 BUG_ON(!filp);
3745 locks_init_lock(&file_lock); 3843 locks_init_lock(&file_lock);
3746 file_lock.fl_type = F_UNLCK; 3844 file_lock.fl_type = F_UNLCK;
@@ -3787,10 +3885,10 @@ out_nfserr:
3787 * 0: no locks held by lockowner 3885 * 0: no locks held by lockowner
3788 */ 3886 */
3789static int 3887static int
3790check_for_locks(struct file *filp, struct nfs4_stateowner *lowner) 3888check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner)
3791{ 3889{
3792 struct file_lock **flpp; 3890 struct file_lock **flpp;
3793 struct inode *inode = filp->f_path.dentry->d_inode; 3891 struct inode *inode = filp->fi_inode;
3794 int status = 0; 3892 int status = 0;
3795 3893
3796 lock_kernel(); 3894 lock_kernel();
@@ -3841,7 +3939,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
3841 continue; 3939 continue;
3842 list_for_each_entry(stp, &sop->so_stateids, 3940 list_for_each_entry(stp, &sop->so_stateids,
3843 st_perstateowner) { 3941 st_perstateowner) {
3844 if (check_for_locks(stp->st_vfs_file, sop)) 3942 if (check_for_locks(stp->st_file, sop))
3845 goto out; 3943 goto out;
3846 /* Note: so_perclient unused for lockowners, 3944 /* Note: so_perclient unused for lockowners,
3847 * so it's OK to fool with here. */ 3945 * so it's OK to fool with here. */
@@ -4066,16 +4164,8 @@ out_free_laundry:
4066int 4164int
4067nfs4_state_start(void) 4165nfs4_state_start(void)
4068{ 4166{
4069 int ret;
4070
4071 if (nfs4_init)
4072 return 0;
4073 nfsd4_load_reboot_recovery_data(); 4167 nfsd4_load_reboot_recovery_data();
4074 ret = __nfs4_state_start(); 4168 return __nfs4_state_start();
4075 if (ret)
4076 return ret;
4077 nfs4_init = 1;
4078 return 0;
4079} 4169}
4080 4170
4081static void 4171static void
@@ -4110,7 +4200,6 @@ __nfs4_state_shutdown(void)
4110 } 4200 }
4111 4201
4112 nfsd4_shutdown_recdir(); 4202 nfsd4_shutdown_recdir();
4113 nfs4_init = 0;
4114} 4203}
4115 4204
4116void 4205void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index ac17a7080239..f8931acb05f3 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2630,7 +2630,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
2630 } 2630 }
2631 read->rd_vlen = v; 2631 read->rd_vlen = v;
2632 2632
2633 nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, 2633 nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp,
2634 read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, 2634 read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
2635 &maxcount); 2635 &maxcount);
2636 2636
@@ -3325,6 +3325,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
3325 } 3325 }
3326 /* Renew the clientid on success and on replay */ 3326 /* Renew the clientid on success and on replay */
3327 release_session_client(cs->session); 3327 release_session_client(cs->session);
3328 nfsd4_put_session(cs->session);
3328 } 3329 }
3329 return 1; 3330 return 1;
3330} 3331}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 508941c23af7..b53b1d042f1f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -949,15 +949,12 @@ static ssize_t __write_ports_addfd(char *buf)
949 if (err != 0) 949 if (err != 0)
950 return err; 950 return err;
951 951
952 err = lockd_up();
953 if (err != 0)
954 goto out;
955
956 err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); 952 err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
957 if (err < 0) 953 if (err < 0) {
958 lockd_down(); 954 svc_destroy(nfsd_serv);
955 return err;
956 }
959 957
960out:
961 /* Decrease the count, but don't shut down the service */ 958 /* Decrease the count, but don't shut down the service */
962 nfsd_serv->sv_nrthreads--; 959 nfsd_serv->sv_nrthreads--;
963 return err; 960 return err;
@@ -978,9 +975,6 @@ static ssize_t __write_ports_delfd(char *buf)
978 if (nfsd_serv != NULL) 975 if (nfsd_serv != NULL)
979 len = svc_sock_names(nfsd_serv, buf, 976 len = svc_sock_names(nfsd_serv, buf,
980 SIMPLE_TRANSACTION_LIMIT, toclose); 977 SIMPLE_TRANSACTION_LIMIT, toclose);
981 if (len >= 0)
982 lockd_down();
983
984 kfree(toclose); 978 kfree(toclose);
985 return len; 979 return len;
986} 980}
@@ -1014,6 +1008,9 @@ static ssize_t __write_ports_addxprt(char *buf)
1014 PF_INET6, port, SVC_SOCK_ANONYMOUS); 1008 PF_INET6, port, SVC_SOCK_ANONYMOUS);
1015 if (err < 0 && err != -EAFNOSUPPORT) 1009 if (err < 0 && err != -EAFNOSUPPORT)
1016 goto out_close; 1010 goto out_close;
1011
1012 /* Decrease the count, but don't shut down the service */
1013 nfsd_serv->sv_nrthreads--;
1017 return 0; 1014 return 0;
1018out_close: 1015out_close:
1019 xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port); 1016 xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port);
@@ -1022,8 +1019,7 @@ out_close:
1022 svc_xprt_put(xprt); 1019 svc_xprt_put(xprt);
1023 } 1020 }
1024out_err: 1021out_err:
1025 /* Decrease the count, but don't shut down the service */ 1022 svc_destroy(nfsd_serv);
1026 nfsd_serv->sv_nrthreads--;
1027 return err; 1023 return err;
1028} 1024}
1029 1025
@@ -1194,7 +1190,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
1194 bsize = NFSSVC_MAXBLKSIZE; 1190 bsize = NFSSVC_MAXBLKSIZE;
1195 bsize &= ~(1024-1); 1191 bsize &= ~(1024-1);
1196 mutex_lock(&nfsd_mutex); 1192 mutex_lock(&nfsd_mutex);
1197 if (nfsd_serv && nfsd_serv->sv_nrthreads) { 1193 if (nfsd_serv) {
1198 mutex_unlock(&nfsd_mutex); 1194 mutex_unlock(&nfsd_mutex);
1199 return -EBUSY; 1195 return -EBUSY;
1200 } 1196 }
@@ -1310,6 +1306,8 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
1310 return -EINVAL; 1306 return -EINVAL;
1311 1307
1312 status = nfs4_reset_recoverydir(recdir); 1308 status = nfs4_reset_recoverydir(recdir);
1309 if (status)
1310 return status;
1313 } 1311 }
1314 1312
1315 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n", 1313 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n",
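The rewritten error paths in __write_ports_addfd() and __write_ports_addxprt() follow one reference discipline: their caller leaves one reference held on nfsd_serv, and every exit must drop exactly one, either with the bare sv_nrthreads-- on success or with svc_destroy() on failure, which is assumed to decrement the count and tear the server down once it reaches zero. A toy model of that invariant (hypothetical names, not kernel code):

#include <stdio.h>

struct serv { int refs; };	/* toy stand-in for struct svc_serv */

static void serv_put(struct serv *s, int destructive)
{
	s->refs--;		/* both paths drop exactly one reference */
	if (destructive && s->refs == 0)
		puts("server torn down");	/* models svc_destroy() */
}

static int add_port(struct serv *s, int fail)
{
	if (fail) {
		serv_put(s, 1);	/* error path: svc_destroy() may free it */
		return -1;
	}
	serv_put(s, 0);		/* success path: bare sv_nrthreads-- */
	return 0;
}

int main(void)
{
	struct serv s = { .refs = 1 };	/* reference held by the creator */

	return add_port(&s, 0) ? 1 : 0;
}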
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 72377761270e..b76ac3a82e39 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -153,6 +153,7 @@ void nfsd_lockd_shutdown(void);
153#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID) 153#define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID)
154#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK) 154#define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK)
155#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME) 155#define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME)
156#define nfserr_lock_range cpu_to_be32(NFSERR_LOCK_RANGE)
156#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH) 157#define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH)
157#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) 158#define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP)
158#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) 159#define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index a047ad6111ef..08e17264784b 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -144,7 +144,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp,
144 svc_reserve_auth(rqstp, (19<<2) + argp->count + 4); 144 svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
145 145
146 resp->count = argp->count; 146 resp->count = argp->count;
147 nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, 147 nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
148 argp->offset, 148 argp->offset,
149 rqstp->rq_vec, argp->vlen, 149 rqstp->rq_vec, argp->vlen,
150 &resp->count); 150 &resp->count);
@@ -290,7 +290,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
290 * gospel of sun micro 290 * gospel of sun micro
291 */ 291 */
292 if (type != S_IFREG) { 292 if (type != S_IFREG) {
293 int is_borc = 0;
294 if (type != S_IFBLK && type != S_IFCHR) { 293 if (type != S_IFBLK && type != S_IFCHR) {
295 rdev = 0; 294 rdev = 0;
296 } else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) { 295 } else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) {
@@ -298,7 +297,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
298 type = S_IFIFO; 297 type = S_IFIFO;
299 } else { 298 } else {
300 /* Okay, char or block special */ 299 /* Okay, char or block special */
301 is_borc = 1;
302 if (!rdev) 300 if (!rdev)
303 rdev = wanted; 301 rdev = wanted;
304 } 302 }
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 06b2a26edfe0..e2c43464f237 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -180,15 +180,80 @@ int nfsd_nrthreads(void)
180 return rv; 180 return rv;
181} 181}
182 182
183static int nfsd_init_socks(int port)
184{
185 int error;
186 if (!list_empty(&nfsd_serv->sv_permsocks))
187 return 0;
188
189 error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
190 SVC_SOCK_DEFAULTS);
191 if (error < 0)
192 return error;
193
194 error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
195 SVC_SOCK_DEFAULTS);
196 if (error < 0)
197 return error;
198
199 return 0;
200}
201
202static bool nfsd_up = false;
203
204static int nfsd_startup(unsigned short port, int nrservs)
205{
206 int ret;
207
208 if (nfsd_up)
209 return 0;
210 /*
211 * Readahead param cache - will no-op if it already exists.
212 * (Note therefore results will be suboptimal if number of
213 * threads is modified after nfsd start.)
214 */
215 ret = nfsd_racache_init(2*nrservs);
216 if (ret)
217 return ret;
218 ret = nfsd_init_socks(port);
219 if (ret)
220 goto out_racache;
221 ret = lockd_up();
222 if (ret)
223 goto out_racache;
224 ret = nfs4_state_start();
225 if (ret)
226 goto out_lockd;
227 nfsd_up = true;
228 return 0;
229out_lockd:
230 lockd_down();
231out_racache:
232 nfsd_racache_shutdown();
233 return ret;
234}
235
236static void nfsd_shutdown(void)
237{
238 /*
239 * write_ports can create the server without actually starting
240 * any threads--if we get shut down before any threads are
241 * started, then nfsd_last_thread will be run before any of this
242 * other initialization has been done.
243 */
244 if (!nfsd_up)
245 return;
246 nfs4_state_shutdown();
247 lockd_down();
248 nfsd_racache_shutdown();
249 nfsd_up = false;
250}
251
183static void nfsd_last_thread(struct svc_serv *serv) 252static void nfsd_last_thread(struct svc_serv *serv)
184{ 253{
185 /* When last nfsd thread exits we need to do some clean-up */ 254 /* When last nfsd thread exits we need to do some clean-up */
186 struct svc_xprt *xprt;
187 list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list)
188 lockd_down();
189 nfsd_serv = NULL; 255 nfsd_serv = NULL;
190 nfsd_racache_shutdown(); 256 nfsd_shutdown();
191 nfs4_state_shutdown();
192 257
193 printk(KERN_WARNING "nfsd: last server has exited, flushing export " 258 printk(KERN_WARNING "nfsd: last server has exited, flushing export "
194 "cache\n"); 259 "cache\n");
@@ -263,45 +328,18 @@ int nfsd_create_serv(void)
263 nfsd_max_blksize >= 8*1024*2) 328 nfsd_max_blksize >= 8*1024*2)
264 nfsd_max_blksize /= 2; 329 nfsd_max_blksize /= 2;
265 } 330 }
331 nfsd_reset_versions();
266 332
267 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 333 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
268 nfsd_last_thread, nfsd, THIS_MODULE); 334 nfsd_last_thread, nfsd, THIS_MODULE);
269 if (nfsd_serv == NULL) 335 if (nfsd_serv == NULL)
270 err = -ENOMEM; 336 return -ENOMEM;
271 else
272 set_max_drc();
273 337
338 set_max_drc();
274 do_gettimeofday(&nfssvc_boot); /* record boot time */ 339 do_gettimeofday(&nfssvc_boot); /* record boot time */
275 return err; 340 return err;
276} 341}
277 342
278static int nfsd_init_socks(int port)
279{
280 int error;
281 if (!list_empty(&nfsd_serv->sv_permsocks))
282 return 0;
283
284 error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
285 SVC_SOCK_DEFAULTS);
286 if (error < 0)
287 return error;
288
289 error = lockd_up();
290 if (error < 0)
291 return error;
292
293 error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
294 SVC_SOCK_DEFAULTS);
295 if (error < 0)
296 return error;
297
298 error = lockd_up();
299 if (error < 0)
300 return error;
301
302 return 0;
303}
304
305int nfsd_nrpools(void) 343int nfsd_nrpools(void)
306{ 344{
307 if (nfsd_serv == NULL) 345 if (nfsd_serv == NULL)
@@ -376,10 +414,16 @@ int nfsd_set_nrthreads(int n, int *nthreads)
376 return err; 414 return err;
377} 415}
378 416
417/*
418 * Adjust the number of threads and return the new number of threads.
 419 * This is also the function that starts the server if necessary, i.e.
 420 * the first time nrservs is nonzero.
421 */
379int 422int
380nfsd_svc(unsigned short port, int nrservs) 423nfsd_svc(unsigned short port, int nrservs)
381{ 424{
382 int error; 425 int error;
426 bool nfsd_up_before;
383 427
384 mutex_lock(&nfsd_mutex); 428 mutex_lock(&nfsd_mutex);
385 dprintk("nfsd: creating service\n"); 429 dprintk("nfsd: creating service\n");
@@ -391,34 +435,29 @@ nfsd_svc(unsigned short port, int nrservs)
391 if (nrservs == 0 && nfsd_serv == NULL) 435 if (nrservs == 0 && nfsd_serv == NULL)
392 goto out; 436 goto out;
393 437
394 /* Readahead param cache - will no-op if it already exists */ 438 error = nfsd_create_serv();
395 error = nfsd_racache_init(2*nrservs);
396 if (error<0)
397 goto out;
398 error = nfs4_state_start();
399 if (error) 439 if (error)
400 goto out; 440 goto out;
401 441
402 nfsd_reset_versions(); 442 nfsd_up_before = nfsd_up;
403
404 error = nfsd_create_serv();
405 443
444 error = nfsd_startup(port, nrservs);
406 if (error) 445 if (error)
407 goto out; 446 goto out_destroy;
408 error = nfsd_init_socks(port);
409 if (error)
410 goto failure;
411
412 error = svc_set_num_threads(nfsd_serv, NULL, nrservs); 447 error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
413 if (error == 0) 448 if (error)
414 /* We are holding a reference to nfsd_serv which 449 goto out_shutdown;
415 * we don't want to count in the return value, 450 /* We are holding a reference to nfsd_serv which
416 * so subtract 1 451 * we don't want to count in the return value,
417 */ 452 * so subtract 1
418 error = nfsd_serv->sv_nrthreads - 1; 453 */
419 failure: 454 error = nfsd_serv->sv_nrthreads - 1;
455out_shutdown:
456 if (error < 0 && !nfsd_up_before)
457 nfsd_shutdown();
458out_destroy:
420 svc_destroy(nfsd_serv); /* Release server */ 459 svc_destroy(nfsd_serv); /* Release server */
421 out: 460out:
422 mutex_unlock(&nfsd_mutex); 461 mutex_unlock(&nfsd_mutex);
423 return error; 462 return error;
424} 463}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 006c84230c7c..7731a75971dd 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -88,7 +88,6 @@ struct nfs4_delegation {
88 struct nfs4_client *dl_client; 88 struct nfs4_client *dl_client;
89 struct nfs4_file *dl_file; 89 struct nfs4_file *dl_file;
90 struct file_lock *dl_flock; 90 struct file_lock *dl_flock;
91 struct file *dl_vfs_file;
92 u32 dl_type; 91 u32 dl_type;
93 time_t dl_time; 92 time_t dl_time;
94/* For recall: */ 93/* For recall: */
@@ -342,12 +341,50 @@ struct nfs4_file {
342 struct list_head fi_hash; /* hash by "struct inode *" */ 341 struct list_head fi_hash; /* hash by "struct inode *" */
343 struct list_head fi_stateids; 342 struct list_head fi_stateids;
344 struct list_head fi_delegations; 343 struct list_head fi_delegations;
344 /* One each for O_RDONLY, O_WRONLY, O_RDWR: */
345 struct file * fi_fds[3];
346 /* One each for O_RDONLY, O_WRONLY: */
347 atomic_t fi_access[2];
348 /*
349 * Each open stateid contributes 1 to either fi_readers or
350 * fi_writers, or both, depending on the open mode. A
351 * delegation also takes an fi_readers reference. Lock
352 * stateid's take none.
353 */
354 atomic_t fi_readers;
355 atomic_t fi_writers;
345 struct inode *fi_inode; 356 struct inode *fi_inode;
346 u32 fi_id; /* used with stateowner->so_id 357 u32 fi_id; /* used with stateowner->so_id
347 * for stateid_hashtbl hash */ 358 * for stateid_hashtbl hash */
348 bool fi_had_conflict; 359 bool fi_had_conflict;
349}; 360};
350 361
362/* XXX: for first cut may fall back on returning file that doesn't work
363 * at all? */
364static inline struct file *find_writeable_file(struct nfs4_file *f)
365{
366 if (f->fi_fds[O_RDWR])
367 return f->fi_fds[O_RDWR];
368 return f->fi_fds[O_WRONLY];
369}
370
371static inline struct file *find_readable_file(struct nfs4_file *f)
372{
373 if (f->fi_fds[O_RDWR])
374 return f->fi_fds[O_RDWR];
375 return f->fi_fds[O_RDONLY];
376}
377
378static inline struct file *find_any_file(struct nfs4_file *f)
379{
380 if (f->fi_fds[O_RDWR])
381 return f->fi_fds[O_RDWR];
 382 else if (f->fi_fds[O_WRONLY])
383 return f->fi_fds[O_WRONLY];
384 else
385 return f->fi_fds[O_RDONLY];
386}
387
351/* 388/*
352* nfs4_stateid can either be an open stateid or (eventually) a lock stateid 389* nfs4_stateid can either be an open stateid or (eventually) a lock stateid
353* 390*
@@ -373,7 +410,6 @@ struct nfs4_stateid {
373 struct nfs4_stateowner * st_stateowner; 410 struct nfs4_stateowner * st_stateowner;
374 struct nfs4_file * st_file; 411 struct nfs4_file * st_file;
375 stateid_t st_stateid; 412 stateid_t st_stateid;
376 struct file * st_vfs_file;
377 unsigned long st_access_bmap; 413 unsigned long st_access_bmap;
378 unsigned long st_deny_bmap; 414 unsigned long st_deny_bmap;
379 struct nfs4_stateid * st_openstp; 415 struct nfs4_stateid * st_openstp;
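The comment added to struct nfs4_file above describes the accounting the rest of the patch depends on: an open stateid counts as a reader, a writer, or both according to its open mode, a read delegation counts as one more reader, and lock stateids count as neither; the find_*_file() helpers then prefer the O_RDWR file because it can satisfy either kind of access. A standalone sketch of that bookkeeping (C11 atomics used purely for illustration):

#include <assert.h>
#include <stdatomic.h>

static atomic_int fi_readers;	/* one per reading open or delegation */
static atomic_int fi_writers;	/* one per writing open */

static void open_stateid(int reads, int writes)
{
	if (reads)
		atomic_fetch_add(&fi_readers, 1);
	if (writes)
		atomic_fetch_add(&fi_writers, 1);
}

static void read_delegation(void)
{
	atomic_fetch_add(&fi_readers, 1);	/* delegations only read */
}

int main(void)
{
	open_stateid(1, 1);	/* O_RDWR open: contributes to both */
	read_delegation();
	/* a lock stateid would add nothing here */
	assert(atomic_load(&fi_readers) == 2);
	assert(atomic_load(&fi_writers) == 1);
	return 0;
}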
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 3c111120b619..9df85a13af28 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -604,7 +604,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
604 return error; 604 return error;
605} 605}
606 606
607#endif /* defined(CONFIG_NFS_V4) */ 607#endif /* defined(CONFIG_NFSD_V4) */
608 608
609#ifdef CONFIG_NFSD_V3 609#ifdef CONFIG_NFSD_V3
610/* 610/*
@@ -903,7 +903,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
903 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 903 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
904{ 904{
905 struct inode *inode; 905 struct inode *inode;
906 struct raparms *ra;
907 mm_segment_t oldfs; 906 mm_segment_t oldfs;
908 __be32 err; 907 __be32 err;
909 int host_err; 908 int host_err;
@@ -914,12 +913,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
914 if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count)) 913 if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count))
915 goto out; 914 goto out;
916 915
917 /* Get readahead parameters */
918 ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
919
920 if (ra && ra->p_set)
921 file->f_ra = ra->p_ra;
922
923 if (file->f_op->splice_read && rqstp->rq_splice_ok) { 916 if (file->f_op->splice_read && rqstp->rq_splice_ok) {
924 struct splice_desc sd = { 917 struct splice_desc sd = {
925 .len = 0, 918 .len = 0,
@@ -937,16 +930,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
937 set_fs(oldfs); 930 set_fs(oldfs);
938 } 931 }
939 932
940 /* Write back readahead params */
941 if (ra) {
942 struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
943 spin_lock(&rab->pb_lock);
944 ra->p_ra = file->f_ra;
945 ra->p_set = 1;
946 ra->p_count--;
947 spin_unlock(&rab->pb_lock);
948 }
949
950 if (host_err >= 0) { 933 if (host_err >= 0) {
951 nfsdstats.io_read += host_err; 934 nfsdstats.io_read += host_err;
952 *count = host_err; 935 *count = host_err;
@@ -1086,8 +1069,45 @@ out:
1086 * on entry. On return, *count contains the number of bytes actually read. 1069 * on entry. On return, *count contains the number of bytes actually read.
1087 * N.B. After this call fhp needs an fh_put 1070 * N.B. After this call fhp needs an fh_put
1088 */ 1071 */
1072__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
1073 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
1074{
1075 struct file *file;
1076 struct inode *inode;
1077 struct raparms *ra;
1078 __be32 err;
1079
1080 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
1081 if (err)
1082 return err;
1083
1084 inode = file->f_path.dentry->d_inode;
1085
1086 /* Get readahead parameters */
1087 ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
1088
1089 if (ra && ra->p_set)
1090 file->f_ra = ra->p_ra;
1091
1092 err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
1093
1094 /* Write back readahead params */
1095 if (ra) {
1096 struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
1097 spin_lock(&rab->pb_lock);
1098 ra->p_ra = file->f_ra;
1099 ra->p_set = 1;
1100 ra->p_count--;
1101 spin_unlock(&rab->pb_lock);
1102 }
1103
1104 nfsd_close(file);
1105 return err;
1106}
1107
 1108/* As above, but use the struct file provided by the caller. */
1089__be32 1109__be32
1090nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1110nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1091 loff_t offset, struct kvec *vec, int vlen, 1111 loff_t offset, struct kvec *vec, int vlen,
1092 unsigned long *count) 1112 unsigned long *count)
1093{ 1113{
@@ -1099,13 +1119,8 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1099 if (err) 1119 if (err)
1100 goto out; 1120 goto out;
1101 err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); 1121 err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
1102 } else { 1122 } else /* Note file may still be NULL in NFSv4 special stateid case: */
1103 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); 1123 err = nfsd_read(rqstp, fhp, offset, vec, vlen, count);
1104 if (err)
1105 goto out;
1106 err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
1107 nfsd_close(file);
1108 }
1109out: 1124out:
1110 return err; 1125 return err;
1111} 1126}
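After this split, NFSv2/v3 callers (and the NFSv4 special-stateid case) go through nfsd_read(), which now opens and closes the file itself and keeps the readahead-parameter handling around the I/O, while an NFSv4 READ holding a stateid-selected struct file calls nfsd_read_file(). A standalone sketch of the dispatch, with plain ints standing in for struct file (illustration only):

#include <stdio.h>

static int read_via_file(int fd)
{
	printf("read through cached fd %d\n", fd);
	return 0;
}

static int open_read_close(void)
{
	puts("open + read + close (nfsd_read() path)");
	return 0;
}

/* Mirrors the dispatch in nfsd_read_file(): use the caller's file if
 * there is one, else fall back to the self-opening path. */
static int read_file(int have_file, int fd)
{
	if (have_file)
		return read_via_file(fd);
	/* file may still be NULL in the NFSv4 special stateid case */
	return open_read_close();
}

int main(void)
{
	read_file(1, 3);	/* NFSv4 READ with an open stateid */
	read_file(0, -1);	/* NFSv2/v3, or all-zero/all-one stateid */
	return 0;
}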
@@ -1631,7 +1646,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1631 char *name, int len, struct svc_fh *tfhp) 1646 char *name, int len, struct svc_fh *tfhp)
1632{ 1647{
1633 struct dentry *ddir, *dnew, *dold; 1648 struct dentry *ddir, *dnew, *dold;
1634 struct inode *dirp, *dest; 1649 struct inode *dirp;
1635 __be32 err; 1650 __be32 err;
1636 int host_err; 1651 int host_err;
1637 1652
@@ -1659,7 +1674,6 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1659 goto out_nfserr; 1674 goto out_nfserr;
1660 1675
1661 dold = tfhp->fh_dentry; 1676 dold = tfhp->fh_dentry;
1662 dest = dold->d_inode;
1663 1677
1664 host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt); 1678 host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt);
1665 if (host_err) { 1679 if (host_err) {
@@ -2038,7 +2052,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2038 struct dentry *dentry, int acc) 2052 struct dentry *dentry, int acc)
2039{ 2053{
2040 struct inode *inode = dentry->d_inode; 2054 struct inode *inode = dentry->d_inode;
2041 struct path path;
2042 int err; 2055 int err;
2043 2056
2044 if (acc == NFSD_MAY_NOP) 2057 if (acc == NFSD_MAY_NOP)
@@ -2111,15 +2124,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2111 if (err == -EACCES && S_ISREG(inode->i_mode) && 2124 if (err == -EACCES && S_ISREG(inode->i_mode) &&
2112 acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) 2125 acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
2113 err = inode_permission(inode, MAY_EXEC); 2126 err = inode_permission(inode, MAY_EXEC);
2114 if (err)
2115 goto nfsd_out;
2116 2127
2117 /* Do integrity (permission) checking now, but defer incrementing
2118 * IMA counts to the actual file open.
2119 */
2120 path.mnt = exp->ex_path.mnt;
2121 path.dentry = dentry;
2122nfsd_out:
2123 return err? nfserrno(err) : 0; 2128 return err? nfserrno(err) : 0;
2124} 2129}
2125 2130
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 217a62c2a357..9a370a5e36b7 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -64,7 +64,9 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
64__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int, 64__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int,
65 int, struct file **); 65 int, struct file **);
66void nfsd_close(struct file *); 66void nfsd_close(struct file *);
67__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *, 67__be32 nfsd_read(struct svc_rqst *, struct svc_fh *,
68 loff_t, struct kvec *, int, unsigned long *);
69__be32 nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *,
68 loff_t, struct kvec *, int, unsigned long *); 70 loff_t, struct kvec *, int, unsigned long *);
69__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, 71__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
70 loff_t, struct kvec *,int, unsigned long *, int *); 72 loff_t, struct kvec *,int, unsigned long *, int *);
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index effdbdbe6c11..3dbdc1d356bf 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -26,6 +26,8 @@
26#include "nilfs.h" 26#include "nilfs.h"
27#include "bmap.h" 27#include "bmap.h"
28#include "sb.h" 28#include "sb.h"
29#include "btree.h"
30#include "direct.h"
29#include "btnode.h" 31#include "btnode.h"
30#include "mdt.h" 32#include "mdt.h"
31#include "dat.h" 33#include "dat.h"
@@ -533,7 +535,7 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
533 535
534void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) 536void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
535{ 537{
536 memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union)); 538 memcpy(gcbmap, bmap, sizeof(*bmap));
537 init_rwsem(&gcbmap->b_sem); 539 init_rwsem(&gcbmap->b_sem);
538 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); 540 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
539 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode; 541 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
@@ -541,7 +543,7 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
541 543
542void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) 544void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
543{ 545{
544 memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union)); 546 memcpy(bmap, gcbmap, sizeof(*bmap));
545 init_rwsem(&bmap->b_sem); 547 init_rwsem(&bmap->b_sem);
546 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); 548 lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key);
547 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; 549 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index 9980d7dbab91..a20569b19929 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -32,11 +32,6 @@
32 32
33#define NILFS_BMAP_INVALID_PTR 0 33#define NILFS_BMAP_INVALID_PTR 0
34 34
35#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey)
36#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key)
37#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr)
38#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr)
39
40#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff)) 35#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff))
41 36
42 37
@@ -71,7 +66,7 @@ struct nilfs_bmap_operations {
71 int (*bop_delete)(struct nilfs_bmap *, __u64); 66 int (*bop_delete)(struct nilfs_bmap *, __u64);
72 void (*bop_clear)(struct nilfs_bmap *); 67 void (*bop_clear)(struct nilfs_bmap *);
73 68
74 int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *); 69 int (*bop_propagate)(struct nilfs_bmap *, struct buffer_head *);
75 void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *, 70 void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *,
76 struct list_head *); 71 struct list_head *);
77 72
@@ -110,6 +105,7 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
110 * @b_last_allocated_ptr: last allocated ptr for data block 105 * @b_last_allocated_ptr: last allocated ptr for data block
111 * @b_ptr_type: pointer type 106 * @b_ptr_type: pointer type
112 * @b_state: state 107 * @b_state: state
108 * @b_nchildren_per_block: maximum number of child nodes for non-root nodes
113 */ 109 */
114struct nilfs_bmap { 110struct nilfs_bmap {
115 union { 111 union {
@@ -123,6 +119,7 @@ struct nilfs_bmap {
123 __u64 b_last_allocated_ptr; 119 __u64 b_last_allocated_ptr;
124 int b_ptr_type; 120 int b_ptr_type;
125 int b_state; 121 int b_state;
122 __u16 b_nchildren_per_block;
126}; 123};
127 124
128/* pointer type */ 125/* pointer type */
@@ -224,6 +221,13 @@ static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
224 nilfs_dat_abort_end(dat, &req->bpr_req); 221 nilfs_dat_abort_end(dat, &req->bpr_req);
225} 222}
226 223
224static inline void nilfs_bmap_set_target_v(struct nilfs_bmap *bmap, __u64 key,
225 __u64 ptr)
226{
227 bmap->b_last_allocated_key = key;
228 bmap->b_last_allocated_ptr = ptr;
229}
230
227__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, 231__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
228 const struct buffer_head *); 232 const struct buffer_head *);
229 233
diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h
deleted file mode 100644
index d41509bff47b..000000000000
--- a/fs/nilfs2/bmap_union.h
+++ /dev/null
@@ -1,42 +0,0 @@
1/*
2 * bmap_union.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_UNION_H
24#define _NILFS_BMAP_UNION_H
25
26#include "bmap.h"
27#include "direct.h"
28#include "btree.h"
29
30/**
31 * nilfs_bmap_union -
32 * @bi_bmap: bmap structure
33 * @bi_btree: direct map structure
34 * @bi_direct: B-tree structure
35 */
36union nilfs_bmap_union {
37 struct nilfs_bmap bi_bmap;
38 struct nilfs_direct bi_direct;
39 struct nilfs_btree bi_btree;
40};
41
42#endif /* _NILFS_BMAP_UNION_H */
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 447ce47a3306..f78ab1044d1d 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -96,10 +96,12 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
96} 96}
97 97
98int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, 98int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
99 sector_t pblocknr, struct buffer_head **pbh) 99 sector_t pblocknr, int mode,
100 struct buffer_head **pbh, sector_t *submit_ptr)
100{ 101{
101 struct buffer_head *bh; 102 struct buffer_head *bh;
102 struct inode *inode = NILFS_BTNC_I(btnc); 103 struct inode *inode = NILFS_BTNC_I(btnc);
104 struct page *page;
103 int err; 105 int err;
104 106
105 bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); 107 bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
@@ -107,6 +109,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
107 return -ENOMEM; 109 return -ENOMEM;
108 110
109 err = -EEXIST; /* internal code */ 111 err = -EEXIST; /* internal code */
112 page = bh->b_page;
110 113
111 if (buffer_uptodate(bh) || buffer_dirty(bh)) 114 if (buffer_uptodate(bh) || buffer_dirty(bh))
112 goto found; 115 goto found;
@@ -125,7 +128,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
125 } 128 }
126 } 129 }
127 } 130 }
128 lock_buffer(bh); 131
132 if (mode == READA) {
133 if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) {
134 err = -EBUSY; /* internal code */
135 brelse(bh);
136 goto out_locked;
137 }
138 } else { /* mode == READ */
139 lock_buffer(bh);
140 }
129 if (buffer_uptodate(bh)) { 141 if (buffer_uptodate(bh)) {
130 unlock_buffer(bh); 142 unlock_buffer(bh);
131 err = -EEXIST; /* internal code */ 143 err = -EEXIST; /* internal code */
@@ -136,15 +148,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
136 bh->b_blocknr = pblocknr; /* set block address for read */ 148 bh->b_blocknr = pblocknr; /* set block address for read */
137 bh->b_end_io = end_buffer_read_sync; 149 bh->b_end_io = end_buffer_read_sync;
138 get_bh(bh); 150 get_bh(bh);
139 submit_bh(READ, bh); 151 submit_bh(mode, bh);
140 bh->b_blocknr = blocknr; /* set back to the given block address */ 152 bh->b_blocknr = blocknr; /* set back to the given block address */
153 *submit_ptr = pblocknr;
141 err = 0; 154 err = 0;
142found: 155found:
143 *pbh = bh; 156 *pbh = bh;
144 157
145out_locked: 158out_locked:
146 unlock_page(bh->b_page); 159 unlock_page(page);
147 page_cache_release(bh->b_page); 160 page_cache_release(page);
148 return err; 161 return err;
149} 162}
150 163
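The new mode argument makes nilfs_btnode_submit_block() usable for speculative reads: with READA the submission is attempted only when the target disk block immediately follows the one submitted last (*submit_ptr + 1) and the buffer lock can be taken without sleeping, and any discontiguity or contention abandons the readahead with the internal -EBUSY code instead of blocking. A standalone sketch of that gating rule (hypothetical mode constants, not kernel code):

#include <stdbool.h>
#include <stdio.h>

enum { MODE_READ, MODE_READA };	/* stand-ins for READ/READA */

/* Readahead is best-effort: give up rather than block or seek. */
static bool may_submit(int mode, unsigned long pblocknr,
		       unsigned long last_submitted, bool lock_uncontended)
{
	if (mode == MODE_READA)
		return pblocknr == last_submitted + 1 && lock_uncontended;
	return true;	/* plain READ always waits for the buffer lock */
}

int main(void)
{
	printf("%d\n", may_submit(MODE_READA, 101, 100, true));	/* 1 */
	printf("%d\n", may_submit(MODE_READA, 105, 100, true));	/* 0: gap */
	printf("%d\n", may_submit(MODE_READA, 101, 100, false));	/* 0: busy */
	return 0;
}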
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 07da83f07712..79037494f1e0 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -42,8 +42,8 @@ void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
42void nilfs_btnode_cache_clear(struct address_space *); 42void nilfs_btnode_cache_clear(struct address_space *);
43struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, 43struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
44 __u64 blocknr); 44 __u64 blocknr);
45int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, 45int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int,
46 struct buffer_head **); 46 struct buffer_head **, sector_t *);
47void nilfs_btnode_delete(struct buffer_head *); 47void nilfs_btnode_delete(struct buffer_head *);
48int nilfs_btnode_prepare_change_key(struct address_space *, 48int nilfs_btnode_prepare_change_key(struct address_space *,
49 struct nilfs_btnode_chkey_ctxt *); 49 struct nilfs_btnode_chkey_ctxt *);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index b27a342c5af6..300c2bc00c3f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -66,30 +66,10 @@ static void nilfs_btree_free_path(struct nilfs_btree_path *path)
66/* 66/*
67 * B-tree node operations 67 * B-tree node operations
68 */ 68 */
69static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr, 69static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree,
70 struct buffer_head **bhp)
71{
72 struct address_space *btnc =
73 &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
74 int err;
75
76 err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp);
77 if (err)
78 return err == -EEXIST ? 0 : err;
79
80 wait_on_buffer(*bhp);
81 if (!buffer_uptodate(*bhp)) {
82 brelse(*bhp);
83 return -EIO;
84 }
85 return 0;
86}
87
88static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
89 __u64 ptr, struct buffer_head **bhp) 70 __u64 ptr, struct buffer_head **bhp)
90{ 71{
91 struct address_space *btnc = 72 struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
92 &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache;
93 struct buffer_head *bh; 73 struct buffer_head *bh;
94 74
95 bh = nilfs_btnode_create_block(btnc, ptr); 75 bh = nilfs_btnode_create_block(btnc, ptr);
@@ -101,71 +81,55 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
101 return 0; 81 return 0;
102} 82}
103 83
104static inline int 84static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
105nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
106{ 85{
107 return node->bn_flags; 86 return node->bn_flags;
108} 87}
109 88
110static inline void 89static void
111nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) 90nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
112{ 91{
113 node->bn_flags = flags; 92 node->bn_flags = flags;
114} 93}
115 94
116static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node) 95static int nilfs_btree_node_root(const struct nilfs_btree_node *node)
117{ 96{
118 return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; 97 return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
119} 98}
120 99
121static inline int 100static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
122nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
123{ 101{
124 return node->bn_level; 102 return node->bn_level;
125} 103}
126 104
127static inline void 105static void
128nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) 106nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
129{ 107{
130 node->bn_level = level; 108 node->bn_level = level;
131} 109}
132 110
133static inline int 111static int nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
134nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
135{ 112{
136 return le16_to_cpu(node->bn_nchildren); 113 return le16_to_cpu(node->bn_nchildren);
137} 114}
138 115
139static inline void 116static void
140nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) 117nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
141{ 118{
142 node->bn_nchildren = cpu_to_le16(nchildren); 119 node->bn_nchildren = cpu_to_le16(nchildren);
143} 120}
144 121
145static inline int nilfs_btree_node_size(const struct nilfs_btree *btree) 122static int nilfs_btree_node_size(const struct nilfs_bmap *btree)
146{ 123{
147 return 1 << btree->bt_bmap.b_inode->i_blkbits; 124 return 1 << btree->b_inode->i_blkbits;
148} 125}
149 126
150static inline int 127static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree)
151nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node,
152 const struct nilfs_btree *btree)
153{ 128{
154 return nilfs_btree_node_root(node) ? 129 return btree->b_nchildren_per_block;
155 NILFS_BTREE_ROOT_NCHILDREN_MIN :
156 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
157} 130}
158 131
159static inline int 132static __le64 *
160nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node,
161 const struct nilfs_btree *btree)
162{
163 return nilfs_btree_node_root(node) ?
164 NILFS_BTREE_ROOT_NCHILDREN_MAX :
165 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
166}
167
168static inline __le64 *
169nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) 133nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
170{ 134{
171 return (__le64 *)((char *)(node + 1) + 135 return (__le64 *)((char *)(node + 1) +
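Note on the accessor hunk above: the node helpers stop deriving capacity from the btree object and take it as an ncmax argument instead. The reason is visible in nilfs_btree_node_dptrs(): the pointer array begins right after ncmax key slots, so the same accessors can serve both the small in-inode root and full-block nodes. A compilable sketch of that layout; the struct fields are simplifications and the EXTRA_PAD rule is omitted, so this is not the on-disk format.

    #include <stdint.h>
    #include <stdio.h>

    struct model_node {        /* simplified stand-in for nilfs_btree_node */
        uint8_t  bn_flags;
        uint8_t  bn_level;
        uint16_t bn_nchildren;
        uint32_t bn_pad;
    };

    static uint64_t *model_dkeys(struct model_node *node)
    {
        return (uint64_t *)(node + 1);     /* keys follow the header */
    }

    static uint64_t *model_dptrs(struct model_node *node, int ncmax)
    {
        return model_dkeys(node) + ncmax;  /* ptrs follow ncmax key slots */
    }

    int main(void)
    {
        static unsigned char block[4096];
        struct model_node *node = (struct model_node *)block;
        int ncmax = 10;

        model_dkeys(node)[0] = 42;
        model_dptrs(node, ncmax)[0] = 4242;
        printf("ptr[0] = %llu\n",
               (unsigned long long)model_dptrs(node, ncmax)[0]);
        return 0;
    }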
@@ -173,45 +137,40 @@ nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
173 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); 137 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
174} 138}
175 139
176static inline __le64 * 140static __le64 *
177nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, 141nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax)
178 const struct nilfs_btree *btree)
179{ 142{
180 return (__le64 *)(nilfs_btree_node_dkeys(node) + 143 return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax);
181 nilfs_btree_node_nchildren_max(node, btree));
182} 144}
183 145
184static inline __u64 146static __u64
185nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) 147nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
186{ 148{
187 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index)); 149 return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index));
188} 150}
189 151
190static inline void 152static void
191nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) 153nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
192{ 154{
193 *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key); 155 *(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key);
194} 156}
195 157
196static inline __u64 158static __u64
197nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, 159nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index,
198 const struct nilfs_btree_node *node, int index) 160 int ncmax)
199{ 161{
200 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) + 162 return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index));
201 index));
202} 163}
203 164
204static inline void 165static void
205nilfs_btree_node_set_ptr(struct nilfs_btree *btree, 166nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr,
206 struct nilfs_btree_node *node, int index, __u64 ptr) 167 int ncmax)
207{ 168{
208 *(nilfs_btree_node_dptrs(node, btree) + index) = 169 *(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr);
209 nilfs_bmap_ptr_to_dptr(ptr);
210} 170}
211 171
212static void nilfs_btree_node_init(struct nilfs_btree *btree, 172static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags,
213 struct nilfs_btree_node *node, 173 int level, int nchildren, int ncmax,
214 int flags, int level, int nchildren,
215 const __u64 *keys, const __u64 *ptrs) 174 const __u64 *keys, const __u64 *ptrs)
216{ 175{
217 __le64 *dkeys; 176 __le64 *dkeys;
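Note on the hunks above: the nilfs_bmap_key_to_dkey()/dkey_to_key() wrappers are dropped in favor of plain cpu_to_le64()/le64_to_cpu(), making explicit that on-disk keys and pointers are little-endian 64-bit values. Portable user-space equivalents, for illustration only:

    #include <stdint.h>
    #include <string.h>

    static void store_le64(void *dst, uint64_t v)   /* ~ cpu_to_le64 + store */
    {
        uint8_t b[8];
        for (int i = 0; i < 8; i++)
            b[i] = (uint8_t)(v >> (8 * i));
        memcpy(dst, b, 8);
    }

    static uint64_t load_le64(const void *src)      /* load + ~ le64_to_cpu */
    {
        const uint8_t *b = src;
        uint64_t v = 0;
        for (int i = 0; i < 8; i++)
            v |= (uint64_t)b[i] << (8 * i);
        return v;
    }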
@@ -223,29 +182,28 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree,
223 nilfs_btree_node_set_nchildren(node, nchildren); 182 nilfs_btree_node_set_nchildren(node, nchildren);
224 183
225 dkeys = nilfs_btree_node_dkeys(node); 184 dkeys = nilfs_btree_node_dkeys(node);
226 dptrs = nilfs_btree_node_dptrs(node, btree); 185 dptrs = nilfs_btree_node_dptrs(node, ncmax);
227 for (i = 0; i < nchildren; i++) { 186 for (i = 0; i < nchildren; i++) {
228 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); 187 dkeys[i] = cpu_to_le64(keys[i]);
229 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); 188 dptrs[i] = cpu_to_le64(ptrs[i]);
230 } 189 }
231} 190}
232 191
233/* Assume the buffer heads corresponding to left and right are locked. */ 192/* Assume the buffer heads corresponding to left and right are locked. */
234static void nilfs_btree_node_move_left(struct nilfs_btree *btree, 193static void nilfs_btree_node_move_left(struct nilfs_btree_node *left,
235 struct nilfs_btree_node *left,
236 struct nilfs_btree_node *right, 194 struct nilfs_btree_node *right,
237 int n) 195 int n, int lncmax, int rncmax)
238{ 196{
239 __le64 *ldkeys, *rdkeys; 197 __le64 *ldkeys, *rdkeys;
240 __le64 *ldptrs, *rdptrs; 198 __le64 *ldptrs, *rdptrs;
241 int lnchildren, rnchildren; 199 int lnchildren, rnchildren;
242 200
243 ldkeys = nilfs_btree_node_dkeys(left); 201 ldkeys = nilfs_btree_node_dkeys(left);
244 ldptrs = nilfs_btree_node_dptrs(left, btree); 202 ldptrs = nilfs_btree_node_dptrs(left, lncmax);
245 lnchildren = nilfs_btree_node_get_nchildren(left); 203 lnchildren = nilfs_btree_node_get_nchildren(left);
246 204
247 rdkeys = nilfs_btree_node_dkeys(right); 205 rdkeys = nilfs_btree_node_dkeys(right);
248 rdptrs = nilfs_btree_node_dptrs(right, btree); 206 rdptrs = nilfs_btree_node_dptrs(right, rncmax);
249 rnchildren = nilfs_btree_node_get_nchildren(right); 207 rnchildren = nilfs_btree_node_get_nchildren(right);
250 208
251 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); 209 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
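Note on nilfs_btree_node_move_left() above: the first n (key, ptr) pairs of the right node are appended to the left node with memcpy(), and the tail of the right node is then closed up with memmove() since source and destination overlap (that compaction falls outside the hunk shown). The same shape as a standalone model, assuming fixed-size arrays and plain int counts:

    #include <string.h>
    #include <stdint.h>

    struct model { uint64_t keys[16]; uint64_t ptrs[16]; int n; };

    /* Append the first n entries of `right` to `left`, then compact. */
    static void model_move_left(struct model *left, struct model *right, int n)
    {
        memcpy(left->keys + left->n, right->keys, n * sizeof(uint64_t));
        memcpy(left->ptrs + left->n, right->ptrs, n * sizeof(uint64_t));
        memmove(right->keys, right->keys + n,
                (right->n - n) * sizeof(uint64_t));
        memmove(right->ptrs, right->ptrs + n,
                (right->n - n) * sizeof(uint64_t));
        left->n += n;
        right->n -= n;
    }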
@@ -260,21 +218,20 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
260} 218}
261 219
262/* Assume that the buffer heads corresponding to left and right are locked. */ 220/* Assume that the buffer heads corresponding to left and right are locked. */
263static void nilfs_btree_node_move_right(struct nilfs_btree *btree, 221static void nilfs_btree_node_move_right(struct nilfs_btree_node *left,
264 struct nilfs_btree_node *left,
265 struct nilfs_btree_node *right, 222 struct nilfs_btree_node *right,
266 int n) 223 int n, int lncmax, int rncmax)
267{ 224{
268 __le64 *ldkeys, *rdkeys; 225 __le64 *ldkeys, *rdkeys;
269 __le64 *ldptrs, *rdptrs; 226 __le64 *ldptrs, *rdptrs;
270 int lnchildren, rnchildren; 227 int lnchildren, rnchildren;
271 228
272 ldkeys = nilfs_btree_node_dkeys(left); 229 ldkeys = nilfs_btree_node_dkeys(left);
273 ldptrs = nilfs_btree_node_dptrs(left, btree); 230 ldptrs = nilfs_btree_node_dptrs(left, lncmax);
274 lnchildren = nilfs_btree_node_get_nchildren(left); 231 lnchildren = nilfs_btree_node_get_nchildren(left);
275 232
276 rdkeys = nilfs_btree_node_dkeys(right); 233 rdkeys = nilfs_btree_node_dkeys(right);
277 rdptrs = nilfs_btree_node_dptrs(right, btree); 234 rdptrs = nilfs_btree_node_dptrs(right, rncmax);
278 rnchildren = nilfs_btree_node_get_nchildren(right); 235 rnchildren = nilfs_btree_node_get_nchildren(right);
279 236
280 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); 237 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
@@ -289,16 +246,15 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
289} 246}
290 247
291/* Assume that the buffer head corresponding to node is locked. */ 248/* Assume that the buffer head corresponding to node is locked. */
292static void nilfs_btree_node_insert(struct nilfs_btree *btree, 249static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index,
293 struct nilfs_btree_node *node, 250 __u64 key, __u64 ptr, int ncmax)
294 __u64 key, __u64 ptr, int index)
295{ 251{
296 __le64 *dkeys; 252 __le64 *dkeys;
297 __le64 *dptrs; 253 __le64 *dptrs;
298 int nchildren; 254 int nchildren;
299 255
300 dkeys = nilfs_btree_node_dkeys(node); 256 dkeys = nilfs_btree_node_dkeys(node);
301 dptrs = nilfs_btree_node_dptrs(node, btree); 257 dptrs = nilfs_btree_node_dptrs(node, ncmax);
302 nchildren = nilfs_btree_node_get_nchildren(node); 258 nchildren = nilfs_btree_node_get_nchildren(node);
303 if (index < nchildren) { 259 if (index < nchildren) {
304 memmove(dkeys + index + 1, dkeys + index, 260 memmove(dkeys + index + 1, dkeys + index,
@@ -306,16 +262,15 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
306 memmove(dptrs + index + 1, dptrs + index, 262 memmove(dptrs + index + 1, dptrs + index,
307 (nchildren - index) * sizeof(*dptrs)); 263 (nchildren - index) * sizeof(*dptrs));
308 } 264 }
309 dkeys[index] = nilfs_bmap_key_to_dkey(key); 265 dkeys[index] = cpu_to_le64(key);
310 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); 266 dptrs[index] = cpu_to_le64(ptr);
311 nchildren++; 267 nchildren++;
312 nilfs_btree_node_set_nchildren(node, nchildren); 268 nilfs_btree_node_set_nchildren(node, nchildren);
313} 269}
314 270
315/* Assume that the buffer head corresponding to node is locked. */ 271/* Assume that the buffer head corresponding to node is locked. */
316static void nilfs_btree_node_delete(struct nilfs_btree *btree, 272static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index,
317 struct nilfs_btree_node *node, 273 __u64 *keyp, __u64 *ptrp, int ncmax)
318 __u64 *keyp, __u64 *ptrp, int index)
319{ 274{
320 __u64 key; 275 __u64 key;
321 __u64 ptr; 276 __u64 ptr;
@@ -324,9 +279,9 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
324 int nchildren; 279 int nchildren;
325 280
326 dkeys = nilfs_btree_node_dkeys(node); 281 dkeys = nilfs_btree_node_dkeys(node);
327 dptrs = nilfs_btree_node_dptrs(node, btree); 282 dptrs = nilfs_btree_node_dptrs(node, ncmax);
328 key = nilfs_bmap_dkey_to_key(dkeys[index]); 283 key = le64_to_cpu(dkeys[index]);
329 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); 284 ptr = le64_to_cpu(dptrs[index]);
330 nchildren = nilfs_btree_node_get_nchildren(node); 285 nchildren = nilfs_btree_node_get_nchildren(node);
331 if (keyp != NULL) 286 if (keyp != NULL)
332 *keyp = key; 287 *keyp = key;
@@ -382,40 +337,92 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
382 return s == 0; 337 return s == 0;
383} 338}
384 339
385static inline struct nilfs_btree_node * 340/**
386nilfs_btree_get_root(const struct nilfs_btree *btree) 341 * nilfs_btree_node_broken - verify consistency of btree node
342 * @node: btree node block to be examined
343 * @size: node size (in bytes)
344 * @blocknr: block number
345 *
346 * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
347 */
348static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
349 size_t size, sector_t blocknr)
387{ 350{
388 return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data; 351 int level, flags, nchildren;
352 int ret = 0;
353
354 level = nilfs_btree_node_get_level(node);
355 flags = nilfs_btree_node_get_flags(node);
356 nchildren = nilfs_btree_node_get_nchildren(node);
357
358 if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
359 level >= NILFS_BTREE_LEVEL_MAX ||
360 (flags & NILFS_BTREE_NODE_ROOT) ||
361 nchildren < 0 ||
362 nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
363 printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): "
364 "level = %d, flags = 0x%x, nchildren = %d\n",
365 (unsigned long long)blocknr, level, flags, nchildren);
366 ret = 1;
367 }
368 return ret;
389} 369}
390 370
391static inline struct nilfs_btree_node * 371int nilfs_btree_broken_node_block(struct buffer_head *bh)
372{
373 int ret;
374
375 if (buffer_nilfs_checked(bh))
376 return 0;
377
378 ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data,
379 bh->b_size, bh->b_blocknr);
380 if (likely(!ret))
381 set_buffer_nilfs_checked(bh);
382 return ret;
383}
384
385static struct nilfs_btree_node *
386nilfs_btree_get_root(const struct nilfs_bmap *btree)
387{
388 return (struct nilfs_btree_node *)btree->b_u.u_data;
389}
390
391static struct nilfs_btree_node *
392nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level) 392nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
393{ 393{
394 return (struct nilfs_btree_node *)path[level].bp_bh->b_data; 394 return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
395} 395}
396 396
397static inline struct nilfs_btree_node * 397static struct nilfs_btree_node *
398nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level) 398nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
399{ 399{
400 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; 400 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
401} 401}
402 402
403static inline int nilfs_btree_height(const struct nilfs_btree *btree) 403static int nilfs_btree_height(const struct nilfs_bmap *btree)
404{ 404{
405 return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1; 405 return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
406} 406}
407 407
408static inline struct nilfs_btree_node * 408static struct nilfs_btree_node *
409nilfs_btree_get_node(const struct nilfs_btree *btree, 409nilfs_btree_get_node(const struct nilfs_bmap *btree,
410 const struct nilfs_btree_path *path, 410 const struct nilfs_btree_path *path,
411 int level) 411 int level, int *ncmaxp)
412{ 412{
413 return (level == nilfs_btree_height(btree) - 1) ? 413 struct nilfs_btree_node *node;
414 nilfs_btree_get_root(btree) : 414
415 nilfs_btree_get_nonroot_node(path, level); 415 if (level == nilfs_btree_height(btree) - 1) {
416 node = nilfs_btree_get_root(btree);
417 *ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX;
418 } else {
419 node = nilfs_btree_get_nonroot_node(path, level);
420 *ncmaxp = nilfs_btree_nchildren_per_block(btree);
421 }
422 return node;
416} 423}
417 424
418static inline int 425static int
419nilfs_btree_bad_node(struct nilfs_btree_node *node, int level) 426nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
420{ 427{
421 if (unlikely(nilfs_btree_node_get_level(node) != level)) { 428 if (unlikely(nilfs_btree_node_get_level(node) != level)) {
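Note on the hunk above: nilfs_btree_node_broken() rejects a node whose level, flags, or child count falls outside sane bounds, and nilfs_btree_broken_node_block() caches a passing result in a buffer flag so each block is verified once. A standalone restatement of the predicate; the constants are assumed values standing in for NILFS_BTREE_LEVEL_NODE_MIN, NILFS_BTREE_LEVEL_MAX, and NILFS_BTREE_NODE_NCHILDREN_MAX(size).

    #define LEVEL_NODE_MIN 1    /* assumed NILFS_BTREE_LEVEL_NODE_MIN */
    #define LEVEL_MAX      14   /* assumed NILFS_BTREE_LEVEL_MAX */
    #define NODE_ROOT      0x01 /* assumed NILFS_BTREE_NODE_ROOT */

    static int node_broken(int level, int flags, int nchildren,
                           int nchildren_max)
    {
        return level < LEVEL_NODE_MIN ||
               level >= LEVEL_MAX ||
               (flags & NODE_ROOT) ||      /* on-disk nodes are never root */
               nchildren < 0 ||
               nchildren > nchildren_max;
    }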
@@ -427,13 +434,83 @@ nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
427 return 0; 434 return 0;
428} 435}
429 436
430static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, 437struct nilfs_btree_readahead_info {
438 struct nilfs_btree_node *node; /* parent node */
439 int max_ra_blocks; /* max nof blocks to read ahead */
440 int index; /* current index on the parent node */
441 int ncmax; /* nof children in the parent node */
442};
443
444static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
445 struct buffer_head **bhp,
446 const struct nilfs_btree_readahead_info *ra)
447{
448 struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
449 struct buffer_head *bh, *ra_bh;
450 sector_t submit_ptr = 0;
451 int ret;
452
453 ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr);
454 if (ret) {
455 if (ret != -EEXIST)
456 return ret;
457 goto out_check;
458 }
459
460 if (ra) {
461 int i, n;
462 __u64 ptr2;
463
464 /* read ahead sibling nodes */
465 for (n = ra->max_ra_blocks, i = ra->index + 1;
466 n > 0 && i < ra->ncmax; n--, i++) {
467 ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax);
468
469 ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA,
470 &ra_bh, &submit_ptr);
471 if (likely(!ret || ret == -EEXIST))
472 brelse(ra_bh);
473 else if (ret != -EBUSY)
474 break;
475 if (!buffer_locked(bh))
476 goto out_no_wait;
477 }
478 }
479
480 wait_on_buffer(bh);
481
482 out_no_wait:
483 if (!buffer_uptodate(bh)) {
484 brelse(bh);
485 return -EIO;
486 }
487
488 out_check:
489 if (nilfs_btree_broken_node_block(bh)) {
490 clear_buffer_uptodate(bh);
491 brelse(bh);
492 return -EINVAL;
493 }
494
495 *bhp = bh;
496 return 0;
497}
498
499static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
500 struct buffer_head **bhp)
501{
502 return __nilfs_btree_get_block(btree, ptr, bhp, NULL);
503}
504
505static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree,
431 struct nilfs_btree_path *path, 506 struct nilfs_btree_path *path,
432 __u64 key, __u64 *ptrp, int minlevel) 507 __u64 key, __u64 *ptrp, int minlevel,
508 int readahead)
433{ 509{
434 struct nilfs_btree_node *node; 510 struct nilfs_btree_node *node;
511 struct nilfs_btree_readahead_info p, *ra;
435 __u64 ptr; 512 __u64 ptr;
436 int level, index, found, ret; 513 int level, index, found, ncmax, ret;
437 514
438 node = nilfs_btree_get_root(btree); 515 node = nilfs_btree_get_root(btree);
439 level = nilfs_btree_node_get_level(node); 516 level = nilfs_btree_node_get_level(node);
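Note on __nilfs_btree_get_block() above: the wanted block is submitted with READ, up to max_ra_blocks right-hand siblings are submitted best-effort with READA, and only the wanted block is waited on. A control-flow sketch with faked I/O; the blk + offset sibling mapping is an illustrative assumption (the kernel takes real pointers from the parent node), so this models the loop shape only.

    #include <stdio.h>

    enum { READ_MODE, READA_MODE };

    static int submit_stub(unsigned long long blk, int mode)
    {
        printf("%s block %llu\n",
               mode == READ_MODE ? "READ " : "READA", blk);
        return 0;
    }

    /* `index` is the wanted slot in a parent holding `ncmax` children. */
    static void get_block_with_ra(unsigned long long blk, int index,
                                  int ncmax, int max_ra_blocks)
    {
        submit_stub(blk, READ_MODE);                 /* the block we need */
        for (int n = max_ra_blocks, i = index + 1;
             n > 0 && i < ncmax; n--, i++)
            submit_stub(blk + (i - index), READA_MODE);  /* best effort */
        /* ...the wait_on_buffer() equivalent blocks here for `blk` only */
    }

    int main(void)
    {
        get_block_with_ra(1000, 2, 6, 7);  /* READ 1000, READA 1001..1003 */
        return 0;
    }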
@@ -441,14 +518,27 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
441 return -ENOENT; 518 return -ENOENT;
442 519
443 found = nilfs_btree_node_lookup(node, key, &index); 520 found = nilfs_btree_node_lookup(node, key, &index);
444 ptr = nilfs_btree_node_get_ptr(btree, node, index); 521 ptr = nilfs_btree_node_get_ptr(node, index,
522 NILFS_BTREE_ROOT_NCHILDREN_MAX);
445 path[level].bp_bh = NULL; 523 path[level].bp_bh = NULL;
446 path[level].bp_index = index; 524 path[level].bp_index = index;
447 525
448 for (level--; level >= minlevel; level--) { 526 ncmax = nilfs_btree_nchildren_per_block(btree);
449 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); 527
528 while (--level >= minlevel) {
529 ra = NULL;
530 if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) {
531 p.node = nilfs_btree_get_node(btree, path, level + 1,
532 &p.ncmax);
533 p.index = index;
534 p.max_ra_blocks = 7;
535 ra = &p;
536 }
537 ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh,
538 ra);
450 if (ret < 0) 539 if (ret < 0)
451 return ret; 540 return ret;
541
452 node = nilfs_btree_get_nonroot_node(path, level); 542 node = nilfs_btree_get_nonroot_node(path, level);
453 if (nilfs_btree_bad_node(node, level)) 543 if (nilfs_btree_bad_node(node, level))
454 return -EINVAL; 544 return -EINVAL;
@@ -456,9 +546,9 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
456 found = nilfs_btree_node_lookup(node, key, &index); 546 found = nilfs_btree_node_lookup(node, key, &index);
457 else 547 else
458 index = 0; 548 index = 0;
459 if (index < nilfs_btree_node_nchildren_max(node, btree)) 549 if (index < ncmax) {
460 ptr = nilfs_btree_node_get_ptr(btree, node, index); 550 ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
461 else { 551 } else {
462 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); 552 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
463 /* insert */ 553 /* insert */
464 ptr = NILFS_BMAP_INVALID_PTR; 554 ptr = NILFS_BMAP_INVALID_PTR;
@@ -474,22 +564,24 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
474 return 0; 564 return 0;
475} 565}
476 566
477static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, 567static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree,
478 struct nilfs_btree_path *path, 568 struct nilfs_btree_path *path,
479 __u64 *keyp, __u64 *ptrp) 569 __u64 *keyp, __u64 *ptrp)
480{ 570{
481 struct nilfs_btree_node *node; 571 struct nilfs_btree_node *node;
482 __u64 ptr; 572 __u64 ptr;
483 int index, level, ret; 573 int index, level, ncmax, ret;
484 574
485 node = nilfs_btree_get_root(btree); 575 node = nilfs_btree_get_root(btree);
486 index = nilfs_btree_node_get_nchildren(node) - 1; 576 index = nilfs_btree_node_get_nchildren(node) - 1;
487 if (index < 0) 577 if (index < 0)
488 return -ENOENT; 578 return -ENOENT;
489 level = nilfs_btree_node_get_level(node); 579 level = nilfs_btree_node_get_level(node);
490 ptr = nilfs_btree_node_get_ptr(btree, node, index); 580 ptr = nilfs_btree_node_get_ptr(node, index,
581 NILFS_BTREE_ROOT_NCHILDREN_MAX);
491 path[level].bp_bh = NULL; 582 path[level].bp_bh = NULL;
492 path[level].bp_index = index; 583 path[level].bp_index = index;
584 ncmax = nilfs_btree_nchildren_per_block(btree);
493 585
494 for (level--; level > 0; level--) { 586 for (level--; level > 0; level--) {
495 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); 587 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
@@ -499,7 +591,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
499 if (nilfs_btree_bad_node(node, level)) 591 if (nilfs_btree_bad_node(node, level))
500 return -EINVAL; 592 return -EINVAL;
501 index = nilfs_btree_node_get_nchildren(node) - 1; 593 index = nilfs_btree_node_get_nchildren(node) - 1;
502 ptr = nilfs_btree_node_get_ptr(btree, node, index); 594 ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
503 path[level].bp_index = index; 595 path[level].bp_index = index;
504 } 596 }
505 597
@@ -511,51 +603,45 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
511 return 0; 603 return 0;
512} 604}
513 605
514static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, 606static int nilfs_btree_lookup(const struct nilfs_bmap *btree,
515 __u64 key, int level, __u64 *ptrp) 607 __u64 key, int level, __u64 *ptrp)
516{ 608{
517 struct nilfs_btree *btree;
518 struct nilfs_btree_path *path; 609 struct nilfs_btree_path *path;
519 __u64 ptr;
520 int ret; 610 int ret;
521 611
522 btree = (struct nilfs_btree *)bmap;
523 path = nilfs_btree_alloc_path(); 612 path = nilfs_btree_alloc_path();
524 if (path == NULL) 613 if (path == NULL)
525 return -ENOMEM; 614 return -ENOMEM;
526 615
527 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 616 ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0);
528
529 if (ptrp != NULL)
530 *ptrp = ptr;
531 617
532 nilfs_btree_free_path(path); 618 nilfs_btree_free_path(path);
533 619
534 return ret; 620 return ret;
535} 621}
536 622
537static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, 623static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
538 __u64 key, __u64 *ptrp, unsigned maxblocks) 624 __u64 key, __u64 *ptrp, unsigned maxblocks)
539{ 625{
540 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
541 struct nilfs_btree_path *path; 626 struct nilfs_btree_path *path;
542 struct nilfs_btree_node *node; 627 struct nilfs_btree_node *node;
543 struct inode *dat = NULL; 628 struct inode *dat = NULL;
544 __u64 ptr, ptr2; 629 __u64 ptr, ptr2;
545 sector_t blocknr; 630 sector_t blocknr;
546 int level = NILFS_BTREE_LEVEL_NODE_MIN; 631 int level = NILFS_BTREE_LEVEL_NODE_MIN;
547 int ret, cnt, index, maxlevel; 632 int ret, cnt, index, maxlevel, ncmax;
633 struct nilfs_btree_readahead_info p;
548 634
549 path = nilfs_btree_alloc_path(); 635 path = nilfs_btree_alloc_path();
550 if (path == NULL) 636 if (path == NULL)
551 return -ENOMEM; 637 return -ENOMEM;
552 638
553 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 639 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1);
554 if (ret < 0) 640 if (ret < 0)
555 goto out; 641 goto out;
556 642
557 if (NILFS_BMAP_USE_VBN(bmap)) { 643 if (NILFS_BMAP_USE_VBN(btree)) {
558 dat = nilfs_bmap_get_dat(bmap); 644 dat = nilfs_bmap_get_dat(btree);
559 ret = nilfs_dat_translate(dat, ptr, &blocknr); 645 ret = nilfs_dat_translate(dat, ptr, &blocknr);
560 if (ret < 0) 646 if (ret < 0)
561 goto out; 647 goto out;
@@ -566,14 +652,14 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
566 goto end; 652 goto end;
567 653
568 maxlevel = nilfs_btree_height(btree) - 1; 654 maxlevel = nilfs_btree_height(btree) - 1;
569 node = nilfs_btree_get_node(btree, path, level); 655 node = nilfs_btree_get_node(btree, path, level, &ncmax);
570 index = path[level].bp_index + 1; 656 index = path[level].bp_index + 1;
571 for (;;) { 657 for (;;) {
572 while (index < nilfs_btree_node_get_nchildren(node)) { 658 while (index < nilfs_btree_node_get_nchildren(node)) {
573 if (nilfs_btree_node_get_key(node, index) != 659 if (nilfs_btree_node_get_key(node, index) !=
574 key + cnt) 660 key + cnt)
575 goto end; 661 goto end;
576 ptr2 = nilfs_btree_node_get_ptr(btree, node, index); 662 ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax);
577 if (dat) { 663 if (dat) {
578 ret = nilfs_dat_translate(dat, ptr2, &blocknr); 664 ret = nilfs_dat_translate(dat, ptr2, &blocknr);
579 if (ret < 0) 665 if (ret < 0)
@@ -589,20 +675,24 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
589 break; 675 break;
590 676
591 /* look-up right sibling node */ 677 /* look-up right sibling node */
592 node = nilfs_btree_get_node(btree, path, level + 1); 678 p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax);
593 index = path[level + 1].bp_index + 1; 679 p.index = path[level + 1].bp_index + 1;
594 if (index >= nilfs_btree_node_get_nchildren(node) || 680 p.max_ra_blocks = 7;
595 nilfs_btree_node_get_key(node, index) != key + cnt) 681 if (p.index >= nilfs_btree_node_get_nchildren(p.node) ||
682 nilfs_btree_node_get_key(p.node, p.index) != key + cnt)
596 break; 683 break;
597 ptr2 = nilfs_btree_node_get_ptr(btree, node, index); 684 ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax);
598 path[level + 1].bp_index = index; 685 path[level + 1].bp_index = p.index;
599 686
600 brelse(path[level].bp_bh); 687 brelse(path[level].bp_bh);
601 path[level].bp_bh = NULL; 688 path[level].bp_bh = NULL;
602 ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh); 689
690 ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh,
691 &p);
603 if (ret < 0) 692 if (ret < 0)
604 goto out; 693 goto out;
605 node = nilfs_btree_get_nonroot_node(path, level); 694 node = nilfs_btree_get_nonroot_node(path, level);
695 ncmax = nilfs_btree_nchildren_per_block(btree);
606 index = 0; 696 index = 0;
607 path[level].bp_index = index; 697 path[level].bp_index = index;
608 } 698 }
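Note on the lookup_contig hunks above: the scan counts how many consecutive logical keys follow the looked-up one, walking right through the leaf and prefetching the sibling leaf via the readahead info. A simplified model of the counting step alone; it ignores the DAT translation and physical-contiguity check the kernel also performs.

    #include <stdint.h>

    static unsigned count_contig(const uint64_t *keys, int nchildren,
                                 int start, uint64_t key, unsigned maxblocks)
    {
        unsigned cnt = 1;                   /* the looked-up block itself */
        for (int i = start + 1; i < nchildren && cnt < maxblocks; i++, cnt++)
            if (keys[i] != key + cnt)       /* run of keys breaks here */
                break;
        return cnt;
    }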
@@ -614,7 +704,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
614 return ret; 704 return ret;
615} 705}
616 706
617static void nilfs_btree_promote_key(struct nilfs_btree *btree, 707static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
618 struct nilfs_btree_path *path, 708 struct nilfs_btree_path *path,
619 int level, __u64 key) 709 int level, __u64 key)
620{ 710{
@@ -636,16 +726,18 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
636 } 726 }
637} 727}
638 728
639static void nilfs_btree_do_insert(struct nilfs_btree *btree, 729static void nilfs_btree_do_insert(struct nilfs_bmap *btree,
640 struct nilfs_btree_path *path, 730 struct nilfs_btree_path *path,
641 int level, __u64 *keyp, __u64 *ptrp) 731 int level, __u64 *keyp, __u64 *ptrp)
642{ 732{
643 struct nilfs_btree_node *node; 733 struct nilfs_btree_node *node;
734 int ncblk;
644 735
645 if (level < nilfs_btree_height(btree) - 1) { 736 if (level < nilfs_btree_height(btree) - 1) {
646 node = nilfs_btree_get_nonroot_node(path, level); 737 node = nilfs_btree_get_nonroot_node(path, level);
647 nilfs_btree_node_insert(btree, node, *keyp, *ptrp, 738 ncblk = nilfs_btree_nchildren_per_block(btree);
648 path[level].bp_index); 739 nilfs_btree_node_insert(node, path[level].bp_index,
740 *keyp, *ptrp, ncblk);
649 if (!buffer_dirty(path[level].bp_bh)) 741 if (!buffer_dirty(path[level].bp_bh))
650 nilfs_btnode_mark_dirty(path[level].bp_bh); 742 nilfs_btnode_mark_dirty(path[level].bp_bh);
651 743
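Note on nilfs_btree_do_insert() above: non-root nodes are inserted into with the per-block capacity ncblk, the root with NILFS_BTREE_ROOT_NCHILDREN_MAX, and in both cases nilfs_btree_node_insert() opens a gap at the target slot. The gap-opening step as a standalone model (capacity bookkeeping omitted):

    #include <string.h>
    #include <stdint.h>

    static void node_insert(uint64_t *keys, uint64_t *ptrs, int *nchildren,
                            int index, uint64_t key, uint64_t ptr)
    {
        if (index < *nchildren) {           /* shift the tail right by one */
            memmove(keys + index + 1, keys + index,
                    (*nchildren - index) * sizeof(*keys));
            memmove(ptrs + index + 1, ptrs + index,
                    (*nchildren - index) * sizeof(*ptrs));
        }
        keys[index] = key;
        ptrs[index] = ptr;
        (*nchildren)++;
    }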
@@ -655,22 +747,24 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
655 0)); 747 0));
656 } else { 748 } else {
657 node = nilfs_btree_get_root(btree); 749 node = nilfs_btree_get_root(btree);
658 nilfs_btree_node_insert(btree, node, *keyp, *ptrp, 750 nilfs_btree_node_insert(node, path[level].bp_index,
659 path[level].bp_index); 751 *keyp, *ptrp,
752 NILFS_BTREE_ROOT_NCHILDREN_MAX);
660 } 753 }
661} 754}
662 755
663static void nilfs_btree_carry_left(struct nilfs_btree *btree, 756static void nilfs_btree_carry_left(struct nilfs_bmap *btree,
664 struct nilfs_btree_path *path, 757 struct nilfs_btree_path *path,
665 int level, __u64 *keyp, __u64 *ptrp) 758 int level, __u64 *keyp, __u64 *ptrp)
666{ 759{
667 struct nilfs_btree_node *node, *left; 760 struct nilfs_btree_node *node, *left;
668 int nchildren, lnchildren, n, move; 761 int nchildren, lnchildren, n, move, ncblk;
669 762
670 node = nilfs_btree_get_nonroot_node(path, level); 763 node = nilfs_btree_get_nonroot_node(path, level);
671 left = nilfs_btree_get_sib_node(path, level); 764 left = nilfs_btree_get_sib_node(path, level);
672 nchildren = nilfs_btree_node_get_nchildren(node); 765 nchildren = nilfs_btree_node_get_nchildren(node);
673 lnchildren = nilfs_btree_node_get_nchildren(left); 766 lnchildren = nilfs_btree_node_get_nchildren(left);
767 ncblk = nilfs_btree_nchildren_per_block(btree);
674 move = 0; 768 move = 0;
675 769
676 n = (nchildren + lnchildren + 1) / 2 - lnchildren; 770 n = (nchildren + lnchildren + 1) / 2 - lnchildren;
@@ -680,7 +774,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
680 move = 1; 774 move = 1;
681 } 775 }
682 776
683 nilfs_btree_node_move_left(btree, left, node, n); 777 nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
684 778
685 if (!buffer_dirty(path[level].bp_bh)) 779 if (!buffer_dirty(path[level].bp_bh))
686 nilfs_btnode_mark_dirty(path[level].bp_bh); 780 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -705,17 +799,18 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
705 nilfs_btree_do_insert(btree, path, level, keyp, ptrp); 799 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
706} 800}
707 801
708static void nilfs_btree_carry_right(struct nilfs_btree *btree, 802static void nilfs_btree_carry_right(struct nilfs_bmap *btree,
709 struct nilfs_btree_path *path, 803 struct nilfs_btree_path *path,
710 int level, __u64 *keyp, __u64 *ptrp) 804 int level, __u64 *keyp, __u64 *ptrp)
711{ 805{
712 struct nilfs_btree_node *node, *right; 806 struct nilfs_btree_node *node, *right;
713 int nchildren, rnchildren, n, move; 807 int nchildren, rnchildren, n, move, ncblk;
714 808
715 node = nilfs_btree_get_nonroot_node(path, level); 809 node = nilfs_btree_get_nonroot_node(path, level);
716 right = nilfs_btree_get_sib_node(path, level); 810 right = nilfs_btree_get_sib_node(path, level);
717 nchildren = nilfs_btree_node_get_nchildren(node); 811 nchildren = nilfs_btree_node_get_nchildren(node);
718 rnchildren = nilfs_btree_node_get_nchildren(right); 812 rnchildren = nilfs_btree_node_get_nchildren(right);
813 ncblk = nilfs_btree_nchildren_per_block(btree);
719 move = 0; 814 move = 0;
720 815
721 n = (nchildren + rnchildren + 1) / 2 - rnchildren; 816 n = (nchildren + rnchildren + 1) / 2 - rnchildren;
@@ -725,7 +820,7 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
725 move = 1; 820 move = 1;
726 } 821 }
727 822
728 nilfs_btree_node_move_right(btree, node, right, n); 823 nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
729 824
730 if (!buffer_dirty(path[level].bp_bh)) 825 if (!buffer_dirty(path[level].bp_bh))
731 nilfs_btnode_mark_dirty(path[level].bp_bh); 826 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -751,18 +846,19 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
751 nilfs_btree_do_insert(btree, path, level, keyp, ptrp); 846 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
752} 847}
753 848
754static void nilfs_btree_split(struct nilfs_btree *btree, 849static void nilfs_btree_split(struct nilfs_bmap *btree,
755 struct nilfs_btree_path *path, 850 struct nilfs_btree_path *path,
756 int level, __u64 *keyp, __u64 *ptrp) 851 int level, __u64 *keyp, __u64 *ptrp)
757{ 852{
758 struct nilfs_btree_node *node, *right; 853 struct nilfs_btree_node *node, *right;
759 __u64 newkey; 854 __u64 newkey;
760 __u64 newptr; 855 __u64 newptr;
761 int nchildren, n, move; 856 int nchildren, n, move, ncblk;
762 857
763 node = nilfs_btree_get_nonroot_node(path, level); 858 node = nilfs_btree_get_nonroot_node(path, level);
764 right = nilfs_btree_get_sib_node(path, level); 859 right = nilfs_btree_get_sib_node(path, level);
765 nchildren = nilfs_btree_node_get_nchildren(node); 860 nchildren = nilfs_btree_node_get_nchildren(node);
861 ncblk = nilfs_btree_nchildren_per_block(btree);
766 move = 0; 862 move = 0;
767 863
768 n = (nchildren + 1) / 2; 864 n = (nchildren + 1) / 2;
@@ -771,7 +867,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
771 move = 1; 867 move = 1;
772 } 868 }
773 869
774 nilfs_btree_node_move_right(btree, node, right, n); 870 nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
775 871
776 if (!buffer_dirty(path[level].bp_bh)) 872 if (!buffer_dirty(path[level].bp_bh))
777 nilfs_btnode_mark_dirty(path[level].bp_bh); 873 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -783,8 +879,8 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
783 879
784 if (move) { 880 if (move) {
785 path[level].bp_index -= nilfs_btree_node_get_nchildren(node); 881 path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
786 nilfs_btree_node_insert(btree, right, *keyp, *ptrp, 882 nilfs_btree_node_insert(right, path[level].bp_index,
787 path[level].bp_index); 883 *keyp, *ptrp, ncblk);
788 884
789 *keyp = nilfs_btree_node_get_key(right, 0); 885 *keyp = nilfs_btree_node_get_key(right, 0);
790 *ptrp = path[level].bp_newreq.bpr_ptr; 886 *ptrp = path[level].bp_newreq.bpr_ptr;
@@ -805,19 +901,21 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
805 path[level + 1].bp_index++; 901 path[level + 1].bp_index++;
806} 902}
807 903
808static void nilfs_btree_grow(struct nilfs_btree *btree, 904static void nilfs_btree_grow(struct nilfs_bmap *btree,
809 struct nilfs_btree_path *path, 905 struct nilfs_btree_path *path,
810 int level, __u64 *keyp, __u64 *ptrp) 906 int level, __u64 *keyp, __u64 *ptrp)
811{ 907{
812 struct nilfs_btree_node *root, *child; 908 struct nilfs_btree_node *root, *child;
813 int n; 909 int n, ncblk;
814 910
815 root = nilfs_btree_get_root(btree); 911 root = nilfs_btree_get_root(btree);
816 child = nilfs_btree_get_sib_node(path, level); 912 child = nilfs_btree_get_sib_node(path, level);
913 ncblk = nilfs_btree_nchildren_per_block(btree);
817 914
818 n = nilfs_btree_node_get_nchildren(root); 915 n = nilfs_btree_node_get_nchildren(root);
819 916
820 nilfs_btree_node_move_right(btree, root, child, n); 917 nilfs_btree_node_move_right(root, child, n,
918 NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
821 nilfs_btree_node_set_level(root, level + 1); 919 nilfs_btree_node_set_level(root, level + 1);
822 920
823 if (!buffer_dirty(path[level].bp_sib_bh)) 921 if (!buffer_dirty(path[level].bp_sib_bh))
@@ -832,11 +930,11 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
832 *ptrp = path[level].bp_newreq.bpr_ptr; 930 *ptrp = path[level].bp_newreq.bpr_ptr;
833} 931}
834 932
835static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree, 933static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree,
836 const struct nilfs_btree_path *path) 934 const struct nilfs_btree_path *path)
837{ 935{
838 struct nilfs_btree_node *node; 936 struct nilfs_btree_node *node;
839 int level; 937 int level, ncmax;
840 938
841 if (path == NULL) 939 if (path == NULL)
842 return NILFS_BMAP_INVALID_PTR; 940 return NILFS_BMAP_INVALID_PTR;
@@ -844,29 +942,30 @@ static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
844 /* left sibling */ 942 /* left sibling */
845 level = NILFS_BTREE_LEVEL_NODE_MIN; 943 level = NILFS_BTREE_LEVEL_NODE_MIN;
846 if (path[level].bp_index > 0) { 944 if (path[level].bp_index > 0) {
847 node = nilfs_btree_get_node(btree, path, level); 945 node = nilfs_btree_get_node(btree, path, level, &ncmax);
848 return nilfs_btree_node_get_ptr(btree, node, 946 return nilfs_btree_node_get_ptr(node,
849 path[level].bp_index - 1); 947 path[level].bp_index - 1,
948 ncmax);
850 } 949 }
851 950
852 /* parent */ 951 /* parent */
853 level = NILFS_BTREE_LEVEL_NODE_MIN + 1; 952 level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
854 if (level <= nilfs_btree_height(btree) - 1) { 953 if (level <= nilfs_btree_height(btree) - 1) {
855 node = nilfs_btree_get_node(btree, path, level); 954 node = nilfs_btree_get_node(btree, path, level, &ncmax);
856 return nilfs_btree_node_get_ptr(btree, node, 955 return nilfs_btree_node_get_ptr(node, path[level].bp_index,
857 path[level].bp_index); 956 ncmax);
858 } 957 }
859 958
860 return NILFS_BMAP_INVALID_PTR; 959 return NILFS_BMAP_INVALID_PTR;
861} 960}
862 961
863static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree, 962static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree,
864 const struct nilfs_btree_path *path, 963 const struct nilfs_btree_path *path,
865 __u64 key) 964 __u64 key)
866{ 965{
867 __u64 ptr; 966 __u64 ptr;
868 967
869 ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key); 968 ptr = nilfs_bmap_find_target_seq(btree, key);
870 if (ptr != NILFS_BMAP_INVALID_PTR) 969 if (ptr != NILFS_BMAP_INVALID_PTR)
871 /* sequential access */ 970 /* sequential access */
872 return ptr; 971 return ptr;
@@ -877,17 +976,10 @@ static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
877 return ptr; 976 return ptr;
878 } 977 }
879 /* block group */ 978 /* block group */
880 return nilfs_bmap_find_target_in_group(&btree->bt_bmap); 979 return nilfs_bmap_find_target_in_group(btree);
881}
882
883static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key,
884 __u64 ptr)
885{
886 btree->bt_bmap.b_last_allocated_key = key;
887 btree->bt_bmap.b_last_allocated_ptr = ptr;
888} 980}
889 981
890static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, 982static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree,
891 struct nilfs_btree_path *path, 983 struct nilfs_btree_path *path,
892 int *levelp, __u64 key, __u64 ptr, 984 int *levelp, __u64 key, __u64 ptr,
893 struct nilfs_bmap_stats *stats) 985 struct nilfs_bmap_stats *stats)
@@ -895,79 +987,78 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
895 struct buffer_head *bh; 987 struct buffer_head *bh;
896 struct nilfs_btree_node *node, *parent, *sib; 988 struct nilfs_btree_node *node, *parent, *sib;
897 __u64 sibptr; 989 __u64 sibptr;
898 int pindex, level, ret; 990 int pindex, level, ncmax, ncblk, ret;
899 struct inode *dat = NULL; 991 struct inode *dat = NULL;
900 992
901 stats->bs_nblocks = 0; 993 stats->bs_nblocks = 0;
902 level = NILFS_BTREE_LEVEL_DATA; 994 level = NILFS_BTREE_LEVEL_DATA;
903 995
904 /* allocate a new ptr for data block */ 996 /* allocate a new ptr for data block */
905 if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { 997 if (NILFS_BMAP_USE_VBN(btree)) {
906 path[level].bp_newreq.bpr_ptr = 998 path[level].bp_newreq.bpr_ptr =
907 nilfs_btree_find_target_v(btree, path, key); 999 nilfs_btree_find_target_v(btree, path, key);
908 dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1000 dat = nilfs_bmap_get_dat(btree);
909 } 1001 }
910 1002
911 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 1003 ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
912 &path[level].bp_newreq, dat);
913 if (ret < 0) 1004 if (ret < 0)
914 goto err_out_data; 1005 goto err_out_data;
915 1006
1007 ncblk = nilfs_btree_nchildren_per_block(btree);
1008
916 for (level = NILFS_BTREE_LEVEL_NODE_MIN; 1009 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
917 level < nilfs_btree_height(btree) - 1; 1010 level < nilfs_btree_height(btree) - 1;
918 level++) { 1011 level++) {
919 node = nilfs_btree_get_nonroot_node(path, level); 1012 node = nilfs_btree_get_nonroot_node(path, level);
920 if (nilfs_btree_node_get_nchildren(node) < 1013 if (nilfs_btree_node_get_nchildren(node) < ncblk) {
921 nilfs_btree_node_nchildren_max(node, btree)) {
922 path[level].bp_op = nilfs_btree_do_insert; 1014 path[level].bp_op = nilfs_btree_do_insert;
923 stats->bs_nblocks++; 1015 stats->bs_nblocks++;
924 goto out; 1016 goto out;
925 } 1017 }
926 1018
927 parent = nilfs_btree_get_node(btree, path, level + 1); 1019 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
928 pindex = path[level + 1].bp_index; 1020 pindex = path[level + 1].bp_index;
929 1021
930 /* left sibling */ 1022 /* left sibling */
931 if (pindex > 0) { 1023 if (pindex > 0) {
932 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1024 sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
933 pindex - 1); 1025 ncmax);
934 ret = nilfs_btree_get_block(btree, sibptr, &bh); 1026 ret = nilfs_btree_get_block(btree, sibptr, &bh);
935 if (ret < 0) 1027 if (ret < 0)
936 goto err_out_child_node; 1028 goto err_out_child_node;
937 sib = (struct nilfs_btree_node *)bh->b_data; 1029 sib = (struct nilfs_btree_node *)bh->b_data;
938 if (nilfs_btree_node_get_nchildren(sib) < 1030 if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
939 nilfs_btree_node_nchildren_max(sib, btree)) {
940 path[level].bp_sib_bh = bh; 1031 path[level].bp_sib_bh = bh;
941 path[level].bp_op = nilfs_btree_carry_left; 1032 path[level].bp_op = nilfs_btree_carry_left;
942 stats->bs_nblocks++; 1033 stats->bs_nblocks++;
943 goto out; 1034 goto out;
944 } else 1035 } else {
945 brelse(bh); 1036 brelse(bh);
1037 }
946 } 1038 }
947 1039
948 /* right sibling */ 1040 /* right sibling */
949 if (pindex < 1041 if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) {
950 nilfs_btree_node_get_nchildren(parent) - 1) { 1042 sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
951 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1043 ncmax);
952 pindex + 1);
953 ret = nilfs_btree_get_block(btree, sibptr, &bh); 1044 ret = nilfs_btree_get_block(btree, sibptr, &bh);
954 if (ret < 0) 1045 if (ret < 0)
955 goto err_out_child_node; 1046 goto err_out_child_node;
956 sib = (struct nilfs_btree_node *)bh->b_data; 1047 sib = (struct nilfs_btree_node *)bh->b_data;
957 if (nilfs_btree_node_get_nchildren(sib) < 1048 if (nilfs_btree_node_get_nchildren(sib) < ncblk) {
958 nilfs_btree_node_nchildren_max(sib, btree)) {
959 path[level].bp_sib_bh = bh; 1049 path[level].bp_sib_bh = bh;
960 path[level].bp_op = nilfs_btree_carry_right; 1050 path[level].bp_op = nilfs_btree_carry_right;
961 stats->bs_nblocks++; 1051 stats->bs_nblocks++;
962 goto out; 1052 goto out;
963 } else 1053 } else {
964 brelse(bh); 1054 brelse(bh);
1055 }
965 } 1056 }
966 1057
967 /* split */ 1058 /* split */
968 path[level].bp_newreq.bpr_ptr = 1059 path[level].bp_newreq.bpr_ptr =
969 path[level - 1].bp_newreq.bpr_ptr + 1; 1060 path[level - 1].bp_newreq.bpr_ptr + 1;
970 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 1061 ret = nilfs_bmap_prepare_alloc_ptr(btree,
971 &path[level].bp_newreq, dat); 1062 &path[level].bp_newreq, dat);
972 if (ret < 0) 1063 if (ret < 0)
973 goto err_out_child_node; 1064 goto err_out_child_node;
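Note on nilfs_btree_prepare_insert() above: at each level the cheapest rebalancing is chosen before resorting to a split. The per-level decision, restated as a sketch (the parameter names are illustrative stand-ins for the node and sibling child counts read in the hunk):

    enum op { OP_INSERT, OP_CARRY_LEFT, OP_CARRY_RIGHT, OP_SPLIT };

    static enum op choose_op(int nchildren, int ncblk,
                             int has_left, int left_nchildren,
                             int has_right, int right_nchildren)
    {
        if (nchildren < ncblk)
            return OP_INSERT;       /* free slot: plain insert, done */
        if (has_left && left_nchildren < ncblk)
            return OP_CARRY_LEFT;   /* shift entries to the left sibling */
        if (has_right && right_nchildren < ncblk)
            return OP_CARRY_RIGHT;  /* shift entries to the right sibling */
        return OP_SPLIT;            /* split, then retry one level up */
    }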
@@ -979,9 +1070,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
979 1070
980 stats->bs_nblocks++; 1071 stats->bs_nblocks++;
981 1072
982 nilfs_btree_node_init(btree, 1073 sib = (struct nilfs_btree_node *)bh->b_data;
983 (struct nilfs_btree_node *)bh->b_data, 1074 nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL);
984 0, level, 0, NULL, NULL);
985 path[level].bp_sib_bh = bh; 1075 path[level].bp_sib_bh = bh;
986 path[level].bp_op = nilfs_btree_split; 1076 path[level].bp_op = nilfs_btree_split;
987 } 1077 }
@@ -989,7 +1079,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
989 /* root */ 1079 /* root */
990 node = nilfs_btree_get_root(btree); 1080 node = nilfs_btree_get_root(btree);
991 if (nilfs_btree_node_get_nchildren(node) < 1081 if (nilfs_btree_node_get_nchildren(node) <
992 nilfs_btree_node_nchildren_max(node, btree)) { 1082 NILFS_BTREE_ROOT_NCHILDREN_MAX) {
993 path[level].bp_op = nilfs_btree_do_insert; 1083 path[level].bp_op = nilfs_btree_do_insert;
994 stats->bs_nblocks++; 1084 stats->bs_nblocks++;
995 goto out; 1085 goto out;
@@ -997,8 +1087,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
997 1087
998 /* grow */ 1088 /* grow */
999 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; 1089 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
1000 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 1090 ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat);
1001 &path[level].bp_newreq, dat);
1002 if (ret < 0) 1091 if (ret < 0)
1003 goto err_out_child_node; 1092 goto err_out_child_node;
1004 ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, 1093 ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
@@ -1006,8 +1095,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1006 if (ret < 0) 1095 if (ret < 0)
1007 goto err_out_curr_node; 1096 goto err_out_curr_node;
1008 1097
1009 nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, 1098 nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data,
1010 0, level, 0, NULL, NULL); 1099 0, level, 0, ncblk, NULL, NULL);
1011 path[level].bp_sib_bh = bh; 1100 path[level].bp_sib_bh = bh;
1012 path[level].bp_op = nilfs_btree_grow; 1101 path[level].bp_op = nilfs_btree_grow;
1013 1102
@@ -1024,25 +1113,22 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1024 1113
1025 /* error */ 1114 /* error */
1026 err_out_curr_node: 1115 err_out_curr_node:
1027 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, 1116 nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
1028 dat);
1029 err_out_child_node: 1117 err_out_child_node:
1030 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { 1118 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
1031 nilfs_btnode_delete(path[level].bp_sib_bh); 1119 nilfs_btnode_delete(path[level].bp_sib_bh);
1032 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, 1120 nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
1033 &path[level].bp_newreq, dat);
1034 1121
1035 } 1122 }
1036 1123
1037 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, 1124 nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat);
1038 dat);
1039 err_out_data: 1125 err_out_data:
1040 *levelp = level; 1126 *levelp = level;
1041 stats->bs_nblocks = 0; 1127 stats->bs_nblocks = 0;
1042 return ret; 1128 return ret;
1043} 1129}
1044 1130
1045static void nilfs_btree_commit_insert(struct nilfs_btree *btree, 1131static void nilfs_btree_commit_insert(struct nilfs_bmap *btree,
1046 struct nilfs_btree_path *path, 1132 struct nilfs_btree_path *path,
1047 int maxlevel, __u64 key, __u64 ptr) 1133 int maxlevel, __u64 key, __u64 ptr)
1048{ 1134{
@@ -1051,35 +1137,33 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
1051 1137
1052 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); 1138 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1053 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; 1139 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
1054 if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { 1140 if (NILFS_BMAP_USE_VBN(btree)) {
1055 nilfs_btree_set_target_v(btree, key, ptr); 1141 nilfs_bmap_set_target_v(btree, key, ptr);
1056 dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1142 dat = nilfs_bmap_get_dat(btree);
1057 } 1143 }
1058 1144
1059 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { 1145 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1060 nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, 1146 nilfs_bmap_commit_alloc_ptr(btree,
1061 &path[level - 1].bp_newreq, dat); 1147 &path[level - 1].bp_newreq, dat);
1062 path[level].bp_op(btree, path, level, &key, &ptr); 1148 path[level].bp_op(btree, path, level, &key, &ptr);
1063 } 1149 }
1064 1150
1065 if (!nilfs_bmap_dirty(&btree->bt_bmap)) 1151 if (!nilfs_bmap_dirty(btree))
1066 nilfs_bmap_set_dirty(&btree->bt_bmap); 1152 nilfs_bmap_set_dirty(btree);
1067} 1153}
1068 1154
1069static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 1155static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr)
1070{ 1156{
1071 struct nilfs_btree *btree;
1072 struct nilfs_btree_path *path; 1157 struct nilfs_btree_path *path;
1073 struct nilfs_bmap_stats stats; 1158 struct nilfs_bmap_stats stats;
1074 int level, ret; 1159 int level, ret;
1075 1160
1076 btree = (struct nilfs_btree *)bmap;
1077 path = nilfs_btree_alloc_path(); 1161 path = nilfs_btree_alloc_path();
1078 if (path == NULL) 1162 if (path == NULL)
1079 return -ENOMEM; 1163 return -ENOMEM;
1080 1164
1081 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1165 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1082 NILFS_BTREE_LEVEL_NODE_MIN); 1166 NILFS_BTREE_LEVEL_NODE_MIN, 0);
1083 if (ret != -ENOENT) { 1167 if (ret != -ENOENT) {
1084 if (ret == 0) 1168 if (ret == 0)
1085 ret = -EEXIST; 1169 ret = -EEXIST;
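Note on the insert entry point above: the preliminary lookup is expected to fail with -ENOENT; success means the key already exists and is converted to -EEXIST. The guard, isolated as a sketch:

    #include <errno.h>

    static int insert_guard(int lookup_ret)
    {
        if (lookup_ret != -ENOENT)
            return lookup_ret == 0 ? -EEXIST : lookup_ret;
        return 0;    /* key absent: safe to proceed with the insert */
    }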
@@ -1090,23 +1174,25 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1090 if (ret < 0) 1174 if (ret < 0)
1091 goto out; 1175 goto out;
1092 nilfs_btree_commit_insert(btree, path, level, key, ptr); 1176 nilfs_btree_commit_insert(btree, path, level, key, ptr);
1093 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 1177 nilfs_bmap_add_blocks(btree, stats.bs_nblocks);
1094 1178
1095 out: 1179 out:
1096 nilfs_btree_free_path(path); 1180 nilfs_btree_free_path(path);
1097 return ret; 1181 return ret;
1098} 1182}
1099 1183
1100static void nilfs_btree_do_delete(struct nilfs_btree *btree, 1184static void nilfs_btree_do_delete(struct nilfs_bmap *btree,
1101 struct nilfs_btree_path *path, 1185 struct nilfs_btree_path *path,
1102 int level, __u64 *keyp, __u64 *ptrp) 1186 int level, __u64 *keyp, __u64 *ptrp)
1103{ 1187{
1104 struct nilfs_btree_node *node; 1188 struct nilfs_btree_node *node;
1189 int ncblk;
1105 1190
1106 if (level < nilfs_btree_height(btree) - 1) { 1191 if (level < nilfs_btree_height(btree) - 1) {
1107 node = nilfs_btree_get_nonroot_node(path, level); 1192 node = nilfs_btree_get_nonroot_node(path, level);
1108 nilfs_btree_node_delete(btree, node, keyp, ptrp, 1193 ncblk = nilfs_btree_nchildren_per_block(btree);
1109 path[level].bp_index); 1194 nilfs_btree_node_delete(node, path[level].bp_index,
1195 keyp, ptrp, ncblk);
1110 if (!buffer_dirty(path[level].bp_bh)) 1196 if (!buffer_dirty(path[level].bp_bh))
1111 nilfs_btnode_mark_dirty(path[level].bp_bh); 1197 nilfs_btnode_mark_dirty(path[level].bp_bh);
1112 if (path[level].bp_index == 0) 1198 if (path[level].bp_index == 0)
@@ -1114,17 +1200,18 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1114 nilfs_btree_node_get_key(node, 0)); 1200 nilfs_btree_node_get_key(node, 0));
1115 } else { 1201 } else {
1116 node = nilfs_btree_get_root(btree); 1202 node = nilfs_btree_get_root(btree);
1117 nilfs_btree_node_delete(btree, node, keyp, ptrp, 1203 nilfs_btree_node_delete(node, path[level].bp_index,
1118 path[level].bp_index); 1204 keyp, ptrp,
1205 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1119 } 1206 }
1120} 1207}
1121 1208
1122static void nilfs_btree_borrow_left(struct nilfs_btree *btree, 1209static void nilfs_btree_borrow_left(struct nilfs_bmap *btree,
1123 struct nilfs_btree_path *path, 1210 struct nilfs_btree_path *path,
1124 int level, __u64 *keyp, __u64 *ptrp) 1211 int level, __u64 *keyp, __u64 *ptrp)
1125{ 1212{
1126 struct nilfs_btree_node *node, *left; 1213 struct nilfs_btree_node *node, *left;
1127 int nchildren, lnchildren, n; 1214 int nchildren, lnchildren, n, ncblk;
1128 1215
1129 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1216 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1130 1217
@@ -1132,10 +1219,11 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1132 left = nilfs_btree_get_sib_node(path, level); 1219 left = nilfs_btree_get_sib_node(path, level);
1133 nchildren = nilfs_btree_node_get_nchildren(node); 1220 nchildren = nilfs_btree_node_get_nchildren(node);
1134 lnchildren = nilfs_btree_node_get_nchildren(left); 1221 lnchildren = nilfs_btree_node_get_nchildren(left);
1222 ncblk = nilfs_btree_nchildren_per_block(btree);
1135 1223
1136 n = (nchildren + lnchildren) / 2 - nchildren; 1224 n = (nchildren + lnchildren) / 2 - nchildren;
1137 1225
1138 nilfs_btree_node_move_right(btree, left, node, n); 1226 nilfs_btree_node_move_right(left, node, n, ncblk, ncblk);
1139 1227
1140 if (!buffer_dirty(path[level].bp_bh)) 1228 if (!buffer_dirty(path[level].bp_bh))
1141 nilfs_btnode_mark_dirty(path[level].bp_bh); 1229 nilfs_btnode_mark_dirty(path[level].bp_bh);
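Note on the borrow paths above: n = (nchildren + lnchildren) / 2 - nchildren takes just enough entries from the sibling that the two nodes end up within one entry of each other. A tiny worked example:

    #include <stdio.h>

    int main(void)
    {
        int nchildren = 3, lnchildren = 9;
        int n = (nchildren + lnchildren) / 2 - nchildren;  /* entries taken */
        printf("borrow %d -> %d/%d\n", n, nchildren + n, lnchildren - n);
        /* prints: borrow 3 -> 6/6 */
        return 0;
    }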
@@ -1150,12 +1238,12 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1150 path[level].bp_index += n; 1238 path[level].bp_index += n;
1151} 1239}
1152 1240
1153static void nilfs_btree_borrow_right(struct nilfs_btree *btree, 1241static void nilfs_btree_borrow_right(struct nilfs_bmap *btree,
1154 struct nilfs_btree_path *path, 1242 struct nilfs_btree_path *path,
1155 int level, __u64 *keyp, __u64 *ptrp) 1243 int level, __u64 *keyp, __u64 *ptrp)
1156{ 1244{
1157 struct nilfs_btree_node *node, *right; 1245 struct nilfs_btree_node *node, *right;
1158 int nchildren, rnchildren, n; 1246 int nchildren, rnchildren, n, ncblk;
1159 1247
1160 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1248 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1161 1249
@@ -1163,10 +1251,11 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1163 right = nilfs_btree_get_sib_node(path, level); 1251 right = nilfs_btree_get_sib_node(path, level);
1164 nchildren = nilfs_btree_node_get_nchildren(node); 1252 nchildren = nilfs_btree_node_get_nchildren(node);
1165 rnchildren = nilfs_btree_node_get_nchildren(right); 1253 rnchildren = nilfs_btree_node_get_nchildren(right);
1254 ncblk = nilfs_btree_nchildren_per_block(btree);
1166 1255
1167 n = (nchildren + rnchildren) / 2 - nchildren; 1256 n = (nchildren + rnchildren) / 2 - nchildren;
1168 1257
1169 nilfs_btree_node_move_left(btree, node, right, n); 1258 nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
1170 1259
1171 if (!buffer_dirty(path[level].bp_bh)) 1260 if (!buffer_dirty(path[level].bp_bh))
1172 nilfs_btnode_mark_dirty(path[level].bp_bh); 1261 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1182,21 +1271,22 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1182 path[level].bp_sib_bh = NULL; 1271 path[level].bp_sib_bh = NULL;
1183} 1272}
1184 1273
1185static void nilfs_btree_concat_left(struct nilfs_btree *btree, 1274static void nilfs_btree_concat_left(struct nilfs_bmap *btree,
1186 struct nilfs_btree_path *path, 1275 struct nilfs_btree_path *path,
1187 int level, __u64 *keyp, __u64 *ptrp) 1276 int level, __u64 *keyp, __u64 *ptrp)
1188{ 1277{
1189 struct nilfs_btree_node *node, *left; 1278 struct nilfs_btree_node *node, *left;
1190 int n; 1279 int n, ncblk;
1191 1280
1192 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1281 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1193 1282
1194 node = nilfs_btree_get_nonroot_node(path, level); 1283 node = nilfs_btree_get_nonroot_node(path, level);
1195 left = nilfs_btree_get_sib_node(path, level); 1284 left = nilfs_btree_get_sib_node(path, level);
1285 ncblk = nilfs_btree_nchildren_per_block(btree);
1196 1286
1197 n = nilfs_btree_node_get_nchildren(node); 1287 n = nilfs_btree_node_get_nchildren(node);
1198 1288
1199 nilfs_btree_node_move_left(btree, left, node, n); 1289 nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
1200 1290
1201 if (!buffer_dirty(path[level].bp_sib_bh)) 1291 if (!buffer_dirty(path[level].bp_sib_bh))
1202 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 1292 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
@@ -1207,21 +1297,22 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1207 path[level].bp_index += nilfs_btree_node_get_nchildren(left); 1297 path[level].bp_index += nilfs_btree_node_get_nchildren(left);
1208} 1298}
1209 1299
1210static void nilfs_btree_concat_right(struct nilfs_btree *btree, 1300static void nilfs_btree_concat_right(struct nilfs_bmap *btree,
1211 struct nilfs_btree_path *path, 1301 struct nilfs_btree_path *path,
1212 int level, __u64 *keyp, __u64 *ptrp) 1302 int level, __u64 *keyp, __u64 *ptrp)
1213{ 1303{
1214 struct nilfs_btree_node *node, *right; 1304 struct nilfs_btree_node *node, *right;
1215 int n; 1305 int n, ncblk;
1216 1306
1217 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1307 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1218 1308
1219 node = nilfs_btree_get_nonroot_node(path, level); 1309 node = nilfs_btree_get_nonroot_node(path, level);
1220 right = nilfs_btree_get_sib_node(path, level); 1310 right = nilfs_btree_get_sib_node(path, level);
1311 ncblk = nilfs_btree_nchildren_per_block(btree);
1221 1312
1222 n = nilfs_btree_node_get_nchildren(right); 1313 n = nilfs_btree_node_get_nchildren(right);
1223 1314
1224 nilfs_btree_node_move_left(btree, node, right, n); 1315 nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
1225 1316
1226 if (!buffer_dirty(path[level].bp_bh)) 1317 if (!buffer_dirty(path[level].bp_bh))
1227 nilfs_btnode_mark_dirty(path[level].bp_bh); 1318 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -1231,29 +1322,32 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1231 path[level + 1].bp_index++; 1322 path[level + 1].bp_index++;
1232} 1323}
1233 1324
1234static void nilfs_btree_shrink(struct nilfs_btree *btree, 1325static void nilfs_btree_shrink(struct nilfs_bmap *btree,
1235 struct nilfs_btree_path *path, 1326 struct nilfs_btree_path *path,
1236 int level, __u64 *keyp, __u64 *ptrp) 1327 int level, __u64 *keyp, __u64 *ptrp)
1237{ 1328{
1238 struct nilfs_btree_node *root, *child; 1329 struct nilfs_btree_node *root, *child;
1239 int n; 1330 int n, ncblk;
1240 1331
1241 nilfs_btree_do_delete(btree, path, level, keyp, ptrp); 1332 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1242 1333
1243 root = nilfs_btree_get_root(btree); 1334 root = nilfs_btree_get_root(btree);
1244 child = nilfs_btree_get_nonroot_node(path, level); 1335 child = nilfs_btree_get_nonroot_node(path, level);
1336 ncblk = nilfs_btree_nchildren_per_block(btree);
1245 1337
1246 nilfs_btree_node_delete(btree, root, NULL, NULL, 0); 1338 nilfs_btree_node_delete(root, 0, NULL, NULL,
1339 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1247 nilfs_btree_node_set_level(root, level); 1340 nilfs_btree_node_set_level(root, level);
1248 n = nilfs_btree_node_get_nchildren(child); 1341 n = nilfs_btree_node_get_nchildren(child);
1249 nilfs_btree_node_move_left(btree, root, child, n); 1342 nilfs_btree_node_move_left(root, child, n,
1343 NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk);
1250 1344
1251 nilfs_btnode_delete(path[level].bp_bh); 1345 nilfs_btnode_delete(path[level].bp_bh);
1252 path[level].bp_bh = NULL; 1346 path[level].bp_bh = NULL;
1253} 1347}
1254 1348
1255 1349
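nilfs_btree_shrink above is the height-reducing case: the root's single remaining entry is deleted, the root takes over the child's level, and all of the child's children are moved up into it; note the asymmetric capacities in the move (root max versus ncblk). A compilable toy model of the height decrease (all names local to the example):

/* Toy model of collapsing a one-child root into its child's level. */
#include <stdio.h>

struct toy_node {
        int level;
        int nchildren;
};

static void toy_shrink(struct toy_node *root, struct toy_node *child)
{
        root->level = child->level;         /* tree height drops by one */
        root->nchildren = child->nchildren; /* root adopts the children */
        child->nchildren = 0;               /* child block is then freed */
}

int main(void)
{
        struct toy_node root = { 2, 1 }, child = { 1, 5 };

        toy_shrink(&root, &child);
        printf("root: level=%d nchildren=%d\n", root.level, root.nchildren);
        return 0;
}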
1256static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, 1350static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree,
1257 struct nilfs_btree_path *path, 1351 struct nilfs_btree_path *path,
1258 int *levelp, 1352 int *levelp,
1259 struct nilfs_bmap_stats *stats, 1353 struct nilfs_bmap_stats *stats,
@@ -1262,42 +1356,43 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1262 struct buffer_head *bh; 1356 struct buffer_head *bh;
1263 struct nilfs_btree_node *node, *parent, *sib; 1357 struct nilfs_btree_node *node, *parent, *sib;
1264 __u64 sibptr; 1358 __u64 sibptr;
1265 int pindex, level, ret; 1359 int pindex, level, ncmin, ncmax, ncblk, ret;
1266 1360
1267 ret = 0; 1361 ret = 0;
1268 stats->bs_nblocks = 0; 1362 stats->bs_nblocks = 0;
1363 ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
1364 ncblk = nilfs_btree_nchildren_per_block(btree);
1365
1269 for (level = NILFS_BTREE_LEVEL_NODE_MIN; 1366 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1270 level < nilfs_btree_height(btree) - 1; 1367 level < nilfs_btree_height(btree) - 1;
1271 level++) { 1368 level++) {
1272 node = nilfs_btree_get_nonroot_node(path, level); 1369 node = nilfs_btree_get_nonroot_node(path, level);
1273 path[level].bp_oldreq.bpr_ptr = 1370 path[level].bp_oldreq.bpr_ptr =
1274 nilfs_btree_node_get_ptr(btree, node, 1371 nilfs_btree_node_get_ptr(node, path[level].bp_index,
1275 path[level].bp_index); 1372 ncblk);
1276 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, 1373 ret = nilfs_bmap_prepare_end_ptr(btree,
1277 &path[level].bp_oldreq, dat); 1374 &path[level].bp_oldreq, dat);
1278 if (ret < 0) 1375 if (ret < 0)
1279 goto err_out_child_node; 1376 goto err_out_child_node;
1280 1377
1281 if (nilfs_btree_node_get_nchildren(node) > 1378 if (nilfs_btree_node_get_nchildren(node) > ncmin) {
1282 nilfs_btree_node_nchildren_min(node, btree)) {
1283 path[level].bp_op = nilfs_btree_do_delete; 1379 path[level].bp_op = nilfs_btree_do_delete;
1284 stats->bs_nblocks++; 1380 stats->bs_nblocks++;
1285 goto out; 1381 goto out;
1286 } 1382 }
1287 1383
1288 parent = nilfs_btree_get_node(btree, path, level + 1); 1384 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1289 pindex = path[level + 1].bp_index; 1385 pindex = path[level + 1].bp_index;
1290 1386
1291 if (pindex > 0) { 1387 if (pindex > 0) {
1292 /* left sibling */ 1388 /* left sibling */
1293 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1389 sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1,
1294 pindex - 1); 1390 ncmax);
1295 ret = nilfs_btree_get_block(btree, sibptr, &bh); 1391 ret = nilfs_btree_get_block(btree, sibptr, &bh);
1296 if (ret < 0) 1392 if (ret < 0)
1297 goto err_out_curr_node; 1393 goto err_out_curr_node;
1298 sib = (struct nilfs_btree_node *)bh->b_data; 1394 sib = (struct nilfs_btree_node *)bh->b_data;
1299 if (nilfs_btree_node_get_nchildren(sib) > 1395 if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
1300 nilfs_btree_node_nchildren_min(sib, btree)) {
1301 path[level].bp_sib_bh = bh; 1396 path[level].bp_sib_bh = bh;
1302 path[level].bp_op = nilfs_btree_borrow_left; 1397 path[level].bp_op = nilfs_btree_borrow_left;
1303 stats->bs_nblocks++; 1398 stats->bs_nblocks++;
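The planner walks up from the leaf level and records one repair action per level: a plain delete if the node keeps more than ncmin children, otherwise a borrow from a sibling that has spare children, otherwise a concatenation with it (the left sibling is preferred when one exists). A simplified standalone decision model of that policy; the root-child and shrink cases of the real function are omitted:

/* Simplified decision model for one level of delete preparation. */
enum toy_del_op { TOY_DELETE, TOY_BORROW, TOY_CONCAT };

static enum toy_del_op toy_plan(int nchildren, int ncmin, int sib_nchildren)
{
        if (nchildren > ncmin)
                return TOY_DELETE;  /* node stays legal after the delete */
        if (sib_nchildren > ncmin)
                return TOY_BORROW;  /* sibling can spare some children */
        return TOY_CONCAT;          /* merge with the sibling instead */
}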
@@ -1311,14 +1406,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1311 } else if (pindex < 1406 } else if (pindex <
1312 nilfs_btree_node_get_nchildren(parent) - 1) { 1407 nilfs_btree_node_get_nchildren(parent) - 1) {
1313 /* right sibling */ 1408 /* right sibling */
1314 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1409 sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1,
1315 pindex + 1); 1410 ncmax);
1316 ret = nilfs_btree_get_block(btree, sibptr, &bh); 1411 ret = nilfs_btree_get_block(btree, sibptr, &bh);
1317 if (ret < 0) 1412 if (ret < 0)
1318 goto err_out_curr_node; 1413 goto err_out_curr_node;
1319 sib = (struct nilfs_btree_node *)bh->b_data; 1414 sib = (struct nilfs_btree_node *)bh->b_data;
1320 if (nilfs_btree_node_get_nchildren(sib) > 1415 if (nilfs_btree_node_get_nchildren(sib) > ncmin) {
1321 nilfs_btree_node_nchildren_min(sib, btree)) {
1322 path[level].bp_sib_bh = bh; 1416 path[level].bp_sib_bh = bh;
1323 path[level].bp_op = nilfs_btree_borrow_right; 1417 path[level].bp_op = nilfs_btree_borrow_right;
1324 stats->bs_nblocks++; 1418 stats->bs_nblocks++;
@@ -1349,10 +1443,10 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1349 1443
1350 node = nilfs_btree_get_root(btree); 1444 node = nilfs_btree_get_root(btree);
1351 path[level].bp_oldreq.bpr_ptr = 1445 path[level].bp_oldreq.bpr_ptr =
1352 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); 1446 nilfs_btree_node_get_ptr(node, path[level].bp_index,
1447 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1353 1448
1354 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, 1449 ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat);
1355 &path[level].bp_oldreq, dat);
1356 if (ret < 0) 1450 if (ret < 0)
1357 goto err_out_child_node; 1451 goto err_out_child_node;
1358 1452
@@ -1367,75 +1461,68 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1367 1461
1368 /* error */ 1462 /* error */
1369 err_out_curr_node: 1463 err_out_curr_node:
1370 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat); 1464 nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
1371 err_out_child_node: 1465 err_out_child_node:
1372 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { 1466 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
1373 brelse(path[level].bp_sib_bh); 1467 brelse(path[level].bp_sib_bh);
1374 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, 1468 nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat);
1375 &path[level].bp_oldreq, dat);
1376 } 1469 }
1377 *levelp = level; 1470 *levelp = level;
1378 stats->bs_nblocks = 0; 1471 stats->bs_nblocks = 0;
1379 return ret; 1472 return ret;
1380} 1473}
1381 1474
1382static void nilfs_btree_commit_delete(struct nilfs_btree *btree, 1475static void nilfs_btree_commit_delete(struct nilfs_bmap *btree,
1383 struct nilfs_btree_path *path, 1476 struct nilfs_btree_path *path,
1384 int maxlevel, struct inode *dat) 1477 int maxlevel, struct inode *dat)
1385{ 1478{
1386 int level; 1479 int level;
1387 1480
1388 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { 1481 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1389 nilfs_bmap_commit_end_ptr(&btree->bt_bmap, 1482 nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat);
1390 &path[level].bp_oldreq, dat);
1391 path[level].bp_op(btree, path, level, NULL, NULL); 1483 path[level].bp_op(btree, path, level, NULL, NULL);
1392 } 1484 }
1393 1485
1394 if (!nilfs_bmap_dirty(&btree->bt_bmap)) 1486 if (!nilfs_bmap_dirty(btree))
1395 nilfs_bmap_set_dirty(&btree->bt_bmap); 1487 nilfs_bmap_set_dirty(btree);
1396} 1488}
1397 1489
1398static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) 1490static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key)
1399 1491
1400{ 1492{
1401 struct nilfs_btree *btree;
1402 struct nilfs_btree_path *path; 1493 struct nilfs_btree_path *path;
1403 struct nilfs_bmap_stats stats; 1494 struct nilfs_bmap_stats stats;
1404 struct inode *dat; 1495 struct inode *dat;
1405 int level, ret; 1496 int level, ret;
1406 1497
1407 btree = (struct nilfs_btree *)bmap;
1408 path = nilfs_btree_alloc_path(); 1498 path = nilfs_btree_alloc_path();
1409 if (path == NULL) 1499 if (path == NULL)
1410 return -ENOMEM; 1500 return -ENOMEM;
1411 1501
1412 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1502 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1413 NILFS_BTREE_LEVEL_NODE_MIN); 1503 NILFS_BTREE_LEVEL_NODE_MIN, 0);
1414 if (ret < 0) 1504 if (ret < 0)
1415 goto out; 1505 goto out;
1416 1506
1417 1507
1418 dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ? 1508 dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
1419 nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;
1420 1509
1421 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); 1510 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
1422 if (ret < 0) 1511 if (ret < 0)
1423 goto out; 1512 goto out;
1424 nilfs_btree_commit_delete(btree, path, level, dat); 1513 nilfs_btree_commit_delete(btree, path, level, dat);
1425 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); 1514 nilfs_bmap_sub_blocks(btree, stats.bs_nblocks);
1426 1515
1427out: 1516out:
1428 nilfs_btree_free_path(path); 1517 nilfs_btree_free_path(path);
1429 return ret; 1518 return ret;
1430} 1519}
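nilfs_btree_delete itself is the usual two-phase wrapper: look up the path, let prepare_delete reserve every pointer disposal and queue a per-level bp_op, then commit runs the queued operations bottom-up (the commit side returns void and cannot fail). A standalone model of that dispatch-table pattern:

/* Model of the bp_op dispatch: prepare records one operation per
 * level, commit applies them from the leaf level upward. */
#include <stdio.h>

typedef void (*toy_op_fn)(int level);

static void toy_concat(int level) { printf("concat at level %d\n", level); }
static void toy_delete(int level) { printf("delete at level %d\n", level); }

int main(void)
{
        toy_op_fn plan[2] = { toy_concat, toy_delete };
        int maxlevel = 1;

        for (int level = 0; level <= maxlevel; level++)
                plan[level](level);  /* the commit phase */
        return 0;
}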
1431 1520
1432static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) 1521static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp)
1433{ 1522{
1434 struct nilfs_btree *btree;
1435 struct nilfs_btree_path *path; 1523 struct nilfs_btree_path *path;
1436 int ret; 1524 int ret;
1437 1525
1438 btree = (struct nilfs_btree *)bmap;
1439 path = nilfs_btree_alloc_path(); 1526 path = nilfs_btree_alloc_path();
1440 if (path == NULL) 1527 if (path == NULL)
1441 return -ENOMEM; 1528 return -ENOMEM;
@@ -1447,16 +1534,14 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1447 return ret; 1534 return ret;
1448} 1535}
1449 1536
1450static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) 1537static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key)
1451{ 1538{
1452 struct buffer_head *bh; 1539 struct buffer_head *bh;
1453 struct nilfs_btree *btree;
1454 struct nilfs_btree_node *root, *node; 1540 struct nilfs_btree_node *root, *node;
1455 __u64 maxkey, nextmaxkey; 1541 __u64 maxkey, nextmaxkey;
1456 __u64 ptr; 1542 __u64 ptr;
1457 int nchildren, ret; 1543 int nchildren, ret;
1458 1544
1459 btree = (struct nilfs_btree *)bmap;
1460 root = nilfs_btree_get_root(btree); 1545 root = nilfs_btree_get_root(btree);
1461 switch (nilfs_btree_height(btree)) { 1546 switch (nilfs_btree_height(btree)) {
1462 case 2: 1547 case 2:
@@ -1467,7 +1552,8 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1467 nchildren = nilfs_btree_node_get_nchildren(root); 1552 nchildren = nilfs_btree_node_get_nchildren(root);
1468 if (nchildren > 1) 1553 if (nchildren > 1)
1469 return 0; 1554 return 0;
1470 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); 1555 ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
1556 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1471 ret = nilfs_btree_get_block(btree, ptr, &bh); 1557 ret = nilfs_btree_get_block(btree, ptr, &bh);
1472 if (ret < 0) 1558 if (ret < 0)
1473 return ret; 1559 return ret;
@@ -1487,32 +1573,33 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1487 return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW); 1573 return (maxkey == key) && (nextmaxkey < NILFS_BMAP_LARGE_LOW);
1488} 1574}
1489 1575
1490static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, 1576static int nilfs_btree_gather_data(struct nilfs_bmap *btree,
1491 __u64 *keys, __u64 *ptrs, int nitems) 1577 __u64 *keys, __u64 *ptrs, int nitems)
1492{ 1578{
1493 struct buffer_head *bh; 1579 struct buffer_head *bh;
1494 struct nilfs_btree *btree;
1495 struct nilfs_btree_node *node, *root; 1580 struct nilfs_btree_node *node, *root;
1496 __le64 *dkeys; 1581 __le64 *dkeys;
1497 __le64 *dptrs; 1582 __le64 *dptrs;
1498 __u64 ptr; 1583 __u64 ptr;
1499 int nchildren, i, ret; 1584 int nchildren, ncmax, i, ret;
1500 1585
1501 btree = (struct nilfs_btree *)bmap;
1502 root = nilfs_btree_get_root(btree); 1586 root = nilfs_btree_get_root(btree);
1503 switch (nilfs_btree_height(btree)) { 1587 switch (nilfs_btree_height(btree)) {
1504 case 2: 1588 case 2:
1505 bh = NULL; 1589 bh = NULL;
1506 node = root; 1590 node = root;
1591 ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX;
1507 break; 1592 break;
1508 case 3: 1593 case 3:
1509 nchildren = nilfs_btree_node_get_nchildren(root); 1594 nchildren = nilfs_btree_node_get_nchildren(root);
1510 WARN_ON(nchildren > 1); 1595 WARN_ON(nchildren > 1);
1511 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); 1596 ptr = nilfs_btree_node_get_ptr(root, nchildren - 1,
1597 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1512 ret = nilfs_btree_get_block(btree, ptr, &bh); 1598 ret = nilfs_btree_get_block(btree, ptr, &bh);
1513 if (ret < 0) 1599 if (ret < 0)
1514 return ret; 1600 return ret;
1515 node = (struct nilfs_btree_node *)bh->b_data; 1601 node = (struct nilfs_btree_node *)bh->b_data;
1602 ncmax = nilfs_btree_nchildren_per_block(btree);
1516 break; 1603 break;
1517 default: 1604 default:
1518 node = NULL; 1605 node = NULL;
@@ -1523,10 +1610,10 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1523 if (nchildren < nitems) 1610 if (nchildren < nitems)
1524 nitems = nchildren; 1611 nitems = nchildren;
1525 dkeys = nilfs_btree_node_dkeys(node); 1612 dkeys = nilfs_btree_node_dkeys(node);
1526 dptrs = nilfs_btree_node_dptrs(node, btree); 1613 dptrs = nilfs_btree_node_dptrs(node, ncmax);
1527 for (i = 0; i < nitems; i++) { 1614 for (i = 0; i < nitems; i++) {
1528 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); 1615 keys[i] = le64_to_cpu(dkeys[i]);
1529 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); 1616 ptrs[i] = le64_to_cpu(dptrs[i]);
1530 } 1617 }
1531 1618
1532 if (bh != NULL) 1619 if (bh != NULL)
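With the dkey/dptr wrapper functions gone, gather_data decodes the on-disk little-endian arrays with le64_to_cpu directly. A userspace model of the copy loop (le64toh is the glibc counterpart of le64_to_cpu):

/* Userspace model of the gather loop above. */
#include <stdint.h>
#include <endian.h>

static void toy_gather(const uint64_t *dkeys, const uint64_t *dptrs,
                       uint64_t *keys, uint64_t *ptrs, int nitems)
{
        for (int i = 0; i < nitems; i++) {
                keys[i] = le64toh(dkeys[i]); /* on-disk __le64 -> host */
                ptrs[i] = le64toh(dptrs[i]);
        }
}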
@@ -1536,14 +1623,13 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1536} 1623}
1537 1624
1538static int 1625static int
1539nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, 1626nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key,
1540 union nilfs_bmap_ptr_req *dreq, 1627 union nilfs_bmap_ptr_req *dreq,
1541 union nilfs_bmap_ptr_req *nreq, 1628 union nilfs_bmap_ptr_req *nreq,
1542 struct buffer_head **bhp, 1629 struct buffer_head **bhp,
1543 struct nilfs_bmap_stats *stats) 1630 struct nilfs_bmap_stats *stats)
1544{ 1631{
1545 struct buffer_head *bh; 1632 struct buffer_head *bh;
1546 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1547 struct inode *dat = NULL; 1633 struct inode *dat = NULL;
1548 int ret; 1634 int ret;
1549 1635
@@ -1551,12 +1637,12 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1551 1637
1552 /* for data */ 1638 /* for data */
1553 /* cannot find near ptr */ 1639 /* cannot find near ptr */
1554 if (NILFS_BMAP_USE_VBN(bmap)) { 1640 if (NILFS_BMAP_USE_VBN(btree)) {
1555 dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); 1641 dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
1556 dat = nilfs_bmap_get_dat(bmap); 1642 dat = nilfs_bmap_get_dat(btree);
1557 } 1643 }
1558 1644
1559 ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat); 1645 ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat);
1560 if (ret < 0) 1646 if (ret < 0)
1561 return ret; 1647 return ret;
1562 1648
@@ -1564,7 +1650,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1564 stats->bs_nblocks++; 1650 stats->bs_nblocks++;
1565 if (nreq != NULL) { 1651 if (nreq != NULL) {
1566 nreq->bpr_ptr = dreq->bpr_ptr + 1; 1652 nreq->bpr_ptr = dreq->bpr_ptr + 1;
1567 ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat); 1653 ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat);
1568 if (ret < 0) 1654 if (ret < 0)
1569 goto err_out_dreq; 1655 goto err_out_dreq;
1570 1656
@@ -1581,16 +1667,16 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1581 1667
1582 /* error */ 1668 /* error */
1583 err_out_nreq: 1669 err_out_nreq:
1584 nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat); 1670 nilfs_bmap_abort_alloc_ptr(btree, nreq, dat);
1585 err_out_dreq: 1671 err_out_dreq:
1586 nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat); 1672 nilfs_bmap_abort_alloc_ptr(btree, dreq, dat);
1587 stats->bs_nblocks = 0; 1673 stats->bs_nblocks = 0;
1588 return ret; 1674 return ret;
1589 1675
1590} 1676}
1591 1677
1592static void 1678static void
1593nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, 1679nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
1594 __u64 key, __u64 ptr, 1680 __u64 key, __u64 ptr,
1595 const __u64 *keys, const __u64 *ptrs, 1681 const __u64 *keys, const __u64 *ptrs,
1596 int n, 1682 int n,
@@ -1598,57 +1684,59 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1598 union nilfs_bmap_ptr_req *nreq, 1684 union nilfs_bmap_ptr_req *nreq,
1599 struct buffer_head *bh) 1685 struct buffer_head *bh)
1600{ 1686{
1601 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1602 struct nilfs_btree_node *node; 1687 struct nilfs_btree_node *node;
1603 struct inode *dat; 1688 struct inode *dat;
1604 __u64 tmpptr; 1689 __u64 tmpptr;
1690 int ncblk;
1605 1691
1606 /* free resources */ 1692 /* free resources */
1607 if (bmap->b_ops->bop_clear != NULL) 1693 if (btree->b_ops->bop_clear != NULL)
1608 bmap->b_ops->bop_clear(bmap); 1694 btree->b_ops->bop_clear(btree);
1609 1695
1610 /* ptr must be a pointer to a buffer head. */ 1696 /* ptr must be a pointer to a buffer head. */
1611 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); 1697 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1612 1698
1613 /* convert and insert */ 1699 /* convert and insert */
1614 dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; 1700 dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
1615 nilfs_btree_init(bmap); 1701 nilfs_btree_init(btree);
1616 if (nreq != NULL) { 1702 if (nreq != NULL) {
1617 nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); 1703 nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
1618 nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); 1704 nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);
1619 1705
1620 /* create child node at level 1 */ 1706 /* create child node at level 1 */
1621 node = (struct nilfs_btree_node *)bh->b_data; 1707 node = (struct nilfs_btree_node *)bh->b_data;
1622 nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs); 1708 ncblk = nilfs_btree_nchildren_per_block(btree);
1623 nilfs_btree_node_insert(btree, node, 1709 nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs);
1624 key, dreq->bpr_ptr, n); 1710 nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk);
1625 if (!buffer_dirty(bh)) 1711 if (!buffer_dirty(bh))
1626 nilfs_btnode_mark_dirty(bh); 1712 nilfs_btnode_mark_dirty(bh);
1627 if (!nilfs_bmap_dirty(bmap)) 1713 if (!nilfs_bmap_dirty(btree))
1628 nilfs_bmap_set_dirty(bmap); 1714 nilfs_bmap_set_dirty(btree);
1629 1715
1630 brelse(bh); 1716 brelse(bh);
1631 1717
1632 /* create root node at level 2 */ 1718 /* create root node at level 2 */
1633 node = nilfs_btree_get_root(btree); 1719 node = nilfs_btree_get_root(btree);
1634 tmpptr = nreq->bpr_ptr; 1720 tmpptr = nreq->bpr_ptr;
1635 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 1721 nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1,
1636 2, 1, &keys[0], &tmpptr); 1722 NILFS_BTREE_ROOT_NCHILDREN_MAX,
1723 &keys[0], &tmpptr);
1637 } else { 1724 } else {
1638 nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); 1725 nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
1639 1726
1640 /* create root node at level 1 */ 1727 /* create root node at level 1 */
1641 node = nilfs_btree_get_root(btree); 1728 node = nilfs_btree_get_root(btree);
1642 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 1729 nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n,
1643 1, n, keys, ptrs); 1730 NILFS_BTREE_ROOT_NCHILDREN_MAX,
1644 nilfs_btree_node_insert(btree, node, 1731 keys, ptrs);
1645 key, dreq->bpr_ptr, n); 1732 nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr,
1646 if (!nilfs_bmap_dirty(bmap)) 1733 NILFS_BTREE_ROOT_NCHILDREN_MAX);
1647 nilfs_bmap_set_dirty(bmap); 1734 if (!nilfs_bmap_dirty(btree))
1735 nilfs_bmap_set_dirty(btree);
1648 } 1736 }
1649 1737
1650 if (NILFS_BMAP_USE_VBN(bmap)) 1738 if (NILFS_BMAP_USE_VBN(btree))
1651 nilfs_btree_set_target_v(btree, key, dreq->bpr_ptr); 1739 nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr);
1652} 1740}
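The commit path converts a direct mapping to a B-tree in place, in one of two shapes: if the existing entries plus the new one fit in the in-inode root, it builds a level-1 root; otherwise it writes them into one fresh child block and points a single-entry level-2 root at it. A small model of that shape selection, mirroring the di/ni choice made in nilfs_btree_convert_and_insert below:

/* Which conversion shape is needed for n existing entries plus one
 * new one; root_max is NILFS_BTREE_ROOT_NCHILDREN_MAX, block_max the
 * per-block capacity. */
static int toy_need_child_block(int n, int root_max, int block_max)
{
        if (n + 1 <= root_max)
                return 0;   /* level-1 root only */
        if (n + 1 <= block_max)
                return 1;   /* level-1 child block + level-2 root */
        return -1;          /* impossible: the caller would BUG() */
}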
1653 1741
1654/** 1742/**
@@ -1660,7 +1748,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1660 * @ptrs: 1748 * @ptrs:
1661 * @n: 1749 * @n:
1662 */ 1750 */
1663int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, 1751int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
1664 __u64 key, __u64 ptr, 1752 __u64 key, __u64 ptr,
1665 const __u64 *keys, const __u64 *ptrs, int n) 1753 const __u64 *keys, const __u64 *ptrs, int n)
1666{ 1754{
@@ -1673,7 +1761,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
1673 di = &dreq; 1761 di = &dreq;
1674 ni = NULL; 1762 ni = NULL;
1675 } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX( 1763 } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
1676 1 << bmap->b_inode->i_blkbits)) { 1764 1 << btree->b_inode->i_blkbits)) {
1677 di = &dreq; 1765 di = &dreq;
1678 ni = &nreq; 1766 ni = &nreq;
1679 } else { 1767 } else {
@@ -1682,17 +1770,17 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
1682 BUG(); 1770 BUG();
1683 } 1771 }
1684 1772
1685 ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh, 1773 ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh,
1686 &stats); 1774 &stats);
1687 if (ret < 0) 1775 if (ret < 0)
1688 return ret; 1776 return ret;
1689 nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n, 1777 nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n,
1690 di, ni, bh); 1778 di, ni, bh);
1691 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 1779 nilfs_bmap_add_blocks(btree, stats.bs_nblocks);
1692 return 0; 1780 return 0;
1693} 1781}
1694 1782
1695static int nilfs_btree_propagate_p(struct nilfs_btree *btree, 1783static int nilfs_btree_propagate_p(struct nilfs_bmap *btree,
1696 struct nilfs_btree_path *path, 1784 struct nilfs_btree_path *path,
1697 int level, 1785 int level,
1698 struct buffer_head *bh) 1786 struct buffer_head *bh)
@@ -1704,17 +1792,17 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
1704 return 0; 1792 return 0;
1705} 1793}
1706 1794
1707static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, 1795static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree,
1708 struct nilfs_btree_path *path, 1796 struct nilfs_btree_path *path,
1709 int level, struct inode *dat) 1797 int level, struct inode *dat)
1710{ 1798{
1711 struct nilfs_btree_node *parent; 1799 struct nilfs_btree_node *parent;
1712 int ret; 1800 int ncmax, ret;
1713 1801
1714 parent = nilfs_btree_get_node(btree, path, level + 1); 1802 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1715 path[level].bp_oldreq.bpr_ptr = 1803 path[level].bp_oldreq.bpr_ptr =
1716 nilfs_btree_node_get_ptr(btree, parent, 1804 nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
1717 path[level + 1].bp_index); 1805 ncmax);
1718 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; 1806 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
1719 ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, 1807 ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
1720 &path[level].bp_newreq.bpr_req); 1808 &path[level].bp_newreq.bpr_req);
@@ -1726,7 +1814,7 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1726 path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; 1814 path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
1727 path[level].bp_ctxt.bh = path[level].bp_bh; 1815 path[level].bp_ctxt.bh = path[level].bp_bh;
1728 ret = nilfs_btnode_prepare_change_key( 1816 ret = nilfs_btnode_prepare_change_key(
1729 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1817 &NILFS_BMAP_I(btree)->i_btnode_cache,
1730 &path[level].bp_ctxt); 1818 &path[level].bp_ctxt);
1731 if (ret < 0) { 1819 if (ret < 0) {
1732 nilfs_dat_abort_update(dat, 1820 nilfs_dat_abort_update(dat,
@@ -1739,30 +1827,31 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1739 return 0; 1827 return 0;
1740} 1828}
1741 1829
1742static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, 1830static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree,
1743 struct nilfs_btree_path *path, 1831 struct nilfs_btree_path *path,
1744 int level, struct inode *dat) 1832 int level, struct inode *dat)
1745{ 1833{
1746 struct nilfs_btree_node *parent; 1834 struct nilfs_btree_node *parent;
1835 int ncmax;
1747 1836
1748 nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, 1837 nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
1749 &path[level].bp_newreq.bpr_req, 1838 &path[level].bp_newreq.bpr_req,
1750 btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS); 1839 btree->b_ptr_type == NILFS_BMAP_PTR_VS);
1751 1840
1752 if (buffer_nilfs_node(path[level].bp_bh)) { 1841 if (buffer_nilfs_node(path[level].bp_bh)) {
1753 nilfs_btnode_commit_change_key( 1842 nilfs_btnode_commit_change_key(
1754 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1843 &NILFS_BMAP_I(btree)->i_btnode_cache,
1755 &path[level].bp_ctxt); 1844 &path[level].bp_ctxt);
1756 path[level].bp_bh = path[level].bp_ctxt.bh; 1845 path[level].bp_bh = path[level].bp_ctxt.bh;
1757 } 1846 }
1758 set_buffer_nilfs_volatile(path[level].bp_bh); 1847 set_buffer_nilfs_volatile(path[level].bp_bh);
1759 1848
1760 parent = nilfs_btree_get_node(btree, path, level + 1); 1849 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1761 nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index, 1850 nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index,
1762 path[level].bp_newreq.bpr_ptr); 1851 path[level].bp_newreq.bpr_ptr, ncmax);
1763} 1852}
1764 1853
1765static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, 1854static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree,
1766 struct nilfs_btree_path *path, 1855 struct nilfs_btree_path *path,
1767 int level, struct inode *dat) 1856 int level, struct inode *dat)
1768{ 1857{
@@ -1770,11 +1859,11 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1770 &path[level].bp_newreq.bpr_req); 1859 &path[level].bp_newreq.bpr_req);
1771 if (buffer_nilfs_node(path[level].bp_bh)) 1860 if (buffer_nilfs_node(path[level].bp_bh))
1772 nilfs_btnode_abort_change_key( 1861 nilfs_btnode_abort_change_key(
1773 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1862 &NILFS_BMAP_I(btree)->i_btnode_cache,
1774 &path[level].bp_ctxt); 1863 &path[level].bp_ctxt);
1775} 1864}
1776 1865
1777static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, 1866static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree,
1778 struct nilfs_btree_path *path, 1867 struct nilfs_btree_path *path,
1779 int minlevel, int *maxlevelp, 1868 int minlevel, int *maxlevelp,
1780 struct inode *dat) 1869 struct inode *dat)
@@ -1809,7 +1898,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1809 return ret; 1898 return ret;
1810} 1899}
1811 1900
1812static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, 1901static void nilfs_btree_commit_propagate_v(struct nilfs_bmap *btree,
1813 struct nilfs_btree_path *path, 1902 struct nilfs_btree_path *path,
1814 int minlevel, int maxlevel, 1903 int minlevel, int maxlevel,
1815 struct buffer_head *bh, 1904 struct buffer_head *bh,
@@ -1824,14 +1913,15 @@ static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
1824 nilfs_btree_commit_update_v(btree, path, level, dat); 1913 nilfs_btree_commit_update_v(btree, path, level, dat);
1825} 1914}
1826 1915
1827static int nilfs_btree_propagate_v(struct nilfs_btree *btree, 1916static int nilfs_btree_propagate_v(struct nilfs_bmap *btree,
1828 struct nilfs_btree_path *path, 1917 struct nilfs_btree_path *path,
1829 int level, struct buffer_head *bh) 1918 int level, struct buffer_head *bh)
1830{ 1919{
1831 int maxlevel = 0, ret; 1920 int maxlevel = 0, ret;
1832 struct nilfs_btree_node *parent; 1921 struct nilfs_btree_node *parent;
1833 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); 1922 struct inode *dat = nilfs_bmap_get_dat(btree);
1834 __u64 ptr; 1923 __u64 ptr;
1924 int ncmax;
1835 1925
1836 get_bh(bh); 1926 get_bh(bh);
1837 path[level].bp_bh = bh; 1927 path[level].bp_bh = bh;
@@ -1841,9 +1931,10 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1841 goto out; 1931 goto out;
1842 1932
1843 if (buffer_nilfs_volatile(path[level].bp_bh)) { 1933 if (buffer_nilfs_volatile(path[level].bp_bh)) {
1844 parent = nilfs_btree_get_node(btree, path, level + 1); 1934 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1845 ptr = nilfs_btree_node_get_ptr(btree, parent, 1935 ptr = nilfs_btree_node_get_ptr(parent,
1846 path[level + 1].bp_index); 1936 path[level + 1].bp_index,
1937 ncmax);
1847 ret = nilfs_dat_mark_dirty(dat, ptr); 1938 ret = nilfs_dat_mark_dirty(dat, ptr);
1848 if (ret < 0) 1939 if (ret < 0)
1849 goto out; 1940 goto out;
@@ -1857,10 +1948,9 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1857 return ret; 1948 return ret;
1858} 1949}
1859 1950
1860static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, 1951static int nilfs_btree_propagate(struct nilfs_bmap *btree,
1861 struct buffer_head *bh) 1952 struct buffer_head *bh)
1862{ 1953{
1863 struct nilfs_btree *btree;
1864 struct nilfs_btree_path *path; 1954 struct nilfs_btree_path *path;
1865 struct nilfs_btree_node *node; 1955 struct nilfs_btree_node *node;
1866 __u64 key; 1956 __u64 key;
@@ -1868,7 +1958,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1868 1958
1869 WARN_ON(!buffer_dirty(bh)); 1959 WARN_ON(!buffer_dirty(bh));
1870 1960
1871 btree = (struct nilfs_btree *)bmap;
1872 path = nilfs_btree_alloc_path(); 1961 path = nilfs_btree_alloc_path();
1873 if (path == NULL) 1962 if (path == NULL)
1874 return -ENOMEM; 1963 return -ENOMEM;
@@ -1878,11 +1967,11 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1878 key = nilfs_btree_node_get_key(node, 0); 1967 key = nilfs_btree_node_get_key(node, 0);
1879 level = nilfs_btree_node_get_level(node); 1968 level = nilfs_btree_node_get_level(node);
1880 } else { 1969 } else {
1881 key = nilfs_bmap_data_get_key(bmap, bh); 1970 key = nilfs_bmap_data_get_key(btree, bh);
1882 level = NILFS_BTREE_LEVEL_DATA; 1971 level = NILFS_BTREE_LEVEL_DATA;
1883 } 1972 }
1884 1973
1885 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); 1974 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
1886 if (ret < 0) { 1975 if (ret < 0) {
1887 if (unlikely(ret == -ENOENT)) 1976 if (unlikely(ret == -ENOENT))
1888 printk(KERN_CRIT "%s: key = %llu, level == %d\n", 1977 printk(KERN_CRIT "%s: key = %llu, level == %d\n",
@@ -1890,7 +1979,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1890 goto out; 1979 goto out;
1891 } 1980 }
1892 1981
1893 ret = NILFS_BMAP_USE_VBN(bmap) ? 1982 ret = NILFS_BMAP_USE_VBN(btree) ?
1894 nilfs_btree_propagate_v(btree, path, level, bh) : 1983 nilfs_btree_propagate_v(btree, path, level, bh) :
1895 nilfs_btree_propagate_p(btree, path, level, bh); 1984 nilfs_btree_propagate_p(btree, path, level, bh);
1896 1985
@@ -1900,13 +1989,13 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1900 return ret; 1989 return ret;
1901} 1990}
1902 1991
1903static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, 1992static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree,
1904 struct buffer_head *bh) 1993 struct buffer_head *bh)
1905{ 1994{
1906 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr); 1995 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr);
1907} 1996}
1908 1997
1909static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, 1998static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
1910 struct list_head *lists, 1999 struct list_head *lists,
1911 struct buffer_head *bh) 2000 struct buffer_head *bh)
1912{ 2001{
@@ -1920,6 +2009,18 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
1920 node = (struct nilfs_btree_node *)bh->b_data; 2009 node = (struct nilfs_btree_node *)bh->b_data;
1921 key = nilfs_btree_node_get_key(node, 0); 2010 key = nilfs_btree_node_get_key(node, 0);
1922 level = nilfs_btree_node_get_level(node); 2011 level = nilfs_btree_node_get_level(node);
2012 if (level < NILFS_BTREE_LEVEL_NODE_MIN ||
2013 level >= NILFS_BTREE_LEVEL_MAX) {
2014 dump_stack();
2015 printk(KERN_WARNING
2016 "%s: invalid btree level: %d (key=%llu, ino=%lu, "
2017 "blocknr=%llu)\n",
2018 __func__, level, (unsigned long long)key,
2019 NILFS_BMAP_I(btree)->vfs_inode.i_ino,
2020 (unsigned long long)bh->b_blocknr);
2021 return;
2022 }
2023
1923 list_for_each(head, &lists[level]) { 2024 list_for_each(head, &lists[level]) {
1924 cbh = list_entry(head, struct buffer_head, b_assoc_buffers); 2025 cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
1925 cnode = (struct nilfs_btree_node *)cbh->b_data; 2026 cnode = (struct nilfs_btree_node *)cbh->b_data;
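The newly added guard in nilfs_btree_add_dirty_buffer drops, with a warning and stack dump, any dirty btree-node buffer whose level field falls outside the valid node range, so a corrupted on-disk node can no longer index past the end of the lists[] array. The check itself, modeled standalone (the two bound values are assumptions for illustration):

/* Range check mirrored from the hunk above. */
#include <stdbool.h>

#define TOY_BTREE_LEVEL_NODE_MIN 1   /* assumed kernel value */
#define TOY_BTREE_LEVEL_MAX      14  /* assumed kernel value */

static bool toy_level_is_valid(int level)
{
        return level >= TOY_BTREE_LEVEL_NODE_MIN &&
               level < TOY_BTREE_LEVEL_MAX;
}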
@@ -1930,11 +2031,10 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
1930 list_add_tail(&bh->b_assoc_buffers, head); 2031 list_add_tail(&bh->b_assoc_buffers, head);
1931} 2032}
1932 2033
1933static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap, 2034static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
1934 struct list_head *listp) 2035 struct list_head *listp)
1935{ 2036{
1936 struct nilfs_btree *btree = (struct nilfs_btree *)bmap; 2037 struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache;
1937 struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache;
1938 struct list_head lists[NILFS_BTREE_LEVEL_MAX]; 2038 struct list_head lists[NILFS_BTREE_LEVEL_MAX];
1939 struct pagevec pvec; 2039 struct pagevec pvec;
1940 struct buffer_head *bh, *head; 2040 struct buffer_head *bh, *head;
@@ -1968,7 +2068,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
1968 list_splice_tail(&lists[level], listp); 2068 list_splice_tail(&lists[level], listp);
1969} 2069}
1970 2070
1971static int nilfs_btree_assign_p(struct nilfs_btree *btree, 2071static int nilfs_btree_assign_p(struct nilfs_bmap *btree,
1972 struct nilfs_btree_path *path, 2072 struct nilfs_btree_path *path,
1973 int level, 2073 int level,
1974 struct buffer_head **bh, 2074 struct buffer_head **bh,
@@ -1978,38 +2078,38 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree,
1978 struct nilfs_btree_node *parent; 2078 struct nilfs_btree_node *parent;
1979 __u64 key; 2079 __u64 key;
1980 __u64 ptr; 2080 __u64 ptr;
1981 int ret; 2081 int ncmax, ret;
1982 2082
1983 parent = nilfs_btree_get_node(btree, path, level + 1); 2083 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
1984 ptr = nilfs_btree_node_get_ptr(btree, parent, 2084 ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
1985 path[level + 1].bp_index); 2085 ncmax);
1986 if (buffer_nilfs_node(*bh)) { 2086 if (buffer_nilfs_node(*bh)) {
1987 path[level].bp_ctxt.oldkey = ptr; 2087 path[level].bp_ctxt.oldkey = ptr;
1988 path[level].bp_ctxt.newkey = blocknr; 2088 path[level].bp_ctxt.newkey = blocknr;
1989 path[level].bp_ctxt.bh = *bh; 2089 path[level].bp_ctxt.bh = *bh;
1990 ret = nilfs_btnode_prepare_change_key( 2090 ret = nilfs_btnode_prepare_change_key(
1991 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 2091 &NILFS_BMAP_I(btree)->i_btnode_cache,
1992 &path[level].bp_ctxt); 2092 &path[level].bp_ctxt);
1993 if (ret < 0) 2093 if (ret < 0)
1994 return ret; 2094 return ret;
1995 nilfs_btnode_commit_change_key( 2095 nilfs_btnode_commit_change_key(
1996 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 2096 &NILFS_BMAP_I(btree)->i_btnode_cache,
1997 &path[level].bp_ctxt); 2097 &path[level].bp_ctxt);
1998 *bh = path[level].bp_ctxt.bh; 2098 *bh = path[level].bp_ctxt.bh;
1999 } 2099 }
2000 2100
2001 nilfs_btree_node_set_ptr(btree, parent, 2101 nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr,
2002 path[level + 1].bp_index, blocknr); 2102 ncmax);
2003 2103
2004 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); 2104 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2005 /* on-disk format */ 2105 /* on-disk format */
2006 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2106 binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
2007 binfo->bi_dat.bi_level = level; 2107 binfo->bi_dat.bi_level = level;
2008 2108
2009 return 0; 2109 return 0;
2010} 2110}
2011 2111
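assign_p records, in on-disk byte order, where a node ended up at segment-write time; the dropped nilfs_bmap_key_to_dkey wrapper was a plain cpu_to_le64, which the code now calls directly. A userspace model of filling such a record (struct layout is illustrative, not the real union nilfs_binfo):

/* Model of the binfo fill: host-order key stored little-endian. */
#include <stdint.h>
#include <endian.h>

struct toy_binfo_dat {
        uint64_t bi_blkoff;  /* little-endian on disk */
        uint8_t  bi_level;
};

static void toy_fill_binfo(struct toy_binfo_dat *bi, uint64_t key, int level)
{
        bi->bi_blkoff = htole64(key); /* cpu_to_le64 equivalent */
        bi->bi_level = (uint8_t)level;
}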
2012static int nilfs_btree_assign_v(struct nilfs_btree *btree, 2112static int nilfs_btree_assign_v(struct nilfs_bmap *btree,
2013 struct nilfs_btree_path *path, 2113 struct nilfs_btree_path *path,
2014 int level, 2114 int level,
2015 struct buffer_head **bh, 2115 struct buffer_head **bh,
@@ -2017,15 +2117,15 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2017 union nilfs_binfo *binfo) 2117 union nilfs_binfo *binfo)
2018{ 2118{
2019 struct nilfs_btree_node *parent; 2119 struct nilfs_btree_node *parent;
2020 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); 2120 struct inode *dat = nilfs_bmap_get_dat(btree);
2021 __u64 key; 2121 __u64 key;
2022 __u64 ptr; 2122 __u64 ptr;
2023 union nilfs_bmap_ptr_req req; 2123 union nilfs_bmap_ptr_req req;
2024 int ret; 2124 int ncmax, ret;
2025 2125
2026 parent = nilfs_btree_get_node(btree, path, level + 1); 2126 parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax);
2027 ptr = nilfs_btree_node_get_ptr(btree, parent, 2127 ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index,
2028 path[level + 1].bp_index); 2128 ncmax);
2029 req.bpr_ptr = ptr; 2129 req.bpr_ptr = ptr;
2030 ret = nilfs_dat_prepare_start(dat, &req.bpr_req); 2130 ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
2031 if (ret < 0) 2131 if (ret < 0)
@@ -2034,24 +2134,22 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2034 2134
2035 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); 2135 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2036 /* on-disk format */ 2136 /* on-disk format */
2037 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); 2137 binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
2038 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2138 binfo->bi_v.bi_blkoff = cpu_to_le64(key);
2039 2139
2040 return 0; 2140 return 0;
2041} 2141}
2042 2142
2043static int nilfs_btree_assign(struct nilfs_bmap *bmap, 2143static int nilfs_btree_assign(struct nilfs_bmap *btree,
2044 struct buffer_head **bh, 2144 struct buffer_head **bh,
2045 sector_t blocknr, 2145 sector_t blocknr,
2046 union nilfs_binfo *binfo) 2146 union nilfs_binfo *binfo)
2047{ 2147{
2048 struct nilfs_btree *btree;
2049 struct nilfs_btree_path *path; 2148 struct nilfs_btree_path *path;
2050 struct nilfs_btree_node *node; 2149 struct nilfs_btree_node *node;
2051 __u64 key; 2150 __u64 key;
2052 int level, ret; 2151 int level, ret;
2053 2152
2054 btree = (struct nilfs_btree *)bmap;
2055 path = nilfs_btree_alloc_path(); 2153 path = nilfs_btree_alloc_path();
2056 if (path == NULL) 2154 if (path == NULL)
2057 return -ENOMEM; 2155 return -ENOMEM;
@@ -2061,17 +2159,17 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2061 key = nilfs_btree_node_get_key(node, 0); 2159 key = nilfs_btree_node_get_key(node, 0);
2062 level = nilfs_btree_node_get_level(node); 2160 level = nilfs_btree_node_get_level(node);
2063 } else { 2161 } else {
2064 key = nilfs_bmap_data_get_key(bmap, *bh); 2162 key = nilfs_bmap_data_get_key(btree, *bh);
2065 level = NILFS_BTREE_LEVEL_DATA; 2163 level = NILFS_BTREE_LEVEL_DATA;
2066 } 2164 }
2067 2165
2068 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); 2166 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
2069 if (ret < 0) { 2167 if (ret < 0) {
2070 WARN_ON(ret == -ENOENT); 2168 WARN_ON(ret == -ENOENT);
2071 goto out; 2169 goto out;
2072 } 2170 }
2073 2171
2074 ret = NILFS_BMAP_USE_VBN(bmap) ? 2172 ret = NILFS_BMAP_USE_VBN(btree) ?
2075 nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) : 2173 nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) :
2076 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); 2174 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
2077 2175
@@ -2081,7 +2179,7 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2081 return ret; 2179 return ret;
2082} 2180}
2083 2181
2084static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, 2182static int nilfs_btree_assign_gc(struct nilfs_bmap *btree,
2085 struct buffer_head **bh, 2183 struct buffer_head **bh,
2086 sector_t blocknr, 2184 sector_t blocknr,
2087 union nilfs_binfo *binfo) 2185 union nilfs_binfo *binfo)
@@ -2090,7 +2188,7 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2090 __u64 key; 2188 __u64 key;
2091 int ret; 2189 int ret;
2092 2190
2093 ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr, 2191 ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr,
2094 blocknr); 2192 blocknr);
2095 if (ret < 0) 2193 if (ret < 0)
2096 return ret; 2194 return ret;
@@ -2099,29 +2197,27 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2099 node = (struct nilfs_btree_node *)(*bh)->b_data; 2197 node = (struct nilfs_btree_node *)(*bh)->b_data;
2100 key = nilfs_btree_node_get_key(node, 0); 2198 key = nilfs_btree_node_get_key(node, 0);
2101 } else 2199 } else
2102 key = nilfs_bmap_data_get_key(bmap, *bh); 2200 key = nilfs_bmap_data_get_key(btree, *bh);
2103 2201
2104 /* on-disk format */ 2202 /* on-disk format */
2105 binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); 2203 binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
2106 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2204 binfo->bi_v.bi_blkoff = cpu_to_le64(key);
2107 2205
2108 return 0; 2206 return 0;
2109} 2207}
2110 2208
2111static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) 2209static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level)
2112{ 2210{
2113 struct buffer_head *bh; 2211 struct buffer_head *bh;
2114 struct nilfs_btree *btree;
2115 struct nilfs_btree_path *path; 2212 struct nilfs_btree_path *path;
2116 __u64 ptr; 2213 __u64 ptr;
2117 int ret; 2214 int ret;
2118 2215
2119 btree = (struct nilfs_btree *)bmap;
2120 path = nilfs_btree_alloc_path(); 2216 path = nilfs_btree_alloc_path();
2121 if (path == NULL) 2217 if (path == NULL)
2122 return -ENOMEM; 2218 return -ENOMEM;
2123 2219
2124 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); 2220 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0);
2125 if (ret < 0) { 2221 if (ret < 0) {
2126 WARN_ON(ret == -ENOENT); 2222 WARN_ON(ret == -ENOENT);
2127 goto out; 2223 goto out;
@@ -2135,8 +2231,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2135 if (!buffer_dirty(bh)) 2231 if (!buffer_dirty(bh))
2136 nilfs_btnode_mark_dirty(bh); 2232 nilfs_btnode_mark_dirty(bh);
2137 brelse(bh); 2233 brelse(bh);
2138 if (!nilfs_bmap_dirty(&btree->bt_bmap)) 2234 if (!nilfs_bmap_dirty(btree))
2139 nilfs_bmap_set_dirty(&btree->bt_bmap); 2235 nilfs_bmap_set_dirty(btree);
2140 2236
2141 out: 2237 out:
2142 nilfs_btree_free_path(path); 2238 nilfs_btree_free_path(path);
@@ -2186,10 +2282,14 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
2186int nilfs_btree_init(struct nilfs_bmap *bmap) 2282int nilfs_btree_init(struct nilfs_bmap *bmap)
2187{ 2283{
2188 bmap->b_ops = &nilfs_btree_ops; 2284 bmap->b_ops = &nilfs_btree_ops;
2285 bmap->b_nchildren_per_block =
2286 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
2189 return 0; 2287 return 0;
2190} 2288}
2191 2289
2192void nilfs_btree_init_gc(struct nilfs_bmap *bmap) 2290void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
2193{ 2291{
2194 bmap->b_ops = &nilfs_btree_ops_gc; 2292 bmap->b_ops = &nilfs_btree_ops_gc;
2293 bmap->b_nchildren_per_block =
2294 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
2195} 2295}
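Both init paths now compute the per-block child capacity once and cache it in b_nchildren_per_block, so the hot node accessors receive it as an argument instead of rederiving it from the block size on every call. The capacity is simply how many (key, ptr) pairs fit after the node header; a model of that computation (the header size is an assumption):

/* Model of NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize). */
#include <stdint.h>

#define TOY_NODE_HEADER_SIZE 8  /* assumed sizeof(struct nilfs_btree_node) */

static int toy_nchildren_per_block(int nodesize)
{
        /* each child costs one 64-bit key plus one 64-bit pointer */
        return (nodesize - TOY_NODE_HEADER_SIZE) /
               (int)(sizeof(uint64_t) + sizeof(uint64_t));
}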
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index 43c8c5b541fd..22c02e35b6ef 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -31,14 +31,6 @@
31#include "bmap.h" 31#include "bmap.h"
32 32
33/** 33/**
34 * struct nilfs_btree - B-tree structure
35 * @bt_bmap: bmap base structure
36 */
37struct nilfs_btree {
38 struct nilfs_bmap bt_bmap;
39};
40
41/**
42 * struct nilfs_btree_path - A path on which B-tree operations are executed 34 * struct nilfs_btree_path - A path on which B-tree operations are executed
43 * @bp_bh: buffer head of node block 35 * @bp_bh: buffer head of node block
44 * @bp_sib_bh: buffer head of sibling node block 36 * @bp_sib_bh: buffer head of sibling node block
@@ -54,7 +46,7 @@ struct nilfs_btree_path {
54 union nilfs_bmap_ptr_req bp_oldreq; 46 union nilfs_bmap_ptr_req bp_oldreq;
55 union nilfs_bmap_ptr_req bp_newreq; 47 union nilfs_bmap_ptr_req bp_newreq;
56 struct nilfs_btnode_chkey_ctxt bp_ctxt; 48 struct nilfs_btnode_chkey_ctxt bp_ctxt;
57 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *, 49 void (*bp_op)(struct nilfs_bmap *, struct nilfs_btree_path *,
58 int, __u64 *, __u64 *); 50 int, __u64 *, __u64 *);
59}; 51};
60 52
@@ -80,4 +72,6 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
80 const __u64 *, const __u64 *, int); 72 const __u64 *, const __u64 *, int);
81void nilfs_btree_init_gc(struct nilfs_bmap *); 73void nilfs_btree_init_gc(struct nilfs_bmap *);
82 74
75int nilfs_btree_broken_node_block(struct buffer_head *bh);
76
83#endif /* _NILFS_BTREE_H */ 77#endif /* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 85c89dfc71f0..b60277b44468 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -141,7 +141,7 @@ static void nilfs_check_page(struct page *page)
141 } 141 }
142 for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) { 142 for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
143 p = (struct nilfs_dir_entry *)(kaddr + offs); 143 p = (struct nilfs_dir_entry *)(kaddr + offs);
144 rec_len = le16_to_cpu(p->rec_len); 144 rec_len = nilfs_rec_len_from_disk(p->rec_len);
145 145
146 if (rec_len < NILFS_DIR_REC_LEN(1)) 146 if (rec_len < NILFS_DIR_REC_LEN(1))
147 goto Eshort; 147 goto Eshort;
@@ -199,13 +199,10 @@ fail:
199static struct page *nilfs_get_page(struct inode *dir, unsigned long n) 199static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
200{ 200{
201 struct address_space *mapping = dir->i_mapping; 201 struct address_space *mapping = dir->i_mapping;
202 struct page *page = read_cache_page(mapping, n, 202 struct page *page = read_mapping_page(mapping, n, NULL);
203 (filler_t *)mapping->a_ops->readpage, NULL); 203
204 if (!IS_ERR(page)) { 204 if (!IS_ERR(page)) {
205 wait_on_page_locked(page);
206 kmap(page); 205 kmap(page);
207 if (!PageUptodate(page))
208 goto fail;
209 if (!PageChecked(page)) 206 if (!PageChecked(page))
210 nilfs_check_page(page); 207 nilfs_check_page(page);
211 if (PageError(page)) 208 if (PageError(page))
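nilfs_get_page moves from read_cache_page with an explicit filler to read_mapping_page, which plugs in the mapping's own readpage and returns either an up-to-date page or an ERR_PTR, so the manual wait_on_page_locked/PageUptodate sequence can go. The resulting pattern, sketched as a kernel-context fragment (not a standalone program):

/* Sketch of the simplified read pattern used above. */
static struct page *toy_get_page(struct address_space *mapping, pgoff_t n)
{
        struct page *page = read_mapping_page(mapping, n, NULL);

        if (!IS_ERR(page))
                kmap(page);  /* map for direct dirent access */
        return page;         /* ERR_PTR on read failure */
}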
@@ -238,7 +235,8 @@ nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de)
238 */ 235 */
239static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p) 236static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
240{ 237{
241 return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); 238 return (struct nilfs_dir_entry *)((char *)p +
239 nilfs_rec_len_from_disk(p->rec_len));
242} 240}
243 241
244static unsigned char 242static unsigned char
@@ -329,7 +327,7 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
329 goto success; 327 goto success;
330 } 328 }
331 } 329 }
332 filp->f_pos += le16_to_cpu(de->rec_len); 330 filp->f_pos += nilfs_rec_len_from_disk(de->rec_len);
333 } 331 }
334 nilfs_put_page(page); 332 nilfs_put_page(page);
335 } 333 }
@@ -444,7 +442,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
444 struct page *page, struct inode *inode) 442 struct page *page, struct inode *inode)
445{ 443{
446 unsigned from = (char *) de - (char *) page_address(page); 444 unsigned from = (char *) de - (char *) page_address(page);
447 unsigned to = from + le16_to_cpu(de->rec_len); 445 unsigned to = from + nilfs_rec_len_from_disk(de->rec_len);
448 struct address_space *mapping = page->mapping; 446 struct address_space *mapping = page->mapping;
449 int err; 447 int err;
450 448
@@ -500,7 +498,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
500 /* We hit i_size */ 498 /* We hit i_size */
501 name_len = 0; 499 name_len = 0;
502 rec_len = chunk_size; 500 rec_len = chunk_size;
503 de->rec_len = cpu_to_le16(chunk_size); 501 de->rec_len = nilfs_rec_len_to_disk(chunk_size);
504 de->inode = 0; 502 de->inode = 0;
505 goto got_it; 503 goto got_it;
506 } 504 }
@@ -514,7 +512,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
514 if (nilfs_match(namelen, name, de)) 512 if (nilfs_match(namelen, name, de))
515 goto out_unlock; 513 goto out_unlock;
516 name_len = NILFS_DIR_REC_LEN(de->name_len); 514 name_len = NILFS_DIR_REC_LEN(de->name_len);
517 rec_len = le16_to_cpu(de->rec_len); 515 rec_len = nilfs_rec_len_from_disk(de->rec_len);
518 if (!de->inode && rec_len >= reclen) 516 if (!de->inode && rec_len >= reclen)
519 goto got_it; 517 goto got_it;
520 if (rec_len >= name_len + reclen) 518 if (rec_len >= name_len + reclen)
@@ -537,8 +535,8 @@ got_it:
537 struct nilfs_dir_entry *de1; 535 struct nilfs_dir_entry *de1;
538 536
539 de1 = (struct nilfs_dir_entry *)((char *)de + name_len); 537 de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
540 de1->rec_len = cpu_to_le16(rec_len - name_len); 538 de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len);
541 de->rec_len = cpu_to_le16(name_len); 539 de->rec_len = nilfs_rec_len_to_disk(name_len);
542 de = de1; 540 de = de1;
543 } 541 }
544 de->name_len = namelen; 542 de->name_len = namelen;
@@ -569,7 +567,8 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
569 struct inode *inode = mapping->host; 567 struct inode *inode = mapping->host;
570 char *kaddr = page_address(page); 568 char *kaddr = page_address(page);
571 unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); 569 unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
572 unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); 570 unsigned to = ((char *)dir - kaddr) +
571 nilfs_rec_len_from_disk(dir->rec_len);
573 struct nilfs_dir_entry *pde = NULL; 572 struct nilfs_dir_entry *pde = NULL;
574 struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from); 573 struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
575 int err; 574 int err;
@@ -590,7 +589,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
590 err = nilfs_prepare_chunk(page, mapping, from, to); 589 err = nilfs_prepare_chunk(page, mapping, from, to);
591 BUG_ON(err); 590 BUG_ON(err);
592 if (pde) 591 if (pde)
593 pde->rec_len = cpu_to_le16(to - from); 592 pde->rec_len = nilfs_rec_len_to_disk(to - from);
594 dir->inode = 0; 593 dir->inode = 0;
595 nilfs_commit_chunk(page, mapping, from, to); 594 nilfs_commit_chunk(page, mapping, from, to);
596 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 595 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -624,14 +623,14 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
624 memset(kaddr, 0, chunk_size); 623 memset(kaddr, 0, chunk_size);
625 de = (struct nilfs_dir_entry *)kaddr; 624 de = (struct nilfs_dir_entry *)kaddr;
626 de->name_len = 1; 625 de->name_len = 1;
627 de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1)); 626 de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1));
628 memcpy(de->name, ".\0\0", 4); 627 memcpy(de->name, ".\0\0", 4);
629 de->inode = cpu_to_le64(inode->i_ino); 628 de->inode = cpu_to_le64(inode->i_ino);
630 nilfs_set_de_type(de, inode); 629 nilfs_set_de_type(de, inode);
631 630
632 de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1)); 631 de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
633 de->name_len = 2; 632 de->name_len = 2;
634 de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1)); 633 de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1));
635 de->inode = cpu_to_le64(parent->i_ino); 634 de->inode = cpu_to_le64(parent->i_ino);
636 memcpy(de->name, "..\0", 4); 635 memcpy(de->name, "..\0", 4);
637 nilfs_set_de_type(de, inode); 636 nilfs_set_de_type(de, inode);
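All raw le16 conversions of rec_len in dir.c are funneled through nilfs_rec_len_from_disk/nilfs_rec_len_to_disk. A minimal plausible pair of definitions follows; the real helpers exist so the encoding can special-case block sizes whose record lengths exceed 16 bits, which this sketch does not attempt:

/* Minimal plausible rec_len helpers (userspace model). */
#include <stdint.h>
#include <endian.h>

static unsigned toy_rec_len_from_disk(uint16_t dlen)
{
        return le16toh(dlen);
}

static uint16_t toy_rec_len_to_disk(unsigned len)
{
        return htole16((uint16_t)len);
}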
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 236753df5cdf..324d80c57518 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -27,47 +27,43 @@
 #include "alloc.h"
 #include "dat.h"
 
-static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
+static inline __le64 *nilfs_direct_dptrs(const struct nilfs_bmap *direct)
 {
 	return (__le64 *)
-		((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1);
+		((struct nilfs_direct_node *)direct->b_u.u_data + 1);
 }
 
 static inline __u64
-nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key)
+nilfs_direct_get_ptr(const struct nilfs_bmap *direct, __u64 key)
 {
-	return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key));
+	return le64_to_cpu(*(nilfs_direct_dptrs(direct) + key));
 }
 
-static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct,
+static inline void nilfs_direct_set_ptr(struct nilfs_bmap *direct,
 					__u64 key, __u64 ptr)
 {
-	*(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr);
+	*(nilfs_direct_dptrs(direct) + key) = cpu_to_le64(ptr);
 }
 
-static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
+static int nilfs_direct_lookup(const struct nilfs_bmap *direct,
 			       __u64 key, int level, __u64 *ptrp)
 {
-	struct nilfs_direct *direct;
 	__u64 ptr;
 
-	direct = (struct nilfs_direct *)bmap;	/* XXX: use macro for level 1 */
 	if (key > NILFS_DIRECT_KEY_MAX || level != 1)
 		return -ENOENT;
 	ptr = nilfs_direct_get_ptr(direct, key);
 	if (ptr == NILFS_BMAP_INVALID_PTR)
 		return -ENOENT;
 
-	if (ptrp != NULL)
-		*ptrp = ptr;
+	*ptrp = ptr;
 	return 0;
 }
 
-static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
+static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
 				      __u64 key, __u64 *ptrp,
 				      unsigned maxblocks)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	struct inode *dat = NULL;
 	__u64 ptr, ptr2;
 	sector_t blocknr;
@@ -79,8 +75,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
 	if (ptr == NILFS_BMAP_INVALID_PTR)
 		return -ENOENT;
 
-	if (NILFS_BMAP_USE_VBN(bmap)) {
-		dat = nilfs_bmap_get_dat(bmap);
+	if (NILFS_BMAP_USE_VBN(direct)) {
+		dat = nilfs_bmap_get_dat(direct);
 		ret = nilfs_dat_translate(dat, ptr, &blocknr);
 		if (ret < 0)
 			return ret;
@@ -106,29 +102,21 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
 }
 
 static __u64
-nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
+nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key)
 {
 	__u64 ptr;
 
-	ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key);
+	ptr = nilfs_bmap_find_target_seq(direct, key);
 	if (ptr != NILFS_BMAP_INVALID_PTR)
 		/* sequential access */
 		return ptr;
 	else
 		/* block group */
-		return nilfs_bmap_find_target_in_group(&direct->d_bmap);
-}
-
-static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
-				      __u64 key, __u64 ptr)
-{
-	direct->d_bmap.b_last_allocated_key = key;
-	direct->d_bmap.b_last_allocated_ptr = ptr;
+		return nilfs_bmap_find_target_in_group(direct);
 }
 
 static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	union nilfs_bmap_ptr_req req;
 	struct inode *dat = NULL;
 	struct buffer_head *bh;
@@ -136,11 +124,11 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 
 	if (key > NILFS_DIRECT_KEY_MAX)
 		return -ENOENT;
-	if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
+	if (nilfs_direct_get_ptr(bmap, key) != NILFS_BMAP_INVALID_PTR)
 		return -EEXIST;
 
 	if (NILFS_BMAP_USE_VBN(bmap)) {
-		req.bpr_ptr = nilfs_direct_find_target_v(direct, key);
+		req.bpr_ptr = nilfs_direct_find_target_v(bmap, key);
 		dat = nilfs_bmap_get_dat(bmap);
 	}
 	ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
@@ -150,13 +138,13 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 		set_buffer_nilfs_volatile(bh);
 
 		nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
-		nilfs_direct_set_ptr(direct, key, req.bpr_ptr);
+		nilfs_direct_set_ptr(bmap, key, req.bpr_ptr);
 
 		if (!nilfs_bmap_dirty(bmap))
 			nilfs_bmap_set_dirty(bmap);
 
 		if (NILFS_BMAP_USE_VBN(bmap))
-			nilfs_direct_set_target_v(direct, key, req.bpr_ptr);
+			nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr);
 
 		nilfs_bmap_add_blocks(bmap, 1);
 	}
@@ -165,33 +153,30 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
 
 static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	union nilfs_bmap_ptr_req req;
 	struct inode *dat;
 	int ret;
 
 	if (key > NILFS_DIRECT_KEY_MAX ||
-	    nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
+	    nilfs_direct_get_ptr(bmap, key) == NILFS_BMAP_INVALID_PTR)
 		return -ENOENT;
 
 	dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
-	req.bpr_ptr = nilfs_direct_get_ptr(direct, key);
+	req.bpr_ptr = nilfs_direct_get_ptr(bmap, key);
 
 	ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
 	if (!ret) {
 		nilfs_bmap_commit_end_ptr(bmap, &req, dat);
-		nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
+		nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR);
 		nilfs_bmap_sub_blocks(bmap, 1);
 	}
 	return ret;
 }
 
-static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
+static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp)
 {
-	struct nilfs_direct *direct;
 	__u64 key, lastkey;
 
-	direct = (struct nilfs_direct *)bmap;
 	lastkey = NILFS_DIRECT_KEY_MAX + 1;
 	for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++)
 		if (nilfs_direct_get_ptr(direct, key) !=
@@ -211,15 +196,13 @@ static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key)
 	return key > NILFS_DIRECT_KEY_MAX;
 }
 
-static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
+static int nilfs_direct_gather_data(struct nilfs_bmap *direct,
 				    __u64 *keys, __u64 *ptrs, int nitems)
 {
-	struct nilfs_direct *direct;
 	__u64 key;
 	__u64 ptr;
 	int n;
 
-	direct = (struct nilfs_direct *)bmap;
 	if (nitems > NILFS_DIRECT_NBLOCKS)
 		nitems = NILFS_DIRECT_NBLOCKS;
 	n = 0;
@@ -237,7 +220,6 @@ static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
 int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 				    __u64 key, __u64 *keys, __u64 *ptrs, int n)
 {
-	struct nilfs_direct *direct;
 	__le64 *dptrs;
 	int ret, i, j;
 
@@ -253,12 +235,11 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 	bmap->b_ops->bop_clear(bmap);
 
 	/* convert */
-	direct = (struct nilfs_direct *)bmap;
-	dptrs = nilfs_direct_dptrs(direct);
+	dptrs = nilfs_direct_dptrs(bmap);
 	for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) {
 		if ((j < n) && (i == keys[j])) {
 			dptrs[i] = (i != key) ?
-				nilfs_bmap_ptr_to_dptr(ptrs[j]) :
+				cpu_to_le64(ptrs[j]) :
 				NILFS_BMAP_INVALID_PTR;
 			j++;
 		} else
@@ -269,10 +250,9 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
 	return 0;
 }
 
-static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
+static int nilfs_direct_propagate(struct nilfs_bmap *bmap,
 				  struct buffer_head *bh)
 {
-	struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
 	struct nilfs_palloc_req oldreq, newreq;
 	struct inode *dat;
 	__u64 key;
@@ -284,7 +264,7 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
 
 	dat = nilfs_bmap_get_dat(bmap);
 	key = nilfs_bmap_data_get_key(bmap, bh);
-	ptr = nilfs_direct_get_ptr(direct, key);
+	ptr = nilfs_direct_get_ptr(bmap, key);
 	if (!buffer_nilfs_volatile(bh)) {
 		oldreq.pr_entry_nr = ptr;
 		newreq.pr_entry_nr = ptr;
@@ -294,20 +274,20 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
 		nilfs_dat_commit_update(dat, &oldreq, &newreq,
 					bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
 		set_buffer_nilfs_volatile(bh);
-		nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr);
+		nilfs_direct_set_ptr(bmap, key, newreq.pr_entry_nr);
 	} else
 		ret = nilfs_dat_mark_dirty(dat, ptr);
 
 	return ret;
 }
 
-static int nilfs_direct_assign_v(struct nilfs_direct *direct,
+static int nilfs_direct_assign_v(struct nilfs_bmap *direct,
 				 __u64 key, __u64 ptr,
 				 struct buffer_head **bh,
 				 sector_t blocknr,
 				 union nilfs_binfo *binfo)
 {
-	struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap);
+	struct inode *dat = nilfs_bmap_get_dat(direct);
 	union nilfs_bmap_ptr_req req;
 	int ret;
 
@@ -315,13 +295,13 @@ static int nilfs_direct_assign_v(struct nilfs_direct *direct,
 	ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
 	if (!ret) {
 		nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
-		binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
-		binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+		binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr);
+		binfo->bi_v.bi_blkoff = cpu_to_le64(key);
 	}
 	return ret;
 }
 
-static int nilfs_direct_assign_p(struct nilfs_direct *direct,
+static int nilfs_direct_assign_p(struct nilfs_bmap *direct,
 				 __u64 key, __u64 ptr,
 				 struct buffer_head **bh,
 				 sector_t blocknr,
@@ -329,7 +309,7 @@ static int nilfs_direct_assign_p(struct nilfs_direct *direct,
 {
 	nilfs_direct_set_ptr(direct, key, blocknr);
 
-	binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
+	binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
 	binfo->bi_dat.bi_level = 0;
 
 	return 0;
@@ -340,18 +320,16 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
 			       sector_t blocknr,
 			       union nilfs_binfo *binfo)
 {
-	struct nilfs_direct *direct;
 	__u64 key;
 	__u64 ptr;
 
-	direct = (struct nilfs_direct *)bmap;
 	key = nilfs_bmap_data_get_key(bmap, *bh);
 	if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
 		printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
 		       (unsigned long long)key);
 		return -EINVAL;
 	}
-	ptr = nilfs_direct_get_ptr(direct, key);
+	ptr = nilfs_direct_get_ptr(bmap, key);
 	if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
 		printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
 		       (unsigned long long)ptr);
@@ -359,8 +337,8 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
 	}
 
 	return NILFS_BMAP_USE_VBN(bmap) ?
-		nilfs_direct_assign_v(direct, key, ptr, bh, blocknr, binfo) :
-		nilfs_direct_assign_p(direct, key, ptr, bh, blocknr, binfo);
+		nilfs_direct_assign_v(bmap, key, ptr, bh, blocknr, binfo) :
+		nilfs_direct_assign_p(bmap, key, ptr, bh, blocknr, binfo);
 }
 
 static const struct nilfs_bmap_operations nilfs_direct_ops = {
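Every change in this file is the same refactoring: struct nilfs_direct wrapped a struct nilfs_bmap as its only member, so each (struct nilfs_direct *)bmap cast merely renamed a pointer. A standalone sketch (C11, stand-in types rather than the real nilfs ones) of the layout rule that made those casts legal and now makes them removable:

	#include <assert.h>
	#include <stddef.h>

	struct bmap { int dummy; };			/* stand-in */
	struct direct { struct bmap d_bmap; };		/* old-style wrapper */

	int main(void)
	{
		struct direct d;

		/* C guarantees no padding before the first member, so the
		 * wrapper and its embedded bmap share an address. */
		static_assert(offsetof(struct direct, d_bmap) == 0, "first member");
		assert((void *)&d == (void *)&d.d_bmap);
		return 0;
	}

Once the wrapper carries no state of its own, passing struct nilfs_bmap * directly, as the hunks above now do, says the same thing without any cast.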
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index a5ffd66e25d0..dc643de20a25 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -28,8 +28,6 @@
 #include "bmap.h"
 
 
-struct nilfs_direct;
-
 /**
  * struct nilfs_direct_node - direct node
  * @dn_flags: flags
@@ -40,15 +38,6 @@ struct nilfs_direct_node {
 	__u8 pad[7];
 };
 
-/**
- * struct nilfs_direct - direct mapping
- * @d_bmap: bmap structure
- */
-struct nilfs_direct {
-	struct nilfs_bmap d_bmap;
-};
-
-
 #define NILFS_DIRECT_NBLOCKS	(NILFS_BMAP_SIZE / sizeof(__le64) - 1)
 #define NILFS_DIRECT_KEY_MIN	0
 #define NILFS_DIRECT_KEY_MAX	(NILFS_DIRECT_NBLOCKS - 1)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 145f03cd7d3e..bed3a783129b 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -48,6 +48,8 @@
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include "nilfs.h"
+#include "btree.h"
+#include "btnode.h"
 #include "page.h"
 #include "mdt.h"
 #include "dat.h"
@@ -149,8 +151,10 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
 int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
 				   __u64 vbn, struct buffer_head **out_bh)
 {
-	int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
-					    vbn ? : pbn, pbn, out_bh);
+	int ret;
+
+	ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
+					vbn ? : pbn, pbn, READ, out_bh, &pbn);
 	if (ret == -EEXIST) /* internal code (cache hit) */
 		ret = 0;
 	return ret;
@@ -164,10 +168,15 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
 	if (buffer_dirty(bh))
 		return -EEXIST;
 
-	if (buffer_nilfs_node(bh))
+	if (buffer_nilfs_node(bh)) {
+		if (nilfs_btree_broken_node_block(bh)) {
+			clear_buffer_uptodate(bh);
+			return -EIO;
+		}
 		nilfs_btnode_mark_dirty(bh);
-	else
+	} else {
 		nilfs_mdt_mark_buffer_dirty(bh);
+	}
 	return 0;
 }
 
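The nilfs_btnode_submit_block() call above keys the GC cache lookup with `vbn ? : pbn`, GCC's two-operand conditional: `a ? : b` evaluates a once and yields it when nonzero, otherwise b. A tiny standalone illustration (names are mine, not nilfs code):

	#include <stdio.h>

	/* Prefer the virtual block number when one exists, else fall back
	 * to the physical block number -- the pattern used above. */
	static unsigned long pick_key(unsigned long vbn, unsigned long pbn)
	{
		return vbn ? : pbn;	/* GNU C; portably: vbn ? vbn : pbn */
	}

	int main(void)
	{
		printf("%lu %lu\n", pick_key(7, 3), pick_key(0, 3));	/* prints: 7 3 */
		return 0;
	}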
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 024be8c35bb6..d01aff4957d9 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -28,6 +28,7 @@
 #include <linux/swap.h>
 #include <linux/slab.h>
 #include "nilfs.h"
+#include "btnode.h"
 #include "segment.h"
 #include "page.h"
 #include "mdt.h"
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 47d6d7928122..0842d775b3e0 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -32,7 +32,6 @@
 #include "the_nilfs.h"
 #include "sb.h"
 #include "bmap.h"
-#include "bmap_union.h"
 
 /*
  * nilfs inode data in memory
@@ -41,7 +40,7 @@ struct nilfs_inode_info {
 	__u32 i_flags;
 	unsigned long  i_state;		/* Dynamic state flags */
 	struct nilfs_bmap *i_bmap;
-	union nilfs_bmap_union i_bmap_union;
+	struct nilfs_bmap i_bmap_data;
 	__u64 i_xattr;	/* sector_t ??? */
 	__u32 i_dir_start_lookup;
 	__u64 i_cno;		/* check point number for GC inode */
@@ -71,9 +70,7 @@ static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
 static inline struct nilfs_inode_info *
 NILFS_BMAP_I(const struct nilfs_bmap *bmap)
 {
-	return container_of((union nilfs_bmap_union *)bmap,
-			    struct nilfs_inode_info,
-			    i_bmap_union);
+	return container_of(bmap, struct nilfs_inode_info, i_bmap_data);
 }
 
 static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
@@ -107,6 +104,14 @@ enum {
 };
 
 /*
+ * commit flags for nilfs_commit_super and nilfs_sync_super
+ */
+enum {
+	NILFS_SB_COMMIT = 0,	/* Commit a super block alternately */
+	NILFS_SB_COMMIT_ALL	/* Commit both super blocks */
+};
+
+/*
  * Macros to check inode numbers
  */
 #define NILFS_MDT_INO_BITS \
@@ -270,7 +275,14 @@ extern struct nilfs_super_block *
 nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
 extern int nilfs_store_magic_and_option(struct super_block *,
 					struct nilfs_super_block *, char *);
+extern int nilfs_check_feature_compatibility(struct super_block *,
+					     struct nilfs_super_block *);
+extern void nilfs_set_log_cursor(struct nilfs_super_block *,
+				 struct the_nilfs *);
+extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *,
+						      int flip);
 extern int nilfs_commit_super(struct nilfs_sb_info *, int);
+extern int nilfs_cleanup_super(struct nilfs_sb_info *);
 extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
 extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
 
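The NILFS_BMAP_I() hunk above leans on container_of(): given a pointer to the i_bmap_data member embedded in nilfs_inode_info, it recovers the enclosing structure by subtracting the member's offset. A userspace sketch of the same mechanics with mock types (the kernel macro adds a type check on top of this):

	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct bmap { int state; };		/* stand-in for nilfs_bmap */
	struct inode_info {			/* stand-in for nilfs_inode_info */
		unsigned long i_state;
		struct bmap i_bmap_data;	/* embedded, as in the new layout */
	};

	int main(void)
	{
		struct inode_info ii = { .i_state = 42 };
		struct bmap *bmap = &ii.i_bmap_data;

		/* Recover the enclosing inode info from the embedded bmap. */
		printf("%lu\n", container_of(bmap, struct inode_info, i_bmap_data)->i_state);
		return 0;
	}

Embedding the bmap directly (rather than the old union) lets the container_of() collapse to one line, since the member name now names the real object.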
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 8de3e1e48130..aab11db2cb08 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -37,7 +37,8 @@
 
 #define NILFS_BUFFER_INHERENT_BITS  \
 	((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
-	 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated))
+	 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated) | \
+	 (1UL << BH_NILFS_Checked))
 
 static struct buffer_head *
 __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
@@ -129,6 +130,7 @@ void nilfs_forget_buffer(struct buffer_head *bh)
 
 	lock_buffer(bh);
 	clear_buffer_nilfs_volatile(bh);
+	clear_buffer_nilfs_checked(bh);
 	clear_buffer_dirty(bh);
 	if (nilfs_page_buffers_clean(page))
 		__nilfs_clear_page_dirty(page);
@@ -480,6 +482,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping)
 			lock_buffer(bh);
 			clear_buffer_dirty(bh);
 			clear_buffer_nilfs_volatile(bh);
+			clear_buffer_nilfs_checked(bh);
 			clear_buffer_uptodate(bh);
 			clear_buffer_mapped(bh);
 			unlock_buffer(bh);
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 8abca4d1c1f8..f53d8da41ed7 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -34,11 +34,13 @@ enum {
 	BH_NILFS_Allocated = BH_PrivateStart,
 	BH_NILFS_Node,
 	BH_NILFS_Volatile,
+	BH_NILFS_Checked,
 };
 
 BUFFER_FNS(NILFS_Allocated, nilfs_allocated)	/* nilfs private buffers */
 BUFFER_FNS(NILFS_Node, nilfs_node)		/* nilfs node buffers */
 BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
+BUFFER_FNS(NILFS_Checked, nilfs_checked)	/* buffer is verified */
 
 
 void nilfs_mark_buffer_dirty(struct buffer_head *bh);
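Each BUFFER_FNS(Bit, name) line stamps out the buffer-state accessors the page.c hunks call. An abbreviated sketch of what the macro in include/linux/buffer_head.h expands to (kernels of this era; shortened here):

	#define BUFFER_FNS(bit, name)					\
	static inline void set_buffer_##name(struct buffer_head *bh)	\
	{								\
		set_bit(BH_##bit, &(bh)->b_state);			\
	}								\
	static inline void clear_buffer_##name(struct buffer_head *bh)	\
	{								\
		clear_bit(BH_##bit, &(bh)->b_state);			\
	}								\
	static inline int buffer_##name(const struct buffer_head *bh)	\
	{								\
		return test_bit(BH_##bit, &(bh)->b_state);		\
	}

So the single BUFFER_FNS(NILFS_Checked, nilfs_checked) line is what brings set_buffer_nilfs_checked(), clear_buffer_nilfs_checked() and buffer_nilfs_checked() into existence.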
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index bae2a516b4ee..83e3d8c61a01 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -91,27 +91,9 @@ static int nilfs_warn_segment_error(int err)
 	return -EINVAL;
 }
 
-static void store_segsum_info(struct nilfs_segsum_info *ssi,
-			      struct nilfs_segment_summary *sum,
-			      unsigned int blocksize)
-{
-	ssi->flags = le16_to_cpu(sum->ss_flags);
-	ssi->seg_seq = le64_to_cpu(sum->ss_seq);
-	ssi->ctime = le64_to_cpu(sum->ss_create);
-	ssi->next = le64_to_cpu(sum->ss_next);
-	ssi->nblocks = le32_to_cpu(sum->ss_nblocks);
-	ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo);
-	ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes);
-
-	ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
-	ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
-
-	/* need to verify ->ss_bytes field if read ->ss_cno */
-}
-
 /**
- * calc_crc_cont - check CRC of blocks continuously
- * @sbi: nilfs_sb_info
+ * nilfs_compute_checksum - compute checksum of blocks continuously
+ * @nilfs: nilfs object
  * @bhs: buffer head of start block
  * @sum: place to store result
  * @offset: offset bytes in the first block
@@ -119,23 +101,25 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi,
  * @start: DBN of start block
  * @nblock: number of blocks to be checked
  */
-static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
-			 u32 *sum, unsigned long offset, u64 check_bytes,
-			 sector_t start, unsigned long nblock)
+static int nilfs_compute_checksum(struct the_nilfs *nilfs,
+				  struct buffer_head *bhs, u32 *sum,
+				  unsigned long offset, u64 check_bytes,
+				  sector_t start, unsigned long nblock)
 {
-	unsigned long blocksize = sbi->s_super->s_blocksize;
+	unsigned int blocksize = nilfs->ns_blocksize;
 	unsigned long size;
 	u32 crc;
 
 	BUG_ON(offset >= blocksize);
 	check_bytes -= offset;
 	size = min_t(u64, check_bytes, blocksize - offset);
-	crc = crc32_le(sbi->s_nilfs->ns_crc_seed,
+	crc = crc32_le(nilfs->ns_crc_seed,
 		       (unsigned char *)bhs->b_data + offset, size);
 	if (--nblock > 0) {
 		do {
-			struct buffer_head *bh
-				= sb_bread(sbi->s_super, ++start);
+			struct buffer_head *bh;
+
+			bh = __bread(nilfs->ns_bdev, ++start, blocksize);
 			if (!bh)
 				return -EIO;
 			check_bytes -= size;
@@ -150,12 +134,12 @@ static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
 
 /**
  * nilfs_read_super_root_block - read super root block
- * @sb: super_block
+ * @nilfs: nilfs object
  * @sr_block: disk block number of the super root block
  * @pbh: address of a buffer_head pointer to return super root buffer
  * @check: CRC check flag
  */
-int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
+int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block,
 				struct buffer_head **pbh, int check)
 {
 	struct buffer_head *bh_sr;
@@ -164,7 +148,7 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
 	int ret;
 
 	*pbh = NULL;
-	bh_sr = sb_bread(sb, sr_block);
+	bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize);
 	if (unlikely(!bh_sr)) {
 		ret = NILFS_SEG_FAIL_IO;
 		goto failed;
@@ -174,12 +158,13 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
 	if (check) {
 		unsigned bytes = le16_to_cpu(sr->sr_bytes);
 
-		if (bytes == 0 || bytes > sb->s_blocksize) {
+		if (bytes == 0 || bytes > nilfs->ns_blocksize) {
 			ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
 			goto failed_bh;
 		}
-		if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc,
-				  sizeof(sr->sr_sum), bytes, sr_block, 1)) {
+		if (nilfs_compute_checksum(
+			    nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes,
+			    sr_block, 1)) {
 			ret = NILFS_SEG_FAIL_IO;
 			goto failed_bh;
 		}
@@ -199,64 +184,76 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
 }
 
 /**
- * load_segment_summary - read segment summary of the specified partial segment
- * @sbi: nilfs_sb_info
- * @pseg_start: start disk block number of partial segment
- * @seg_seq: sequence number requested
- * @ssi: pointer to nilfs_segsum_info struct to store information
+ * nilfs_read_log_header - read summary header of the specified log
+ * @nilfs: nilfs object
+ * @start_blocknr: start block number of the log
+ * @sum: pointer to return segment summary structure
  */
-static int
-load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
-		     u64 seg_seq, struct nilfs_segsum_info *ssi)
+static struct buffer_head *
+nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr,
+		      struct nilfs_segment_summary **sum)
 {
 	struct buffer_head *bh_sum;
-	struct nilfs_segment_summary *sum;
+
+	bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize);
+	if (bh_sum)
+		*sum = (struct nilfs_segment_summary *)bh_sum->b_data;
+	return bh_sum;
+}
+
+/**
+ * nilfs_validate_log - verify consistency of log
+ * @nilfs: nilfs object
+ * @seg_seq: sequence number of segment
+ * @bh_sum: buffer head of summary block
+ * @sum: segment summary struct
+ */
+static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq,
+			      struct buffer_head *bh_sum,
+			      struct nilfs_segment_summary *sum)
+{
 	unsigned long nblock;
 	u32 crc;
-	int ret = NILFS_SEG_FAIL_IO;
+	int ret;
 
-	bh_sum = sb_bread(sbi->s_super, pseg_start);
-	if (!bh_sum)
+	ret = NILFS_SEG_FAIL_MAGIC;
+	if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC)
 		goto out;
 
-	sum = (struct nilfs_segment_summary *)bh_sum->b_data;
-
-	/* Check consistency of segment summary */
-	if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) {
-		ret = NILFS_SEG_FAIL_MAGIC;
-		goto failed;
-	}
-	store_segsum_info(ssi, sum, sbi->s_super->s_blocksize);
-	if (seg_seq != ssi->seg_seq) {
-		ret = NILFS_SEG_FAIL_SEQ;
-		goto failed;
-	}
+	ret = NILFS_SEG_FAIL_SEQ;
+	if (le64_to_cpu(sum->ss_seq) != seg_seq)
+		goto out;
 
-	nblock = ssi->nblocks;
-	if (unlikely(nblock == 0 ||
-		     nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
+	nblock = le32_to_cpu(sum->ss_nblocks);
+	ret = NILFS_SEG_FAIL_CONSISTENCY;
+	if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment))
 		/* This limits the number of blocks read in the CRC check */
-		ret = NILFS_SEG_FAIL_CONSISTENCY;
-		goto failed;
-	}
-	if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum),
-			  ((u64)nblock << sbi->s_super->s_blocksize_bits),
-			  pseg_start, nblock)) {
-		ret = NILFS_SEG_FAIL_IO;
-		goto failed;
-	}
-	if (crc == le32_to_cpu(sum->ss_datasum))
-		ret = 0;
-	else
-		ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
- failed:
-	brelse(bh_sum);
- out:
+		goto out;
+
+	ret = NILFS_SEG_FAIL_IO;
+	if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum),
+				   ((u64)nblock << nilfs->ns_blocksize_bits),
+				   bh_sum->b_blocknr, nblock))
+		goto out;
+
+	ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
+	if (crc != le32_to_cpu(sum->ss_datasum))
+		goto out;
+	ret = 0;
+out:
 	return ret;
 }
 
-static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
-			unsigned int *offset, unsigned int bytes)
+/**
+ * nilfs_read_summary_info - read an item on summary blocks of a log
+ * @nilfs: nilfs object
+ * @pbh: the current buffer head on summary blocks [in, out]
+ * @offset: the current byte offset on summary blocks [in, out]
+ * @bytes: byte size of the item to be read
+ */
+static void *nilfs_read_summary_info(struct the_nilfs *nilfs,
+				     struct buffer_head **pbh,
+				     unsigned int *offset, unsigned int bytes)
 {
 	void *ptr;
 	sector_t blocknr;
@@ -265,7 +262,8 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
 	if (bytes > (*pbh)->b_size - *offset) {
 		blocknr = (*pbh)->b_blocknr;
 		brelse(*pbh);
-		*pbh = sb_bread(sb, blocknr + 1);
+		*pbh = __bread(nilfs->ns_bdev, blocknr + 1,
+			       nilfs->ns_blocksize);
 		if (unlikely(!*pbh))
 			return NULL;
 		*offset = 0;
@@ -275,9 +273,18 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
 	return ptr;
 }
 
-static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
-			unsigned int *offset, unsigned int bytes,
-			unsigned long count)
+/**
+ * nilfs_skip_summary_info - skip items on summary blocks of a log
+ * @nilfs: nilfs object
+ * @pbh: the current buffer head on summary blocks [in, out]
+ * @offset: the current byte offset on summary blocks [in, out]
+ * @bytes: byte size of the item to be skipped
+ * @count: number of items to be skipped
+ */
+static void nilfs_skip_summary_info(struct the_nilfs *nilfs,
+				    struct buffer_head **pbh,
+				    unsigned int *offset, unsigned int bytes,
+				    unsigned long count)
 {
 	unsigned int rest_item_in_current_block
 		= ((*pbh)->b_size - *offset) / bytes;
@@ -294,36 +301,46 @@ static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
 		*offset = bytes * (count - (bcnt - 1) * nitem_per_block);
 
 		brelse(*pbh);
-		*pbh = sb_bread(sb, blocknr + bcnt);
+		*pbh = __bread(nilfs->ns_bdev, blocknr + bcnt,
+			       nilfs->ns_blocksize);
 	}
 }
 
-static int
-collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
-			   struct nilfs_segsum_info *ssi,
-			   struct list_head *head)
+/**
+ * nilfs_scan_dsync_log - get block information of a log written for data sync
+ * @nilfs: nilfs object
+ * @start_blocknr: start block number of the log
+ * @sum: log summary information
+ * @head: list head to add nilfs_recovery_block struct
+ */
+static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr,
+				struct nilfs_segment_summary *sum,
+				struct list_head *head)
 {
 	struct buffer_head *bh;
 	unsigned int offset;
-	unsigned long nfinfo = ssi->nfinfo;
-	sector_t blocknr = sum_blocknr + ssi->nsumblk;
+	u32 nfinfo, sumbytes;
+	sector_t blocknr;
 	ino_t ino;
 	int err = -EIO;
 
+	nfinfo = le32_to_cpu(sum->ss_nfinfo);
 	if (!nfinfo)
 		return 0;
 
-	bh = sb_bread(sbi->s_super, sum_blocknr);
+	sumbytes = le32_to_cpu(sum->ss_sumbytes);
+	blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize);
+	bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize);
 	if (unlikely(!bh))
 		goto out;
 
-	offset = le16_to_cpu(
-		((struct nilfs_segment_summary *)bh->b_data)->ss_bytes);
+	offset = le16_to_cpu(sum->ss_bytes);
 	for (;;) {
 		unsigned long nblocks, ndatablk, nnodeblk;
 		struct nilfs_finfo *finfo;
 
-		finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo));
+		finfo = nilfs_read_summary_info(nilfs, &bh, &offset,
+						sizeof(*finfo));
 		if (unlikely(!finfo))
 			goto out;
 
@@ -336,8 +353,8 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
 			struct nilfs_recovery_block *rb;
 			struct nilfs_binfo_v *binfo;
 
-			binfo = segsum_get(sbi->s_super, &bh, &offset,
-					   sizeof(*binfo));
+			binfo = nilfs_read_summary_info(nilfs, &bh, &offset,
+							sizeof(*binfo));
 			if (unlikely(!binfo))
 				goto out;
 
@@ -355,9 +372,9 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
 		}
 		if (--nfinfo == 0)
 			break;
-		blocknr += nnodeblk; /* always 0 for the data sync segments */
-		segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64),
-			    nnodeblk);
+		blocknr += nnodeblk; /* always 0 for data sync logs */
+		nilfs_skip_summary_info(nilfs, &bh, &offset, sizeof(__le64),
+					nnodeblk);
 		if (unlikely(!bh))
 			goto out;
 	}
@@ -467,14 +484,14 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
 	return err;
 }
 
-static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
+static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
 				     struct nilfs_recovery_block *rb,
 				     struct page *page)
 {
 	struct buffer_head *bh_org;
 	void *kaddr;
 
-	bh_org = sb_bread(sbi->s_super, rb->blocknr);
+	bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize);
 	if (unlikely(!bh_org))
 		return -EIO;
 
@@ -485,13 +502,14 @@ static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
 	return 0;
 }
 
-static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
-				struct list_head *head,
-				unsigned long *nr_salvaged_blocks)
+static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
+				      struct nilfs_sb_info *sbi,
+				      struct list_head *head,
+				      unsigned long *nr_salvaged_blocks)
 {
 	struct inode *inode;
 	struct nilfs_recovery_block *rb, *n;
-	unsigned blocksize = sbi->s_super->s_blocksize;
+	unsigned blocksize = nilfs->ns_blocksize;
 	struct page *page;
 	loff_t pos;
 	int err = 0, err2 = 0;
@@ -511,7 +529,7 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
 		if (unlikely(err))
 			goto failed_inode;
 
-		err = nilfs_recovery_copy_block(sbi, rb, page);
+		err = nilfs_recovery_copy_block(nilfs, rb, page);
 		if (unlikely(err))
 			goto failed_page;
 
@@ -551,18 +569,20 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
 /**
  * nilfs_do_roll_forward - salvage logical segments newer than the latest
  * checkpoint
+ * @nilfs: nilfs object
  * @sbi: nilfs_sb_info
- * @nilfs: the_nilfs
  * @ri: pointer to a nilfs_recovery_info
  */
 static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 				 struct nilfs_sb_info *sbi,
 				 struct nilfs_recovery_info *ri)
 {
-	struct nilfs_segsum_info ssi;
+	struct buffer_head *bh_sum = NULL;
+	struct nilfs_segment_summary *sum;
 	sector_t pseg_start;
 	sector_t seg_start, seg_end;  /* Starting/ending DBN of full segment */
 	unsigned long nsalvaged_blocks = 0;
+	unsigned int flags;
 	u64 seg_seq;
 	__u64 segnum, nextnum = 0;
 	int empty_seg = 0;
@@ -581,8 +601,14 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 	nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
 
 	while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
+		brelse(bh_sum);
+		bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
+		if (!bh_sum) {
+			err = -EIO;
+			goto failed;
+		}
 
-		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
+		ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
 		if (ret) {
 			if (ret == NILFS_SEG_FAIL_IO) {
 				err = -EIO;
@@ -590,33 +616,38 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 			}
 			goto strayed;
 		}
-		if (unlikely(NILFS_SEG_HAS_SR(&ssi)))
+
+		flags = le16_to_cpu(sum->ss_flags);
+		if (flags & NILFS_SS_SR)
 			goto confused;
 
 		/* Found a valid partial segment; do recovery actions */
-		nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
+		nextnum = nilfs_get_segnum_of_block(nilfs,
+						    le64_to_cpu(sum->ss_next));
 		empty_seg = 0;
-		nilfs->ns_ctime = ssi.ctime;
-		if (!(ssi.flags & NILFS_SS_GC))
-			nilfs->ns_nongc_ctime = ssi.ctime;
+		nilfs->ns_ctime = le64_to_cpu(sum->ss_create);
+		if (!(flags & NILFS_SS_GC))
+			nilfs->ns_nongc_ctime = nilfs->ns_ctime;
 
 		switch (state) {
 		case RF_INIT_ST:
-			if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi))
+			if (!(flags & NILFS_SS_LOGBGN) ||
+			    !(flags & NILFS_SS_SYNDT))
 				goto try_next_pseg;
 			state = RF_DSYNC_ST;
 			/* Fall through */
 		case RF_DSYNC_ST:
-			if (!NILFS_SEG_DSYNC(&ssi))
+			if (!(flags & NILFS_SS_SYNDT))
 				goto confused;
 
-			err = collect_blocks_from_segsum(
-				sbi, pseg_start, &ssi, &dsync_blocks);
+			err = nilfs_scan_dsync_log(nilfs, pseg_start, sum,
+						   &dsync_blocks);
 			if (unlikely(err))
 				goto failed;
-			if (NILFS_SEG_LOGEND(&ssi)) {
-				err = recover_dsync_blocks(
-					sbi, &dsync_blocks, &nsalvaged_blocks);
+			if (flags & NILFS_SS_LOGEND) {
+				err = nilfs_recover_dsync_blocks(
+					nilfs, sbi, &dsync_blocks,
+					&nsalvaged_blocks);
 				if (unlikely(err))
 					goto failed;
 				state = RF_INIT_ST;
@@ -627,7 +658,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
  try_next_pseg:
 		if (pseg_start == ri->ri_lsegs_end)
 			break;
-		pseg_start += ssi.nblocks;
+		pseg_start += le32_to_cpu(sum->ss_nblocks);
 		if (pseg_start < seg_end)
 			continue;
 		goto feed_segment;
@@ -652,8 +683,9 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 		ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
 	}
  out:
+	brelse(bh_sum);
 	dispose_recovery_list(&dsync_blocks);
-	nilfs_detach_writer(sbi->s_nilfs, sbi);
+	nilfs_detach_writer(nilfs, sbi);
 	return err;
 
  confused:
@@ -667,7 +699,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
 }
 
 static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
-				      struct nilfs_sb_info *sbi,
 				      struct nilfs_recovery_info *ri)
 {
 	struct buffer_head *bh;
@@ -677,7 +708,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
 	    nilfs_get_segnum_of_block(nilfs, ri->ri_super_root))
 		return;
 
-	bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start);
+	bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize);
 	BUG_ON(!bh);
 	memset(bh->b_data, 0, bh->b_size);
 	set_buffer_dirty(bh);
@@ -690,9 +721,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
 }
 
 /**
- * nilfs_recover_logical_segments - salvage logical segments written after
- * the latest super root
- * @nilfs: the_nilfs
+ * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint
+ * @nilfs: nilfs object
  * @sbi: nilfs_sb_info
  * @ri: pointer to a nilfs_recovery_info struct to store search results.
  *
@@ -709,9 +739,9 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
  *
  * %-ENOMEM - Insufficient memory available.
  */
-int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
-				   struct nilfs_sb_info *sbi,
-				   struct nilfs_recovery_info *ri)
+int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
+			      struct nilfs_sb_info *sbi,
+			      struct nilfs_recovery_info *ri)
 {
 	int err;
 
@@ -751,7 +781,7 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
 			goto failed;
 		}
 
-		nilfs_finish_roll_forward(nilfs, sbi, ri);
+		nilfs_finish_roll_forward(nilfs, ri);
 	}
 
  failed:
@@ -762,7 +792,6 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
 /**
  * nilfs_search_super_root - search the latest valid super root
  * @nilfs: the_nilfs
- * @sbi: nilfs_sb_info
  * @ri: pointer to a nilfs_recovery_info struct to store search results.
  *
  * nilfs_search_super_root() looks for the latest super-root from a partial
@@ -775,14 +804,19 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
  * %-EINVAL - No valid segment found
  *
  * %-EIO - I/O error
+ *
+ * %-ENOMEM - Insufficient memory available.
  */
-int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
+int nilfs_search_super_root(struct the_nilfs *nilfs,
 			    struct nilfs_recovery_info *ri)
 {
-	struct nilfs_segsum_info ssi;
+	struct buffer_head *bh_sum = NULL;
+	struct nilfs_segment_summary *sum;
 	sector_t pseg_start, pseg_end, sr_pseg_start = 0;
 	sector_t seg_start, seg_end;  /* range of full segment (block number) */
 	sector_t b, end;
+	unsigned long nblocks;
+	unsigned int flags;
 	u64 seg_seq;
 	__u64 segnum, nextnum = 0;
 	__u64 cno;
@@ -801,17 +835,24 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 	/* Read ahead segment */
 	b = seg_start;
 	while (b <= seg_end)
-		sb_breadahead(sbi->s_super, b++);
+		__breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize);
 
 	for (;;) {
-		/* Load segment summary */
-		ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi);
+		brelse(bh_sum);
+		ret = NILFS_SEG_FAIL_IO;
+		bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum);
+		if (!bh_sum)
+			goto failed;
+
+		ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum);
 		if (ret) {
 			if (ret == NILFS_SEG_FAIL_IO)
 				goto failed;
 			goto strayed;
 		}
-		pseg_end = pseg_start + ssi.nblocks - 1;
+
+		nblocks = le32_to_cpu(sum->ss_nblocks);
+		pseg_end = pseg_start + nblocks - 1;
 		if (unlikely(pseg_end > seg_end)) {
 			ret = NILFS_SEG_FAIL_CONSISTENCY;
 			goto strayed;
@@ -821,11 +862,13 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 		ri->ri_pseg_start = pseg_start;
 		ri->ri_seq = seg_seq;
 		ri->ri_segnum = segnum;
-		nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
+		nextnum = nilfs_get_segnum_of_block(nilfs,
+						    le64_to_cpu(sum->ss_next));
 		ri->ri_nextnum = nextnum;
 		empty_seg = 0;
 
-		if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) {
+		flags = le16_to_cpu(sum->ss_flags);
+		if (!(flags & NILFS_SS_SR) && !scan_newer) {
 			/* This will never happen because a superblock
 			   (last_segment) always points to a pseg
 			   having a super root. */
@@ -836,14 +879,15 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 		if (pseg_start == seg_start) {
 			nilfs_get_segment_range(nilfs, nextnum, &b, &end);
 			while (b <= end)
-				sb_breadahead(sbi->s_super, b++);
+				__breadahead(nilfs->ns_bdev, b++,
+					     nilfs->ns_blocksize);
 		}
-		if (!NILFS_SEG_HAS_SR(&ssi)) {
-			if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
+		if (!(flags & NILFS_SS_SR)) {
+			if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) {
 				ri->ri_lsegs_start = pseg_start;
 				ri->ri_lsegs_start_seq = seg_seq;
 			}
-			if (NILFS_SEG_LOGEND(&ssi))
+			if (flags & NILFS_SS_LOGEND)
 				ri->ri_lsegs_end = pseg_start;
 			goto try_next_pseg;
 		}
@@ -854,12 +898,12 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 		ri->ri_lsegs_start = ri->ri_lsegs_end = 0;
 
 		nilfs_dispose_segment_list(&segments);
-		nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start)
-			+ ssi.nblocks - seg_start;
+		sr_pseg_start = pseg_start;
+		nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start;
 		nilfs->ns_seg_seq = seg_seq;
 		nilfs->ns_segnum = segnum;
 		nilfs->ns_cno = cno;  /* nilfs->ns_cno = ri->ri_cno + 1 */
-		nilfs->ns_ctime = ssi.ctime;
+		nilfs->ns_ctime = le64_to_cpu(sum->ss_create);
 		nilfs->ns_nextnum = nextnum;
 
 		if (scan_newer)
@@ -870,15 +914,9 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 			scan_newer = 1;
 		}
 
-		/* reset region for roll-forward */
-		pseg_start += ssi.nblocks;
-		if (pseg_start < seg_end)
-			continue;
-		goto feed_segment;
-
  try_next_pseg:
 		/* Standing on a course, or met an inconsistent state */
-		pseg_start += ssi.nblocks;
+		pseg_start += nblocks;
 		if (pseg_start < seg_end)
 			continue;
 		goto feed_segment;
@@ -909,6 +947,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 
  super_root_found:
 	/* Updating pointers relating to the latest checkpoint */
+	brelse(bh_sum);
 	list_splice_tail(&segments, &ri->ri_used_segments);
 	nilfs->ns_last_pseg = sr_pseg_start;
 	nilfs->ns_last_seq = nilfs->ns_seg_seq;
@@ -916,6 +955,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
 	return 0;
 
  failed:
+	brelse(bh_sum);
 	nilfs_dispose_segment_list(&segments);
 	return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
 }
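nilfs_compute_checksum() above carries one CRC across many blocks by feeding the running value back in as the seed of each successive crc32_le() call. A standalone sketch of that chaining shape, using zlib's crc32() as a stand-in for the kernel's crc32_le() (seed conventions and bit order differ between the two; the loop structure is the point):

	#include <stddef.h>
	#include <zlib.h>

	/* Chain a CRC over nblocks buffers of blocksize bytes each, the
	 * way the recovery code walks the log one __bread() at a time. */
	static unsigned long checksum_blocks(const unsigned char *const *blocks,
					     size_t nblocks, size_t blocksize,
					     unsigned long seed)
	{
		unsigned long crc = seed;
		size_t i;

		for (i = 0; i < nblocks; i++)
			crc = crc32(crc, blocks[i], (unsigned int)blocksize);
		return crc;
	}

The on-disk ss_datasum thus covers a whole log with a single value even though it is computed block by block.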
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index 85fbb66455e2..b04f08cc2397 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -54,17 +54,6 @@ struct nilfs_segsum_info {
 	sector_t		next;
 };
 
-/* macro for the flags */
-#define NILFS_SEG_HAS_SR(sum)    ((sum)->flags & NILFS_SS_SR)
-#define NILFS_SEG_LOGBGN(sum)    ((sum)->flags & NILFS_SS_LOGBGN)
-#define NILFS_SEG_LOGEND(sum)    ((sum)->flags & NILFS_SS_LOGEND)
-#define NILFS_SEG_DSYNC(sum)     ((sum)->flags & NILFS_SS_SYNDT)
-#define NILFS_SEG_SIMPLEX(sum) \
-	(((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \
-	 (NILFS_SS_LOGBGN | NILFS_SS_LOGEND))
-
-#define NILFS_SEG_EMPTY(sum)	((sum)->nblocks == (sum)->nsumblk)
-
 /**
  * struct nilfs_segment_buffer - Segment buffer
  * @sb_super: back pointer to a superblock struct
@@ -141,6 +130,19 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
 				struct buffer_head **);
 void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
 
+static inline int nilfs_segbuf_simplex(struct nilfs_segment_buffer *segbuf)
+{
+	unsigned int flags = segbuf->sb_sum.flags;
+
+	return (flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) ==
+		(NILFS_SS_LOGBGN | NILFS_SS_LOGEND);
+}
+
+static inline int nilfs_segbuf_empty(struct nilfs_segment_buffer *segbuf)
+{
+	return segbuf->sb_sum.nblocks == segbuf->sb_sum.nsumblk;
+}
+
 static inline void
 nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
 			       struct buffer_head *bh)
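Turning the NILFS_SEG_SIMPLEX/NILFS_SEG_EMPTY macros into the static inlines above also buys type safety: the macros accepted anything with a ->flags or ->nblocks field, while an inline rejects the wrong pointer type at compile time. A reduced illustration with mock names:

	struct seg_sum { unsigned int flags, nblocks, nsumblk; };	/* mock */

	#define SS_LOGBGN	0x001	/* mock flag values */
	#define SS_LOGEND	0x002

	/* Typed replacement for a flags-test macro: passing anything other
	 * than a struct seg_sum * is now a compile-time error. */
	static inline int seg_simplex(const struct seg_sum *sum)
	{
		return (sum->flags & (SS_LOGBGN | SS_LOGEND)) ==
			(SS_LOGBGN | SS_LOGEND);
	}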
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c9201649cc49..9fd051a33c4f 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1914,12 +1914,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 			}
 		}
 
-		if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) {
-			if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) {
+		if (!nilfs_segbuf_simplex(segbuf)) {
+			if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) {
 				set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
 				sci->sc_lseg_stime = jiffies;
 			}
-			if (NILFS_SEG_LOGEND(&segbuf->sb_sum))
+			if (segbuf->sb_sum.flags & NILFS_SS_LOGEND)
 				clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
 		}
 	}
@@ -1951,7 +1951,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 	if (update_sr) {
 		nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
 				       segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
-		set_nilfs_sb_dirty(nilfs);
 
 		clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
 		clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
@@ -2082,7 +2081,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2082 2081
2083 /* Avoid empty segment */ 2082 /* Avoid empty segment */
2084 if (sci->sc_stage.scnt == NILFS_ST_DONE && 2083 if (sci->sc_stage.scnt == NILFS_ST_DONE &&
2085 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { 2084 nilfs_segbuf_empty(sci->sc_curseg)) {
2086 nilfs_segctor_abort_construction(sci, nilfs, 1); 2085 nilfs_segctor_abort_construction(sci, nilfs, 1);
2087 goto out; 2086 goto out;
2088 } 2087 }
@@ -2408,6 +2407,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2408{ 2407{
2409 struct nilfs_sb_info *sbi = sci->sc_sbi; 2408 struct nilfs_sb_info *sbi = sci->sc_sbi;
2410 struct the_nilfs *nilfs = sbi->s_nilfs; 2409 struct the_nilfs *nilfs = sbi->s_nilfs;
2410 struct nilfs_super_block **sbp;
2411 int err = 0; 2411 int err = 0;
2412 2412
2413 nilfs_segctor_accept(sci); 2413 nilfs_segctor_accept(sci);
@@ -2423,8 +2423,13 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2423 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && 2423 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2424 nilfs_discontinued(nilfs)) { 2424 nilfs_discontinued(nilfs)) {
2425 down_write(&nilfs->ns_sem); 2425 down_write(&nilfs->ns_sem);
2426 err = nilfs_commit_super( 2426 err = -EIO;
2427 sbi, nilfs_altsb_need_update(nilfs)); 2427 sbp = nilfs_prepare_super(sbi,
2428 nilfs_sb_will_flip(nilfs));
2429 if (likely(sbp)) {
2430 nilfs_set_log_cursor(sbp[0], nilfs);
2431 err = nilfs_commit_super(sbi, NILFS_SB_COMMIT);
2432 }
2428 up_write(&nilfs->ns_sem); 2433 up_write(&nilfs->ns_sem);
2429 } 2434 }
2430 } 2435 }
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 01e20dbb217d..17c487bd8152 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -234,13 +234,13 @@ extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
234extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *); 234extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
235 235
236/* recovery.c */ 236/* recovery.c */
237extern int nilfs_read_super_root_block(struct super_block *, sector_t, 237extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t,
238 struct buffer_head **, int); 238 struct buffer_head **, int);
239extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *, 239extern int nilfs_search_super_root(struct the_nilfs *,
240 struct nilfs_recovery_info *); 240 struct nilfs_recovery_info *);
241extern int nilfs_recover_logical_segments(struct the_nilfs *, 241extern int nilfs_salvage_orphan_logs(struct the_nilfs *,
242 struct nilfs_sb_info *, 242 struct nilfs_sb_info *,
243 struct nilfs_recovery_info *); 243 struct nilfs_recovery_info *);
244extern void nilfs_dispose_segment_list(struct list_head *); 244extern void nilfs_dispose_segment_list(struct list_head *);
245 245
246#endif /* _NILFS_SEGMENT_H */ 246#endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 414ef68931cf..26078b3407c9 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -55,6 +55,8 @@
55#include "nilfs.h" 55#include "nilfs.h"
56#include "mdt.h" 56#include "mdt.h"
57#include "alloc.h" 57#include "alloc.h"
58#include "btree.h"
59#include "btnode.h"
58#include "page.h" 60#include "page.h"
59#include "cpfile.h" 61#include "cpfile.h"
60#include "ifile.h" 62#include "ifile.h"
@@ -74,6 +76,25 @@ struct kmem_cache *nilfs_btree_path_cache;
74 76
75static int nilfs_remount(struct super_block *sb, int *flags, char *data); 77static int nilfs_remount(struct super_block *sb, int *flags, char *data);
76 78
79static void nilfs_set_error(struct nilfs_sb_info *sbi)
80{
81 struct the_nilfs *nilfs = sbi->s_nilfs;
82 struct nilfs_super_block **sbp;
83
84 down_write(&nilfs->ns_sem);
85 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
86 nilfs->ns_mount_state |= NILFS_ERROR_FS;
87 sbp = nilfs_prepare_super(sbi, 0);
88 if (likely(sbp)) {
89 sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
90 if (sbp[1])
91 sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
92 nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
93 }
94 }
95 up_write(&nilfs->ns_sem);
96}
97
77/** 98/**
78 * nilfs_error() - report failure condition on a filesystem 99 * nilfs_error() - report failure condition on a filesystem
79 * 100 *
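nilfs_set_error() factors the error-marking steps out of nilfs_error(): take ns_sem, record NILFS_ERROR_FS once, mirror the flag into both super block copies via nilfs_prepare_super(), and commit with NILFS_SB_COMMIT_ALL. A runnable sketch of that sticky-flag idea, with simplified types standing in for the nilfs structures and locking omitted:

#include <stdio.h>

#define ERROR_FS 0x2

struct fsinfo { unsigned mount_state; unsigned sb_state[2]; };

static void set_error(struct fsinfo *fs)
{
	if (fs->mount_state & ERROR_FS)
		return;			/* already recorded: no-op */
	fs->mount_state |= ERROR_FS;
	fs->sb_state[0] |= ERROR_FS;
	fs->sb_state[1] |= ERROR_FS;	/* mirror into the spare copy */
	/* a real implementation would commit both copies here */
}

int main(void)
{
	struct fsinfo fs = { 0 };
	set_error(&fs);
	set_error(&fs);			/* second call changes nothing */
	printf("%#x %#x %#x\n", fs.mount_state,
	       fs.sb_state[0], fs.sb_state[1]);	/* 0x2 0x2 0x2 */
	return 0;
}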
@@ -99,16 +120,7 @@ void nilfs_error(struct super_block *sb, const char *function,
99 va_end(args); 120 va_end(args);
100 121
101 if (!(sb->s_flags & MS_RDONLY)) { 122 if (!(sb->s_flags & MS_RDONLY)) {
102 struct the_nilfs *nilfs = sbi->s_nilfs; 123 nilfs_set_error(sbi);
103
104 down_write(&nilfs->ns_sem);
105 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
106 nilfs->ns_mount_state |= NILFS_ERROR_FS;
107 nilfs->ns_sbp[0]->s_state |=
108 cpu_to_le16(NILFS_ERROR_FS);
109 nilfs_commit_super(sbi, 1);
110 }
111 up_write(&nilfs->ns_sem);
112 124
113 if (nilfs_test_opt(sbi, ERRORS_RO)) { 125 if (nilfs_test_opt(sbi, ERRORS_RO)) {
114 printk(KERN_CRIT "Remounting filesystem read-only\n"); 126 printk(KERN_CRIT "Remounting filesystem read-only\n");
@@ -176,7 +188,7 @@ static void nilfs_clear_inode(struct inode *inode)
176 nilfs_btnode_cache_clear(&ii->i_btnode_cache); 188 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
177} 189}
178 190
179static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb) 191static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
180{ 192{
181 struct the_nilfs *nilfs = sbi->s_nilfs; 193 struct the_nilfs *nilfs = sbi->s_nilfs;
182 int err; 194 int err;
@@ -202,12 +214,20 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
202 printk(KERN_ERR 214 printk(KERN_ERR
203 "NILFS: unable to write superblock (err=%d)\n", err); 215 "NILFS: unable to write superblock (err=%d)\n", err);
204 if (err == -EIO && nilfs->ns_sbh[1]) { 216 if (err == -EIO && nilfs->ns_sbh[1]) {
217 /*
 218 * sbp[0] points to a newer log than sbp[1]; copy
 219 * sbp[0] into sbp[1] so the spare copy can take over.
220 */
221 memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0],
222 nilfs->ns_sbsize);
205 nilfs_fall_back_super_block(nilfs); 223 nilfs_fall_back_super_block(nilfs);
206 goto retry; 224 goto retry;
207 } 225 }
208 } else { 226 } else {
209 struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; 227 struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
210 228
229 nilfs->ns_sbwcount++;
230
211 /* 231 /*
212 * The latest segment becomes trailable from the position 232 * The latest segment becomes trailable from the position
213 * written in superblock. 233 * written in superblock.
@@ -216,66 +236,122 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
216 236
217 /* update GC protection for recent segments */ 237 /* update GC protection for recent segments */
218 if (nilfs->ns_sbh[1]) { 238 if (nilfs->ns_sbh[1]) {
219 sbp = NULL; 239 if (flag == NILFS_SB_COMMIT_ALL) {
220 if (dupsb) {
221 set_buffer_dirty(nilfs->ns_sbh[1]); 240 set_buffer_dirty(nilfs->ns_sbh[1]);
222 if (!sync_dirty_buffer(nilfs->ns_sbh[1])) 241 if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0)
223 sbp = nilfs->ns_sbp[1]; 242 goto out;
224 } 243 }
244 if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) <
245 le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno))
246 sbp = nilfs->ns_sbp[1];
225 } 247 }
226 if (sbp) {
227 spin_lock(&nilfs->ns_last_segment_lock);
228 nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
229 spin_unlock(&nilfs->ns_last_segment_lock);
230 }
231 }
232 248
249 spin_lock(&nilfs->ns_last_segment_lock);
250 nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
251 spin_unlock(&nilfs->ns_last_segment_lock);
252 }
253 out:
233 return err; 254 return err;
234} 255}
235 256
236int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb) 257void nilfs_set_log_cursor(struct nilfs_super_block *sbp,
258 struct the_nilfs *nilfs)
259{
260 sector_t nfreeblocks;
261
262 /* nilfs->ns_sem must be locked by the caller. */
263 nilfs_count_free_blocks(nilfs, &nfreeblocks);
264 sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks);
265
266 spin_lock(&nilfs->ns_last_segment_lock);
267 sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
268 sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
269 sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
270 spin_unlock(&nilfs->ns_last_segment_lock);
271}
272
273struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
274 int flip)
237{ 275{
238 struct the_nilfs *nilfs = sbi->s_nilfs; 276 struct the_nilfs *nilfs = sbi->s_nilfs;
239 struct nilfs_super_block **sbp = nilfs->ns_sbp; 277 struct nilfs_super_block **sbp = nilfs->ns_sbp;
240 sector_t nfreeblocks;
241 time_t t;
242 int err;
243 278
244 /* nilfs->sem must be locked by the caller. */ 279 /* nilfs->ns_sem must be locked by the caller. */
245 if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { 280 if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
246 if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) 281 if (sbp[1] &&
247 nilfs_swap_super_block(nilfs); 282 sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) {
248 else { 283 memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
284 } else {
249 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", 285 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
250 sbi->s_super->s_id); 286 sbi->s_super->s_id);
251 return -EIO; 287 return NULL;
252 } 288 }
289 } else if (sbp[1] &&
290 sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) {
291 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
253 } 292 }
254 err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
255 if (unlikely(err)) {
256 printk(KERN_ERR "NILFS: failed to count free blocks\n");
257 return err;
258 }
259 spin_lock(&nilfs->ns_last_segment_lock);
260 sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
261 sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
262 sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
263 spin_unlock(&nilfs->ns_last_segment_lock);
264 293
294 if (flip && sbp[1])
295 nilfs_swap_super_block(nilfs);
296
297 return sbp;
298}
299
300int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag)
301{
302 struct the_nilfs *nilfs = sbi->s_nilfs;
303 struct nilfs_super_block **sbp = nilfs->ns_sbp;
304 time_t t;
305
306 /* nilfs->ns_sem must be locked by the caller. */
265 t = get_seconds(); 307 t = get_seconds();
266 nilfs->ns_sbwtime[0] = t; 308 nilfs->ns_sbwtime = t;
267 sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks);
268 sbp[0]->s_wtime = cpu_to_le64(t); 309 sbp[0]->s_wtime = cpu_to_le64(t);
269 sbp[0]->s_sum = 0; 310 sbp[0]->s_sum = 0;
270 sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, 311 sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
271 (unsigned char *)sbp[0], 312 (unsigned char *)sbp[0],
272 nilfs->ns_sbsize)); 313 nilfs->ns_sbsize));
273 if (dupsb && sbp[1]) { 314 if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) {
274 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 315 sbp[1]->s_wtime = sbp[0]->s_wtime;
275 nilfs->ns_sbwtime[1] = t; 316 sbp[1]->s_sum = 0;
317 sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
318 (unsigned char *)sbp[1],
319 nilfs->ns_sbsize));
276 } 320 }
277 clear_nilfs_sb_dirty(nilfs); 321 clear_nilfs_sb_dirty(nilfs);
278 return nilfs_sync_super(sbi, dupsb); 322 return nilfs_sync_super(sbi, flag);
323}
324
325/**
326 * nilfs_cleanup_super() - write filesystem state for cleanup
327 * @sbi: nilfs_sb_info to be unmounted or degraded to read-only
328 *
329 * This function restores state flags in the on-disk super block.
 330 * It sets the "clean" flag (i.e. NILFS_VALID_FS) unless the
 331 * filesystem was already marked unclean.
332 */
333int nilfs_cleanup_super(struct nilfs_sb_info *sbi)
334{
335 struct nilfs_super_block **sbp;
336 int flag = NILFS_SB_COMMIT;
337 int ret = -EIO;
338
339 sbp = nilfs_prepare_super(sbi, 0);
340 if (sbp) {
341 sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state);
342 nilfs_set_log_cursor(sbp[0], sbi->s_nilfs);
343 if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) {
344 /*
 345 * copy the "clean" flag to the opposite super
 346 * block as well when both super blocks point to
 347 * the same checkpoint.
348 */
349 sbp[1]->s_state = sbp[0]->s_state;
350 flag = NILFS_SB_COMMIT_ALL;
351 }
352 ret = nilfs_commit_super(sbi, flag);
353 }
354 return ret;
279} 355}
280 356
281static void nilfs_put_super(struct super_block *sb) 357static void nilfs_put_super(struct super_block *sb)
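The rewrite above splits super block writing into nilfs_prepare_super() (pick and repair the copies), nilfs_set_log_cursor() (fill in the log position), and nilfs_commit_super() (timestamp, checksum, write). The checksum step uses the common self-checksumming trick: zero the embedded sum field, CRC the whole block, then store the result back into that field. A userspace sketch of that trick, using zlib's crc32() in place of the kernel's seeded crc32_le(); the struct layout is illustrative only.

/* build: cc crc.c -lz */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>

struct sb { uint32_t s_sum; char payload[60]; };

static void seal(struct sb *sbp)
{
	sbp->s_sum = 0;		/* checksum field must be zero while summing */
	sbp->s_sum = (uint32_t)crc32(0L, (const unsigned char *)sbp,
				     sizeof(*sbp));
}

static int verify(const struct sb *sbp)
{
	struct sb tmp = *sbp;

	tmp.s_sum = 0;
	return (uint32_t)crc32(0L, (const unsigned char *)&tmp,
			       sizeof(tmp)) == sbp->s_sum;
}

int main(void)
{
	struct sb s = { 0 };

	strcpy(s.payload, "super block");
	seal(&s);
	printf("valid=%d\n", verify(&s));	/* valid=1 */
	return 0;
}

Splitting prepare/cursor/commit this way lets each caller decide whether to flip the copies and which cursor to store before the checksum is sealed.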
@@ -289,8 +365,7 @@ static void nilfs_put_super(struct super_block *sb)
289 365
290 if (!(sb->s_flags & MS_RDONLY)) { 366 if (!(sb->s_flags & MS_RDONLY)) {
291 down_write(&nilfs->ns_sem); 367 down_write(&nilfs->ns_sem);
292 nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); 368 nilfs_cleanup_super(sbi);
293 nilfs_commit_super(sbi, 1);
294 up_write(&nilfs->ns_sem); 369 up_write(&nilfs->ns_sem);
295 } 370 }
296 down_write(&nilfs->ns_super_sem); 371 down_write(&nilfs->ns_super_sem);
@@ -311,6 +386,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
311{ 386{
312 struct nilfs_sb_info *sbi = NILFS_SB(sb); 387 struct nilfs_sb_info *sbi = NILFS_SB(sb);
313 struct the_nilfs *nilfs = sbi->s_nilfs; 388 struct the_nilfs *nilfs = sbi->s_nilfs;
389 struct nilfs_super_block **sbp;
314 int err = 0; 390 int err = 0;
315 391
316 /* This function is called when super block should be written back */ 392 /* This function is called when super block should be written back */
@@ -318,8 +394,13 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
318 err = nilfs_construct_segment(sb); 394 err = nilfs_construct_segment(sb);
319 395
320 down_write(&nilfs->ns_sem); 396 down_write(&nilfs->ns_sem);
321 if (nilfs_sb_dirty(nilfs)) 397 if (nilfs_sb_dirty(nilfs)) {
322 nilfs_commit_super(sbi, 1); 398 sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs));
399 if (likely(sbp)) {
400 nilfs_set_log_cursor(sbp[0], nilfs);
401 nilfs_commit_super(sbi, NILFS_SB_COMMIT);
402 }
403 }
323 up_write(&nilfs->ns_sem); 404 up_write(&nilfs->ns_sem);
324 405
325 return err; 406 return err;
@@ -442,20 +523,20 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
442 struct nilfs_sb_info *sbi = NILFS_SB(sb); 523 struct nilfs_sb_info *sbi = NILFS_SB(sb);
443 524
444 if (!nilfs_test_opt(sbi, BARRIER)) 525 if (!nilfs_test_opt(sbi, BARRIER))
445 seq_printf(seq, ",nobarrier"); 526 seq_puts(seq, ",nobarrier");
446 if (nilfs_test_opt(sbi, SNAPSHOT)) 527 if (nilfs_test_opt(sbi, SNAPSHOT))
447 seq_printf(seq, ",cp=%llu", 528 seq_printf(seq, ",cp=%llu",
448 (unsigned long long int)sbi->s_snapshot_cno); 529 (unsigned long long int)sbi->s_snapshot_cno);
449 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 530 if (nilfs_test_opt(sbi, ERRORS_PANIC))
450 seq_printf(seq, ",errors=panic"); 531 seq_puts(seq, ",errors=panic");
451 if (nilfs_test_opt(sbi, ERRORS_CONT)) 532 if (nilfs_test_opt(sbi, ERRORS_CONT))
452 seq_printf(seq, ",errors=continue"); 533 seq_puts(seq, ",errors=continue");
453 if (nilfs_test_opt(sbi, STRICT_ORDER)) 534 if (nilfs_test_opt(sbi, STRICT_ORDER))
454 seq_printf(seq, ",order=strict"); 535 seq_puts(seq, ",order=strict");
455 if (nilfs_test_opt(sbi, NORECOVERY)) 536 if (nilfs_test_opt(sbi, NORECOVERY))
456 seq_printf(seq, ",norecovery"); 537 seq_puts(seq, ",norecovery");
457 if (nilfs_test_opt(sbi, DISCARD)) 538 if (nilfs_test_opt(sbi, DISCARD))
458 seq_printf(seq, ",discard"); 539 seq_puts(seq, ",discard");
459 540
460 return 0; 541 return 0;
461} 542}
@@ -524,23 +605,25 @@ static const struct export_operations nilfs_export_ops = {
524 605
525enum { 606enum {
526 Opt_err_cont, Opt_err_panic, Opt_err_ro, 607 Opt_err_cont, Opt_err_panic, Opt_err_ro,
527 Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, 608 Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery,
528 Opt_discard, Opt_err, 609 Opt_discard, Opt_nodiscard, Opt_err,
529}; 610};
530 611
531static match_table_t tokens = { 612static match_table_t tokens = {
532 {Opt_err_cont, "errors=continue"}, 613 {Opt_err_cont, "errors=continue"},
533 {Opt_err_panic, "errors=panic"}, 614 {Opt_err_panic, "errors=panic"},
534 {Opt_err_ro, "errors=remount-ro"}, 615 {Opt_err_ro, "errors=remount-ro"},
616 {Opt_barrier, "barrier"},
535 {Opt_nobarrier, "nobarrier"}, 617 {Opt_nobarrier, "nobarrier"},
536 {Opt_snapshot, "cp=%u"}, 618 {Opt_snapshot, "cp=%u"},
537 {Opt_order, "order=%s"}, 619 {Opt_order, "order=%s"},
538 {Opt_norecovery, "norecovery"}, 620 {Opt_norecovery, "norecovery"},
539 {Opt_discard, "discard"}, 621 {Opt_discard, "discard"},
622 {Opt_nodiscard, "nodiscard"},
540 {Opt_err, NULL} 623 {Opt_err, NULL}
541}; 624};
542 625
543static int parse_options(char *options, struct super_block *sb) 626static int parse_options(char *options, struct super_block *sb, int is_remount)
544{ 627{
545 struct nilfs_sb_info *sbi = NILFS_SB(sb); 628 struct nilfs_sb_info *sbi = NILFS_SB(sb);
546 char *p; 629 char *p;
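The token table gains barrier/nobarrier and discard/nodiscard pairs, and parse_options() learns an is_remount flag so snapshot transitions can be rejected during remount. Each set/clear pair reduces to toggling one bit in the mount-option mask; a strsep()-based userspace sketch of that part (the kernel's match_token() handles the argument-bearing tokens such as cp=%u):

#include <stdio.h>
#include <string.h>	/* strsep(): available on glibc and the BSDs */

#define OPT_BARRIER 0x1
#define OPT_DISCARD 0x2

static unsigned long parse_opts(char *options)
{
	unsigned long mask = OPT_BARRIER;	/* barrier on by default */
	char *p;

	while ((p = strsep(&options, ",")) != NULL) {
		if (!*p)
			continue;
		if (!strcmp(p, "barrier"))
			mask |= OPT_BARRIER;
		else if (!strcmp(p, "nobarrier"))
			mask &= ~OPT_BARRIER;
		else if (!strcmp(p, "discard"))
			mask |= OPT_DISCARD;
		else if (!strcmp(p, "nodiscard"))
			mask &= ~OPT_DISCARD;
		else
			fprintf(stderr, "unknown option \"%s\"\n", p);
	}
	return mask;
}

int main(void)
{
	char opts[] = "nobarrier,discard";

	printf("mask=%#lx\n", parse_opts(opts));	/* mask=0x2 */
	return 0;
}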
@@ -557,6 +640,9 @@ static int parse_options(char *options, struct super_block *sb)
557 640
558 token = match_token(p, tokens, args); 641 token = match_token(p, tokens, args);
559 switch (token) { 642 switch (token) {
643 case Opt_barrier:
644 nilfs_set_opt(sbi, BARRIER);
645 break;
560 case Opt_nobarrier: 646 case Opt_nobarrier:
561 nilfs_clear_opt(sbi, BARRIER); 647 nilfs_clear_opt(sbi, BARRIER);
562 break; 648 break;
@@ -582,8 +668,26 @@ static int parse_options(char *options, struct super_block *sb)
582 case Opt_snapshot: 668 case Opt_snapshot:
583 if (match_int(&args[0], &option) || option <= 0) 669 if (match_int(&args[0], &option) || option <= 0)
584 return 0; 670 return 0;
585 if (!(sb->s_flags & MS_RDONLY)) 671 if (is_remount) {
672 if (!nilfs_test_opt(sbi, SNAPSHOT)) {
673 printk(KERN_ERR
674 "NILFS: cannot change regular "
675 "mount to snapshot.\n");
676 return 0;
677 } else if (option != sbi->s_snapshot_cno) {
678 printk(KERN_ERR
679 "NILFS: cannot remount to a "
680 "different snapshot.\n");
681 return 0;
682 }
683 break;
684 }
685 if (!(sb->s_flags & MS_RDONLY)) {
686 printk(KERN_ERR "NILFS: cannot mount snapshot "
687 "read/write. A read-only option is "
688 "required.\n");
586 return 0; 689 return 0;
690 }
587 sbi->s_snapshot_cno = option; 691 sbi->s_snapshot_cno = option;
588 nilfs_set_opt(sbi, SNAPSHOT); 692 nilfs_set_opt(sbi, SNAPSHOT);
589 break; 693 break;
@@ -593,6 +697,9 @@ static int parse_options(char *options, struct super_block *sb)
593 case Opt_discard: 697 case Opt_discard:
594 nilfs_set_opt(sbi, DISCARD); 698 nilfs_set_opt(sbi, DISCARD);
595 break; 699 break;
700 case Opt_nodiscard:
701 nilfs_clear_opt(sbi, DISCARD);
702 break;
596 default: 703 default:
597 printk(KERN_ERR 704 printk(KERN_ERR
598 "NILFS: Unrecognized mount option \"%s\"\n", p); 705 "NILFS: Unrecognized mount option \"%s\"\n", p);
@@ -613,11 +720,18 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi,
613static int nilfs_setup_super(struct nilfs_sb_info *sbi) 720static int nilfs_setup_super(struct nilfs_sb_info *sbi)
614{ 721{
615 struct the_nilfs *nilfs = sbi->s_nilfs; 722 struct the_nilfs *nilfs = sbi->s_nilfs;
616 struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; 723 struct nilfs_super_block **sbp;
617 int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count); 724 int max_mnt_count;
618 int mnt_count = le16_to_cpu(sbp->s_mnt_count); 725 int mnt_count;
726
727 /* nilfs->ns_sem must be locked by the caller. */
728 sbp = nilfs_prepare_super(sbi, 0);
729 if (!sbp)
730 return -EIO;
731
732 max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count);
733 mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
619 734
620 /* nilfs->sem must be locked by the caller. */
621 if (nilfs->ns_mount_state & NILFS_ERROR_FS) { 735 if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
622 printk(KERN_WARNING 736 printk(KERN_WARNING
623 "NILFS warning: mounting fs with errors\n"); 737 "NILFS warning: mounting fs with errors\n");
@@ -628,12 +742,15 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi)
628#endif 742#endif
629 } 743 }
630 if (!max_mnt_count) 744 if (!max_mnt_count)
631 sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); 745 sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
632 746
633 sbp->s_mnt_count = cpu_to_le16(mnt_count + 1); 747 sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1);
634 sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS); 748 sbp[0]->s_state =
635 sbp->s_mtime = cpu_to_le64(get_seconds()); 749 cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS);
636 return nilfs_commit_super(sbi, 1); 750 sbp[0]->s_mtime = cpu_to_le64(get_seconds());
751 /* synchronize sbp[1] with sbp[0] */
752 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
753 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL);
637} 754}
638 755
639struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, 756struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
@@ -670,7 +787,31 @@ int nilfs_store_magic_and_option(struct super_block *sb,
670 sbi->s_interval = le32_to_cpu(sbp->s_c_interval); 787 sbi->s_interval = le32_to_cpu(sbp->s_c_interval);
671 sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); 788 sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max);
672 789
 673 return !parse_options(data, sb) ? -EINVAL : 0; 790 return !parse_options(data, sb, 0) ? -EINVAL : 0;
791}
792
793int nilfs_check_feature_compatibility(struct super_block *sb,
794 struct nilfs_super_block *sbp)
795{
796 __u64 features;
797
798 features = le64_to_cpu(sbp->s_feature_incompat) &
799 ~NILFS_FEATURE_INCOMPAT_SUPP;
800 if (features) {
801 printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
802 "optional features (%llx)\n",
803 (unsigned long long)features);
804 return -EINVAL;
805 }
806 features = le64_to_cpu(sbp->s_feature_compat_ro) &
807 ~NILFS_FEATURE_COMPAT_RO_SUPP;
808 if (!(sb->s_flags & MS_RDONLY) && features) {
809 printk(KERN_ERR "NILFS: couldn't mount RDWR because of "
810 "unsupported optional features (%llx)\n",
811 (unsigned long long)features);
812 return -EINVAL;
813 }
814 return 0;
674} 815}
675 816
676/** 817/**
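nilfs_check_feature_compatibility() adopts the ext2-family feature-mask convention: any incompat bit the code does not recognize blocks the mount entirely, while unknown ro-compat bits block only read/write mounts. A compact sketch of the two masks; the _SUPP constants here are hypothetical, not the kernel's values.

#include <stdint.h>
#include <stdio.h>

#define FEATURE_INCOMPAT_SUPP	0x0001ULL	/* hypothetical */
#define FEATURE_COMPAT_RO_SUPP	0x0001ULL	/* hypothetical */

static int check_features(uint64_t incompat, uint64_t ro_compat,
			  int readonly)
{
	if (incompat & ~FEATURE_INCOMPAT_SUPP)
		return -1;			/* refuse any mount */
	if (!readonly && (ro_compat & ~FEATURE_COMPAT_RO_SUPP))
		return -1;			/* refuse r/w mount only */
	return 0;
}

int main(void)
{
	printf("%d\n", check_features(0x2, 0, 0));	/* -1: unknown incompat */
	printf("%d\n", check_features(0, 0x2, 1));	/* 0: ro mount allowed */
	return 0;
}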
@@ -819,7 +960,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
819static int nilfs_remount(struct super_block *sb, int *flags, char *data) 960static int nilfs_remount(struct super_block *sb, int *flags, char *data)
820{ 961{
821 struct nilfs_sb_info *sbi = NILFS_SB(sb); 962 struct nilfs_sb_info *sbi = NILFS_SB(sb);
822 struct nilfs_super_block *sbp;
823 struct the_nilfs *nilfs = sbi->s_nilfs; 963 struct the_nilfs *nilfs = sbi->s_nilfs;
824 unsigned long old_sb_flags; 964 unsigned long old_sb_flags;
825 struct nilfs_mount_options old_opts; 965 struct nilfs_mount_options old_opts;
@@ -833,32 +973,17 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
833 old_opts.snapshot_cno = sbi->s_snapshot_cno; 973 old_opts.snapshot_cno = sbi->s_snapshot_cno;
834 was_snapshot = nilfs_test_opt(sbi, SNAPSHOT); 974 was_snapshot = nilfs_test_opt(sbi, SNAPSHOT);
835 975
836 if (!parse_options(data, sb)) { 976 if (!parse_options(data, sb, 1)) {
837 err = -EINVAL; 977 err = -EINVAL;
838 goto restore_opts; 978 goto restore_opts;
839 } 979 }
840 sb->s_flags = (sb->s_flags & ~MS_POSIXACL); 980 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
841 981
842 err = -EINVAL; 982 err = -EINVAL;
843 if (was_snapshot) { 983 if (was_snapshot && !(*flags & MS_RDONLY)) {
844 if (!(*flags & MS_RDONLY)) { 984 printk(KERN_ERR "NILFS (device %s): cannot remount snapshot "
845 printk(KERN_ERR "NILFS (device %s): cannot remount " 985 "read/write.\n", sb->s_id);
846 "snapshot read/write.\n", 986 goto restore_opts;
847 sb->s_id);
848 goto restore_opts;
849 } else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) {
850 printk(KERN_ERR "NILFS (device %s): cannot "
851 "remount to a different snapshot.\n",
852 sb->s_id);
853 goto restore_opts;
854 }
855 } else {
856 if (nilfs_test_opt(sbi, SNAPSHOT)) {
857 printk(KERN_ERR "NILFS (device %s): cannot change "
858 "a regular mount to a snapshot.\n",
859 sb->s_id);
860 goto restore_opts;
861 }
862 } 987 }
863 988
864 if (!nilfs_valid_fs(nilfs)) { 989 if (!nilfs_valid_fs(nilfs)) {
@@ -880,19 +1005,29 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
880 * the RDONLY flag and then mark the partition as valid again. 1005 * the RDONLY flag and then mark the partition as valid again.
881 */ 1006 */
882 down_write(&nilfs->ns_sem); 1007 down_write(&nilfs->ns_sem);
883 sbp = nilfs->ns_sbp[0]; 1008 nilfs_cleanup_super(sbi);
884 if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) &&
885 (nilfs->ns_mount_state & NILFS_VALID_FS))
886 sbp->s_state = cpu_to_le16(nilfs->ns_mount_state);
887 sbp->s_mtime = cpu_to_le64(get_seconds());
888 nilfs_commit_super(sbi, 1);
889 up_write(&nilfs->ns_sem); 1009 up_write(&nilfs->ns_sem);
890 } else { 1010 } else {
1011 __u64 features;
1012
891 /* 1013 /*
892 * Mounting a RDONLY partition read-write, so reread and 1014 * Mounting a RDONLY partition read-write, so reread and
893 * store the current valid flag. (It may have been changed 1015 * store the current valid flag. (It may have been changed
894 * by fsck since we originally mounted the partition.) 1016 * by fsck since we originally mounted the partition.)
895 */ 1017 */
1018 down_read(&nilfs->ns_sem);
1019 features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
1020 ~NILFS_FEATURE_COMPAT_RO_SUPP;
1021 up_read(&nilfs->ns_sem);
1022 if (features) {
1023 printk(KERN_WARNING "NILFS (device %s): couldn't "
1024 "remount RDWR because of unsupported optional "
1025 "features (%llx)\n",
1026 sb->s_id, (unsigned long long)features);
1027 err = -EROFS;
1028 goto restore_opts;
1029 }
1030
896 sb->s_flags &= ~MS_RDONLY; 1031 sb->s_flags &= ~MS_RDONLY;
897 1032
898 err = nilfs_attach_segment_constructor(sbi); 1033 err = nilfs_attach_segment_constructor(sbi);
@@ -1119,7 +1254,7 @@ static void nilfs_inode_init_once(void *obj)
1119 init_rwsem(&ii->xattr_sem); 1254 init_rwsem(&ii->xattr_sem);
1120#endif 1255#endif
1121 nilfs_btnode_cache_init_once(&ii->i_btnode_cache); 1256 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
1122 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union; 1257 ii->i_bmap = &ii->i_bmap_data;
1123 inode_init_once(&ii->vfs_inode); 1258 inode_init_once(&ii->vfs_inode);
1124} 1259}
1125 1260
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8c1097327abc..37de1f062d81 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -38,6 +38,8 @@
38static LIST_HEAD(nilfs_objects); 38static LIST_HEAD(nilfs_objects);
39static DEFINE_SPINLOCK(nilfs_lock); 39static DEFINE_SPINLOCK(nilfs_lock);
40 40
41static int nilfs_valid_sb(struct nilfs_super_block *sbp);
42
41void nilfs_set_last_segment(struct the_nilfs *nilfs, 43void nilfs_set_last_segment(struct the_nilfs *nilfs,
42 sector_t start_blocknr, u64 seq, __u64 cno) 44 sector_t start_blocknr, u64 seq, __u64 cno)
43{ 45{
@@ -45,6 +47,16 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
45 nilfs->ns_last_pseg = start_blocknr; 47 nilfs->ns_last_pseg = start_blocknr;
46 nilfs->ns_last_seq = seq; 48 nilfs->ns_last_seq = seq;
47 nilfs->ns_last_cno = cno; 49 nilfs->ns_last_cno = cno;
50
51 if (!nilfs_sb_dirty(nilfs)) {
52 if (nilfs->ns_prev_seq == nilfs->ns_last_seq)
53 goto stay_cursor;
54
55 set_nilfs_sb_dirty(nilfs);
56 }
57 nilfs->ns_prev_seq = nilfs->ns_last_seq;
58
59 stay_cursor:
48 spin_unlock(&nilfs->ns_last_segment_lock); 60 spin_unlock(&nilfs->ns_last_segment_lock);
49} 61}
50 62
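The added logic in nilfs_set_last_segment() marks the super block dirty only when the segment sequence has advanced past the value seen at the previous mark (ns_prev_seq), so periodic writers stop rewriting an unchanged super block. A runnable sketch of that rule with a simplified cursor struct:

#include <stdio.h>

struct cursor { unsigned long long last_seq, prev_seq; int sb_dirty; };

static void set_last_segment(struct cursor *c, unsigned long long seq)
{
	c->last_seq = seq;
	if (!c->sb_dirty) {
		if (c->prev_seq == c->last_seq)
			return;		/* cursor did not advance: stay clean */
		c->sb_dirty = 1;
	}
	c->prev_seq = c->last_seq;
}

int main(void)
{
	struct cursor c = { .last_seq = 5, .prev_seq = 5 };

	set_last_segment(&c, 5);	/* same sequence, stays clean */
	set_last_segment(&c, 6);	/* advanced, marks dirty */
	printf("dirty=%d prev=%llu\n", c.sb_dirty, c.prev_seq);
	return 0;
}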
@@ -159,8 +171,7 @@ void put_nilfs(struct the_nilfs *nilfs)
159 kfree(nilfs); 171 kfree(nilfs);
160} 172}
161 173
162static int nilfs_load_super_root(struct the_nilfs *nilfs, 174static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block)
163 struct nilfs_sb_info *sbi, sector_t sr_block)
164{ 175{
165 struct buffer_head *bh_sr; 176 struct buffer_head *bh_sr;
166 struct nilfs_super_root *raw_sr; 177 struct nilfs_super_root *raw_sr;
@@ -169,7 +180,7 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
169 unsigned inode_size; 180 unsigned inode_size;
170 int err; 181 int err;
171 182
172 err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1); 183 err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1);
173 if (unlikely(err)) 184 if (unlikely(err))
174 return err; 185 return err;
175 186
@@ -248,6 +259,37 @@ static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri)
248} 259}
249 260
250/** 261/**
262 * nilfs_store_log_cursor - load log cursor from a super block
263 * @nilfs: nilfs object
264 * @sbp: buffer storing super block to be read
265 *
266 * nilfs_store_log_cursor() reads the last position of the log
267 * containing a super root from a given super block, and initializes
 268 * the relevant fields of the nilfs object in preparation for log
269 * scanning and recovery.
270 */
271static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
272 struct nilfs_super_block *sbp)
273{
274 int ret = 0;
275
276 nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
277 nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
278 nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
279
280 nilfs->ns_prev_seq = nilfs->ns_last_seq;
281 nilfs->ns_seg_seq = nilfs->ns_last_seq;
282 nilfs->ns_segnum =
283 nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
284 nilfs->ns_cno = nilfs->ns_last_cno + 1;
285 if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
286 printk(KERN_ERR "NILFS invalid last segment number.\n");
287 ret = -EINVAL;
288 }
289 return ret;
290}
291
292/**
251 * load_nilfs - load and recover the nilfs 293 * load_nilfs - load and recover the nilfs
252 * @nilfs: the_nilfs structure to be released 294 * @nilfs: the_nilfs structure to be released
253 * @sbi: nilfs_sb_info used to recover past segment 295 * @sbi: nilfs_sb_info used to recover past segment
@@ -285,13 +327,55 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
285 327
286 nilfs_init_recovery_info(&ri); 328 nilfs_init_recovery_info(&ri);
287 329
288 err = nilfs_search_super_root(nilfs, sbi, &ri); 330 err = nilfs_search_super_root(nilfs, &ri);
289 if (unlikely(err)) { 331 if (unlikely(err)) {
290 printk(KERN_ERR "NILFS: error searching super root.\n"); 332 struct nilfs_super_block **sbp = nilfs->ns_sbp;
291 goto failed; 333 int blocksize;
334
335 if (err != -EINVAL)
336 goto scan_error;
337
338 if (!nilfs_valid_sb(sbp[1])) {
339 printk(KERN_WARNING
340 "NILFS warning: unable to fall back to spare"
341 "super block\n");
342 goto scan_error;
343 }
344 printk(KERN_INFO
345 "NILFS: try rollback from an earlier position\n");
346
347 /*
348 * restore super block with its spare and reconfigure
349 * relevant states of the nilfs object.
350 */
351 memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
352 nilfs->ns_crc_seed = le32_to_cpu(sbp[0]->s_crc_seed);
353 nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
354
355 /* verify consistency between two super blocks */
356 blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
357 if (blocksize != nilfs->ns_blocksize) {
358 printk(KERN_WARNING
359 "NILFS warning: blocksize differs between "
360 "two super blocks (%d != %d)\n",
361 blocksize, nilfs->ns_blocksize);
362 goto scan_error;
363 }
364
365 err = nilfs_store_log_cursor(nilfs, sbp[0]);
366 if (err)
367 goto scan_error;
368
369 /* drop clean flag to allow roll-forward and recovery */
370 nilfs->ns_mount_state &= ~NILFS_VALID_FS;
371 valid_fs = 0;
372
373 err = nilfs_search_super_root(nilfs, &ri);
374 if (err)
375 goto scan_error;
292 } 376 }
293 377
294 err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root); 378 err = nilfs_load_super_root(nilfs, ri.ri_super_root);
295 if (unlikely(err)) { 379 if (unlikely(err)) {
296 printk(KERN_ERR "NILFS: error loading super root.\n"); 380 printk(KERN_ERR "NILFS: error loading super root.\n");
297 goto failed; 381 goto failed;
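The new load_nilfs() error path attempts a rollback when scanning from the primary super block fails: restore the primary from the spare, re-derive the log cursor, drop the clean flag so roll-forward recovery runs, and scan again. A self-contained sketch of the control flow, with stub validators standing in for the real scan; all names below are illustrative.

#include <stdio.h>
#include <string.h>

#define VALID_FS 0x1

struct fs {
	char sbp[2][32];	/* primary and spare super blocks */
	int mount_state;
};

/* Stubs standing in for the real scan/validation steps. */
static int search_super_root(struct fs *fs)
{
	return strcmp(fs->sbp[0], "good") ? -22 /* -EINVAL */ : 0;
}

static int spare_sb_valid(struct fs *fs)
{
	return !strcmp(fs->sbp[1], "good");
}

static int load_with_rollback(struct fs *fs)
{
	int err = search_super_root(fs);

	if (err && spare_sb_valid(fs)) {
		/* restore primary from the spare and retry the scan */
		memcpy(fs->sbp[0], fs->sbp[1], sizeof(fs->sbp[0]));
		fs->mount_state &= ~VALID_FS;	/* allow roll-forward */
		err = search_super_root(fs);
	}
	return err;
}

int main(void)
{
	struct fs fs = { .mount_state = VALID_FS };

	strcpy(fs.sbp[0], "bad");
	strcpy(fs.sbp[1], "good");
	printf("err=%d state=%d\n", load_with_rollback(&fs),
	       fs.mount_state);	/* err=0 state=0 */
	return 0;
}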
@@ -301,11 +385,23 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
301 goto skip_recovery; 385 goto skip_recovery;
302 386
303 if (s_flags & MS_RDONLY) { 387 if (s_flags & MS_RDONLY) {
388 __u64 features;
389
304 if (nilfs_test_opt(sbi, NORECOVERY)) { 390 if (nilfs_test_opt(sbi, NORECOVERY)) {
305 printk(KERN_INFO "NILFS: norecovery option specified. " 391 printk(KERN_INFO "NILFS: norecovery option specified. "
306 "skipping roll-forward recovery\n"); 392 "skipping roll-forward recovery\n");
307 goto skip_recovery; 393 goto skip_recovery;
308 } 394 }
395 features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
396 ~NILFS_FEATURE_COMPAT_RO_SUPP;
397 if (features) {
398 printk(KERN_ERR "NILFS: couldn't proceed with "
399 "recovery because of unsupported optional "
400 "features (%llx)\n",
401 (unsigned long long)features);
402 err = -EROFS;
403 goto failed_unload;
404 }
309 if (really_read_only) { 405 if (really_read_only) {
310 printk(KERN_ERR "NILFS: write access " 406 printk(KERN_ERR "NILFS: write access "
311 "unavailable, cannot proceed.\n"); 407 "unavailable, cannot proceed.\n");
@@ -320,14 +416,13 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
320 goto failed_unload; 416 goto failed_unload;
321 } 417 }
322 418
323 err = nilfs_recover_logical_segments(nilfs, sbi, &ri); 419 err = nilfs_salvage_orphan_logs(nilfs, sbi, &ri);
324 if (err) 420 if (err)
325 goto failed_unload; 421 goto failed_unload;
326 422
327 down_write(&nilfs->ns_sem); 423 down_write(&nilfs->ns_sem);
328 nilfs->ns_mount_state |= NILFS_VALID_FS; 424 nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */
329 nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); 425 err = nilfs_cleanup_super(sbi);
330 err = nilfs_commit_super(sbi, 1);
331 up_write(&nilfs->ns_sem); 426 up_write(&nilfs->ns_sem);
332 427
333 if (err) { 428 if (err) {
@@ -343,6 +438,10 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
343 sbi->s_super->s_flags = s_flags; 438 sbi->s_super->s_flags = s_flags;
344 return 0; 439 return 0;
345 440
441 scan_error:
442 printk(KERN_ERR "NILFS: error searching super root.\n");
443 goto failed;
444
346 failed_unload: 445 failed_unload:
347 nilfs_mdt_destroy(nilfs->ns_cpfile); 446 nilfs_mdt_destroy(nilfs->ns_cpfile);
348 nilfs_mdt_destroy(nilfs->ns_sufile); 447 nilfs_mdt_destroy(nilfs->ns_sufile);
@@ -515,8 +614,8 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
515 nilfs_swap_super_block(nilfs); 614 nilfs_swap_super_block(nilfs);
516 } 615 }
517 616
518 nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime); 617 nilfs->ns_sbwcount = 0;
519 nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0; 618 nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
520 nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq); 619 nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
521 *sbpp = sbp[0]; 620 *sbpp = sbp[0];
522 return 0; 621 return 0;
@@ -557,6 +656,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
557 if (err) 656 if (err)
558 goto out; 657 goto out;
559 658
659 err = nilfs_check_feature_compatibility(sb, sbp);
660 if (err)
661 goto out;
662
560 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); 663 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
561 if (sb->s_blocksize != blocksize && 664 if (sb->s_blocksize != blocksize &&
562 !sb_set_blocksize(sb, blocksize)) { 665 !sb_set_blocksize(sb, blocksize)) {
@@ -568,7 +671,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
568 goto out; 671 goto out;
569 } 672 }
570 673
571 blocksize = sb_min_blocksize(sb, BLOCK_SIZE); 674 blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
572 if (!blocksize) { 675 if (!blocksize) {
573 printk(KERN_ERR "NILFS: unable to set blocksize\n"); 676 printk(KERN_ERR "NILFS: unable to set blocksize\n");
574 err = -EINVAL; 677 err = -EINVAL;
@@ -582,7 +685,18 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
582 if (err) 685 if (err)
583 goto failed_sbh; 686 goto failed_sbh;
584 687
688 err = nilfs_check_feature_compatibility(sb, sbp);
689 if (err)
690 goto failed_sbh;
691
585 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); 692 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
693 if (blocksize < NILFS_MIN_BLOCK_SIZE ||
694 blocksize > NILFS_MAX_BLOCK_SIZE) {
695 printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
696 "filesystem blocksize %d\n", blocksize);
697 err = -EINVAL;
698 goto failed_sbh;
699 }
586 if (sb->s_blocksize != blocksize) { 700 if (sb->s_blocksize != blocksize) {
587 int hw_blocksize = bdev_logical_block_size(sb->s_bdev); 701 int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
588 702
@@ -604,6 +718,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
604 when reloading fails. */ 718 when reloading fails. */
605 } 719 }
606 nilfs->ns_blocksize_bits = sb->s_blocksize_bits; 720 nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
721 nilfs->ns_blocksize = blocksize;
607 722
608 err = nilfs_store_disk_layout(nilfs, sbp); 723 err = nilfs_store_disk_layout(nilfs, sbp);
609 if (err) 724 if (err)
@@ -616,23 +731,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
616 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; 731 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
617 nilfs->ns_bdi = bdi ? : &default_backing_dev_info; 732 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
618 733
619 /* Finding last segment */ 734 err = nilfs_store_log_cursor(nilfs, sbp);
620 nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); 735 if (err)
621 nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
622 nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
623
624 nilfs->ns_seg_seq = nilfs->ns_last_seq;
625 nilfs->ns_segnum =
626 nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
627 nilfs->ns_cno = nilfs->ns_last_cno + 1;
628 if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
629 printk(KERN_ERR "NILFS invalid last segment number.\n");
630 err = -EINVAL;
631 goto failed_sbh; 736 goto failed_sbh;
632 }
633 /* Dummy values */
634 nilfs->ns_free_segments_count =
635 nilfs->ns_nsegments - (nilfs->ns_segnum + 1);
636 737
637 /* Initialize gcinode cache */ 738 /* Initialize gcinode cache */
638 err = nilfs_init_gccache(nilfs); 739 err = nilfs_init_gccache(nilfs);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 1ab974533697..f785a7b0ab99 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -57,7 +57,8 @@ enum {
57 * @ns_current: back pointer to current mount 57 * @ns_current: back pointer to current mount
58 * @ns_sbh: buffer heads of on-disk super blocks 58 * @ns_sbh: buffer heads of on-disk super blocks
59 * @ns_sbp: pointers to super block data 59 * @ns_sbp: pointers to super block data
60 * @ns_sbwtime: previous write time of super blocks 60 * @ns_sbwtime: previous write time of super block
61 * @ns_sbwcount: write count of super block
61 * @ns_sbsize: size of valid data in super block 62 * @ns_sbsize: size of valid data in super block
62 * @ns_supers: list of nilfs super block structs 63 * @ns_supers: list of nilfs super block structs
63 * @ns_seg_seq: segment sequence counter 64 * @ns_seg_seq: segment sequence counter
@@ -73,7 +74,7 @@ enum {
73 * @ns_last_seq: sequence value of the latest segment 74 * @ns_last_seq: sequence value of the latest segment
74 * @ns_last_cno: checkpoint number of the latest segment 75 * @ns_last_cno: checkpoint number of the latest segment
75 * @ns_prot_seq: least sequence number of segments which must not be reclaimed 76 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
76 * @ns_free_segments_count: counter of free segments 77 * @ns_prev_seq: base sequence number used to decide if advance log cursor
77 * @ns_segctor_sem: segment constructor semaphore 78 * @ns_segctor_sem: segment constructor semaphore
78 * @ns_dat: DAT file inode 79 * @ns_dat: DAT file inode
79 * @ns_cpfile: checkpoint file inode 80 * @ns_cpfile: checkpoint file inode
@@ -82,6 +83,7 @@ enum {
82 * @ns_gc_inodes: dummy inodes to keep live blocks 83 * @ns_gc_inodes: dummy inodes to keep live blocks
83 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks 84 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
84 * @ns_blocksize_bits: bit length of block size 85 * @ns_blocksize_bits: bit length of block size
86 * @ns_blocksize: block size
85 * @ns_nsegments: number of segments in filesystem 87 * @ns_nsegments: number of segments in filesystem
86 * @ns_blocks_per_segment: number of blocks per segment 88 * @ns_blocks_per_segment: number of blocks per segment
87 * @ns_r_segments_percentage: reserved segments percentage 89 * @ns_r_segments_percentage: reserved segments percentage
@@ -119,7 +121,8 @@ struct the_nilfs {
119 */ 121 */
120 struct buffer_head *ns_sbh[2]; 122 struct buffer_head *ns_sbh[2];
121 struct nilfs_super_block *ns_sbp[2]; 123 struct nilfs_super_block *ns_sbp[2];
122 time_t ns_sbwtime[2]; 124 time_t ns_sbwtime;
125 unsigned ns_sbwcount;
123 unsigned ns_sbsize; 126 unsigned ns_sbsize;
124 unsigned ns_mount_state; 127 unsigned ns_mount_state;
125 128
@@ -149,7 +152,7 @@ struct the_nilfs {
149 u64 ns_last_seq; 152 u64 ns_last_seq;
150 __u64 ns_last_cno; 153 __u64 ns_last_cno;
151 u64 ns_prot_seq; 154 u64 ns_prot_seq;
152 unsigned long ns_free_segments_count; 155 u64 ns_prev_seq;
153 156
154 struct rw_semaphore ns_segctor_sem; 157 struct rw_semaphore ns_segctor_sem;
155 158
@@ -168,6 +171,7 @@ struct the_nilfs {
168 171
169 /* Disk layout information (static) */ 172 /* Disk layout information (static) */
170 unsigned int ns_blocksize_bits; 173 unsigned int ns_blocksize_bits;
174 unsigned int ns_blocksize;
171 unsigned long ns_nsegments; 175 unsigned long ns_nsegments;
172 unsigned long ns_blocks_per_segment; 176 unsigned long ns_blocks_per_segment;
173 unsigned long ns_r_segments_percentage; 177 unsigned long ns_r_segments_percentage;
@@ -203,20 +207,17 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty)
203 207
204/* Minimum interval of periodical update of superblocks (in seconds) */ 208/* Minimum interval of periodical update of superblocks (in seconds) */
205#define NILFS_SB_FREQ 10 209#define NILFS_SB_FREQ 10
206#define NILFS_ALTSB_FREQ 60 /* spare superblock */
207 210
208static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) 211static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
209{ 212{
210 u64 t = get_seconds(); 213 u64 t = get_seconds();
211 return t < nilfs->ns_sbwtime[0] || 214 return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ;
212 t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ;
213} 215}
214 216
215static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs) 217static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
216{ 218{
217 u64 t = get_seconds(); 219 int flip_bits = nilfs->ns_sbwcount & 0x0FL;
218 struct nilfs_super_block **sbp = nilfs->ns_sbp; 220 return (flip_bits != 0x08 && flip_bits != 0x0F);
219 return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
220} 221}
221 222
222void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 223void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
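nilfs_sb_will_flip() replaces the time-based spare update: the target super block now alternates on every write, except when the low four bits of the write count are 0x08 or 0x0F. One reading of the skips is that they keep the two copies from always being exactly one write apart, so an older cursor stays available for rollback. A loop that prints the resulting write pattern:

#include <stdio.h>

static int will_flip(unsigned count)
{
	unsigned flip_bits = count & 0x0F;

	return flip_bits != 0x08 && flip_bits != 0x0F;
}

int main(void)
{
	int cur = 0;
	unsigned count;

	for (count = 0; count < 32; count++) {
		if (will_flip(count))
			cur ^= 1;	/* swap primary/spare before writing */
		printf("write %2u -> sb[%d]\n", count, cur);
	}
	return 0;
}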
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 625de9d7088c..9b57c0350ff9 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -760,13 +760,13 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
760 if (osb->osb_commit_interval) 760 if (osb->osb_commit_interval)
761 commit_interval = osb->osb_commit_interval; 761 commit_interval = osb->osb_commit_interval;
762 762
763 spin_lock(&journal->j_state_lock); 763 write_lock(&journal->j_state_lock);
764 journal->j_commit_interval = commit_interval; 764 journal->j_commit_interval = commit_interval;
765 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) 765 if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
766 journal->j_flags |= JBD2_BARRIER; 766 journal->j_flags |= JBD2_BARRIER;
767 else 767 else
768 journal->j_flags &= ~JBD2_BARRIER; 768 journal->j_flags &= ~JBD2_BARRIER;
769 spin_unlock(&journal->j_state_lock); 769 write_unlock(&journal->j_state_lock);
770} 770}
771 771
772int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) 772int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 5dcd4b0c5533..72c52656dc2e 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -459,7 +459,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
459 } 459 }
460 460
461 /* everything is up and running, commence */ 461 /* everything is up and running, commence */
462 INIT_RCU_HEAD(&p->rcu_head);
463 rcu_assign_pointer(ptbl->part[partno], p); 462 rcu_assign_pointer(ptbl->part[partno], p);
464 463
 465 /* suppress uevent if the disk suppresses it */ 464
diff --git a/fs/proc/base.c b/fs/proc/base.c
index acb7ef80ea4f..69254a365ce2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -63,6 +63,7 @@
63#include <linux/namei.h> 63#include <linux/namei.h>
64#include <linux/mnt_namespace.h> 64#include <linux/mnt_namespace.h>
65#include <linux/mm.h> 65#include <linux/mm.h>
66#include <linux/swap.h>
66#include <linux/rcupdate.h> 67#include <linux/rcupdate.h>
67#include <linux/kallsyms.h> 68#include <linux/kallsyms.h>
68#include <linux/stacktrace.h> 69#include <linux/stacktrace.h>
@@ -427,17 +428,14 @@ static const struct file_operations proc_lstats_operations = {
427 428
428#endif 429#endif
429 430
430/* The badness from the OOM killer */
431unsigned long badness(struct task_struct *p, unsigned long uptime);
432static int proc_oom_score(struct task_struct *task, char *buffer) 431static int proc_oom_score(struct task_struct *task, char *buffer)
433{ 432{
434 unsigned long points = 0; 433 unsigned long points = 0;
435 struct timespec uptime;
436 434
437 do_posix_clock_monotonic_gettime(&uptime);
438 read_lock(&tasklist_lock); 435 read_lock(&tasklist_lock);
439 if (pid_alive(task)) 436 if (pid_alive(task))
440 points = badness(task, uptime.tv_sec); 437 points = oom_badness(task, NULL, NULL,
438 totalram_pages + total_swap_pages);
441 read_unlock(&tasklist_lock); 439 read_unlock(&tasklist_lock);
442 return sprintf(buffer, "%lu\n", points); 440 return sprintf(buffer, "%lu\n", points);
443} 441}
@@ -1039,8 +1037,24 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1039 return -EACCES; 1037 return -EACCES;
1040 } 1038 }
1041 1039
1040 /*
1041 * Warn that /proc/pid/oom_adj is deprecated, see
1042 * Documentation/feature-removal-schedule.txt.
1043 */
1044 printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, "
1045 "please use /proc/%d/oom_score_adj instead.\n",
1046 current->comm, task_pid_nr(current),
1047 task_pid_nr(task), task_pid_nr(task));
1042 task->signal->oom_adj = oom_adjust; 1048 task->signal->oom_adj = oom_adjust;
1043 1049 /*
1050 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
1051 * value is always attainable.
1052 */
1053 if (task->signal->oom_adj == OOM_ADJUST_MAX)
1054 task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
1055 else
1056 task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
1057 -OOM_DISABLE;
1044 unlock_task_sighand(task, &flags); 1058 unlock_task_sighand(task, &flags);
1045 put_task_struct(task); 1059 put_task_struct(task);
1046 1060
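The write handler keeps the legacy oom_adj and the new oom_score_adj consistent by linear scaling, pinning the extremes so that OOM_ADJUST_MAX and OOM_SCORE_ADJ_MIN remain attainable. A worked userspace version of both directions, using the kernel constants of this era (oom_adj in [-16, 15] with OOM_DISABLE = -17; oom_score_adj in [-1000, 1000]):

#include <stdio.h>

#define OOM_DISABLE		(-17)
#define OOM_ADJUST_MAX		15
#define OOM_SCORE_ADJ_MIN	(-1000)
#define OOM_SCORE_ADJ_MAX	1000

static int adj_to_score_adj(int oom_adj)
{
	if (oom_adj == OOM_ADJUST_MAX)
		return OOM_SCORE_ADJ_MAX;	/* keep the maximum attainable */
	return oom_adj * OOM_SCORE_ADJ_MAX / -OOM_DISABLE;
}

static int score_adj_to_adj(int oom_score_adj)
{
	if (oom_score_adj == OOM_SCORE_ADJ_MIN)
		return OOM_DISABLE;		/* keep disable attainable */
	return oom_score_adj * OOM_ADJUST_MAX / OOM_SCORE_ADJ_MAX;
}

int main(void)
{
	/* oom_adj 8 -> 8 * 1000 / 17 = 470; back: 470 * 15 / 1000 = 7 */
	printf("%d %d\n", adj_to_score_adj(8), score_adj_to_adj(470));
	return 0;
}

The pinned endpoints are why the special cases exist: plain integer division would map oom_adj 15 to 882, leaving OOM_SCORE_ADJ_MAX unreachable.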
@@ -1053,6 +1067,82 @@ static const struct file_operations proc_oom_adjust_operations = {
1053 .llseek = generic_file_llseek, 1067 .llseek = generic_file_llseek,
1054}; 1068};
1055 1069
1070static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
1071 size_t count, loff_t *ppos)
1072{
1073 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
1074 char buffer[PROC_NUMBUF];
1075 int oom_score_adj = OOM_SCORE_ADJ_MIN;
1076 unsigned long flags;
1077 size_t len;
1078
1079 if (!task)
1080 return -ESRCH;
1081 if (lock_task_sighand(task, &flags)) {
1082 oom_score_adj = task->signal->oom_score_adj;
1083 unlock_task_sighand(task, &flags);
1084 }
1085 put_task_struct(task);
1086 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
1087 return simple_read_from_buffer(buf, count, ppos, buffer, len);
1088}
1089
1090static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1091 size_t count, loff_t *ppos)
1092{
1093 struct task_struct *task;
1094 char buffer[PROC_NUMBUF];
1095 unsigned long flags;
1096 long oom_score_adj;
1097 int err;
1098
1099 memset(buffer, 0, sizeof(buffer));
1100 if (count > sizeof(buffer) - 1)
1101 count = sizeof(buffer) - 1;
1102 if (copy_from_user(buffer, buf, count))
1103 return -EFAULT;
1104
1105 err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
1106 if (err)
1107 return -EINVAL;
1108 if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
1109 oom_score_adj > OOM_SCORE_ADJ_MAX)
1110 return -EINVAL;
1111
1112 task = get_proc_task(file->f_path.dentry->d_inode);
1113 if (!task)
1114 return -ESRCH;
1115 if (!lock_task_sighand(task, &flags)) {
1116 put_task_struct(task);
1117 return -ESRCH;
1118 }
1119 if (oom_score_adj < task->signal->oom_score_adj &&
1120 !capable(CAP_SYS_RESOURCE)) {
1121 unlock_task_sighand(task, &flags);
1122 put_task_struct(task);
1123 return -EACCES;
1124 }
1125
1126 task->signal->oom_score_adj = oom_score_adj;
1127 /*
1128 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
1129 * always attainable.
1130 */
1131 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1132 task->signal->oom_adj = OOM_DISABLE;
1133 else
1134 task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
1135 OOM_SCORE_ADJ_MAX;
1136 unlock_task_sighand(task, &flags);
1137 put_task_struct(task);
1138 return count;
1139}
1140
1141static const struct file_operations proc_oom_score_adj_operations = {
1142 .read = oom_score_adj_read,
1143 .write = oom_score_adj_write,
1144};
1145
1056#ifdef CONFIG_AUDITSYSCALL 1146#ifdef CONFIG_AUDITSYSCALL
1057#define TMPBUFLEN 21 1147#define TMPBUFLEN 21
1058static ssize_t proc_loginuid_read(struct file * file, char __user * buf, 1148static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
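oom_score_adj_write() follows the usual proc-write pattern: bounded copy into a small stack buffer, strict integer parse, range check, then apply under the sighand lock. A userspace sketch of the parse-and-validate half, with strtol() standing in for the kernel's strict_strtol():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_score_adj(const char *input, long *out)
{
	char buffer[16] = { 0 };
	char *end;
	long v;

	strncpy(buffer, input, sizeof(buffer) - 1);	/* bounded copy */
	errno = 0;
	v = strtol(buffer, &end, 0);
	if (errno || end == buffer || (*end && *end != '\n'))
		return -EINVAL;
	if (v < -1000 || v > 1000)
		return -EINVAL;				/* range check */
	*out = v;
	return 0;
}

int main(void)
{
	long v;

	printf("%d\n", parse_score_adj("500\n", &v));	/* 0 */
	printf("%d\n", parse_score_adj("9999", &v));	/* -22 (-EINVAL) */
	return 0;
}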
@@ -2625,6 +2715,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2625#endif 2715#endif
2626 INF("oom_score", S_IRUGO, proc_oom_score), 2716 INF("oom_score", S_IRUGO, proc_oom_score),
2627 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), 2717 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
2718 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2628#ifdef CONFIG_AUDITSYSCALL 2719#ifdef CONFIG_AUDITSYSCALL
2629 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 2720 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
2630 REG("sessionid", S_IRUGO, proc_sessionid_operations), 2721 REG("sessionid", S_IRUGO, proc_sessionid_operations),
@@ -2959,6 +3050,7 @@ static const struct pid_entry tid_base_stuff[] = {
2959#endif 3050#endif
2960 INF("oom_score", S_IRUGO, proc_oom_score), 3051 INF("oom_score", S_IRUGO, proc_oom_score),
2961 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), 3052 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
3053 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2962#ifdef CONFIG_AUDITSYSCALL 3054#ifdef CONFIG_AUDITSYSCALL
2963 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 3055 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
2964 REG("sessionid", S_IRUSR, proc_sessionid_operations), 3056 REG("sessionid", S_IRUSR, proc_sessionid_operations),
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 437d2ca2de97..ef72b1699429 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -132,6 +132,22 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
132__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); 132__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
133EXPORT_SYMBOL(dq_data_lock); 133EXPORT_SYMBOL(dq_data_lock);
134 134
135void __quota_error(struct super_block *sb, const char *func,
136 const char *fmt, ...)
137{
138 va_list args;
139
140 if (printk_ratelimit()) {
141 va_start(args, fmt);
142 printk(KERN_ERR "Quota error (device %s): %s: ",
143 sb->s_id, func);
144 vprintk(fmt, args);
145 printk("\n");
146 va_end(args);
147 }
148}
149EXPORT_SYMBOL(__quota_error);
150
135#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING) 151#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
136static char *quotatypes[] = INITQFNAMES; 152static char *quotatypes[] = INITQFNAMES;
137#endif 153#endif
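__quota_error() centralizes the scattered quota printk()s: one varargs helper that prefixes the device and calling function and drops messages under printk_ratelimit(). A userspace sketch of the same shape, with a crude once-per-second limiter standing in for the kernel's:

#include <stdarg.h>
#include <stdio.h>
#include <time.h>

static int ratelimit(void)
{
	static time_t last;
	time_t now = time(NULL);

	if (now == last)
		return 0;	/* drop messages within the same second */
	last = now;
	return 1;
}

static void quota_error(const char *dev, const char *func,
			const char *fmt, ...)
{
	va_list args;

	if (!ratelimit())
		return;
	va_start(args, fmt);
	fprintf(stderr, "Quota error (device %s): %s: ", dev, func);
	vfprintf(stderr, fmt, args);
	fputc('\n', stderr);
	va_end(args);
}

int main(void)
{
	quota_error("sda1", __func__, "Can't write quota structure"
		    " (error %d)", -5);
	return 0;
}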
@@ -705,11 +721,8 @@ void dqput(struct dquot *dquot)
705 return; 721 return;
706#ifdef CONFIG_QUOTA_DEBUG 722#ifdef CONFIG_QUOTA_DEBUG
707 if (!atomic_read(&dquot->dq_count)) { 723 if (!atomic_read(&dquot->dq_count)) {
708 printk("VFS: dqput: trying to free free dquot\n"); 724 quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
709 printk("VFS: device %s, dquot of %s %d\n", 725 quotatypes[dquot->dq_type], dquot->dq_id);
710 dquot->dq_sb->s_id,
711 quotatypes[dquot->dq_type],
712 dquot->dq_id);
713 BUG(); 726 BUG();
714 } 727 }
715#endif 728#endif
@@ -732,9 +745,9 @@ we_slept:
732 /* Commit dquot before releasing */ 745 /* Commit dquot before releasing */
733 ret = dquot->dq_sb->dq_op->write_dquot(dquot); 746 ret = dquot->dq_sb->dq_op->write_dquot(dquot);
734 if (ret < 0) { 747 if (ret < 0) {
735 printk(KERN_ERR "VFS: cannot write quota structure on " 748 quota_error(dquot->dq_sb, "Can't write quota structure"
736 "device %s (error %d). Quota may get out of " 749 " (error %d). Quota may get out of sync!",
737 "sync!\n", dquot->dq_sb->s_id, ret); 750 ret);
738 /* 751 /*
739 * We clear dirty bit anyway, so that we avoid 752 * We clear dirty bit anyway, so that we avoid
740 * infinite loop here 753 * infinite loop here
@@ -914,9 +927,9 @@ static void add_dquot_ref(struct super_block *sb, int type)
914 927
915#ifdef CONFIG_QUOTA_DEBUG 928#ifdef CONFIG_QUOTA_DEBUG
916 if (reserved) { 929 if (reserved) {
917 printk(KERN_WARNING "VFS (%s): Writes happened before quota" 930 quota_error(sb, "Writes happened before quota was turned on "
918 " was turned on thus quota information is probably " 931 "thus quota information is probably inconsistent. "
919 "inconsistent. Please run quotacheck(8).\n", sb->s_id); 932 "Please run quotacheck(8)");
920 } 933 }
921#endif 934#endif
922} 935}
@@ -947,7 +960,9 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
947 if (dqput_blocks(dquot)) { 960 if (dqput_blocks(dquot)) {
948#ifdef CONFIG_QUOTA_DEBUG 961#ifdef CONFIG_QUOTA_DEBUG
949 if (atomic_read(&dquot->dq_count) != 1) 962 if (atomic_read(&dquot->dq_count) != 1)
950 printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count)); 963 quota_error(inode->i_sb, "Adding dquot with "
964 "dq_count %d to dispose list",
965 atomic_read(&dquot->dq_count));
951#endif 966#endif
952 spin_lock(&dq_list_lock); 967 spin_lock(&dq_list_lock);
953 /* As dquot must have currently users it can't be on 968 /* As dquot must have currently users it can't be on
@@ -986,6 +1001,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
986 struct list_head *tofree_head) 1001 struct list_head *tofree_head)
987{ 1002{
988 struct inode *inode; 1003 struct inode *inode;
1004 int reserved = 0;
989 1005
990 spin_lock(&inode_lock); 1006 spin_lock(&inode_lock);
991 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1007 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
@@ -995,10 +1011,20 @@ static void remove_dquot_ref(struct super_block *sb, int type,
995 * only quota pointers and these have separate locking 1011 * only quota pointers and these have separate locking
996 * (dqptr_sem). 1012 * (dqptr_sem).
997 */ 1013 */
998 if (!IS_NOQUOTA(inode)) 1014 if (!IS_NOQUOTA(inode)) {
1015 if (unlikely(inode_get_rsv_space(inode) > 0))
1016 reserved = 1;
999 remove_inode_dquot_ref(inode, type, tofree_head); 1017 remove_inode_dquot_ref(inode, type, tofree_head);
1018 }
1000 } 1019 }
1001 spin_unlock(&inode_lock); 1020 spin_unlock(&inode_lock);
1021#ifdef CONFIG_QUOTA_DEBUG
1022 if (reserved) {
1023 printk(KERN_WARNING "VFS (%s): Writes happened after quota"
1024 " was disabled thus quota information is probably "
1025 "inconsistent. Please run quotacheck(8).\n", sb->s_id);
1026 }
1027#endif
1002} 1028}
1003 1029
1004/* Gather all references from inodes and drop them */ 1030/* Gather all references from inodes and drop them */
@@ -1304,6 +1330,15 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
1304 return QUOTA_NL_NOWARN; 1330 return QUOTA_NL_NOWARN;
1305} 1331}
1306 1332
1333static int dquot_active(const struct inode *inode)
1334{
1335 struct super_block *sb = inode->i_sb;
1336
1337 if (IS_NOQUOTA(inode))
1338 return 0;
1339 return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb);
1340}
1341
1307/* 1342/*
1308 * Initialize quota pointers in inode 1343 * Initialize quota pointers in inode
1309 * 1344 *
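dquot_active() folds the repeated "quota loaded, not suspended, inode not NOQUOTA" test into one predicate. Since sb_any_quota_loaded() and sb_any_quota_suspended() return per-type bitmasks, the test is a bitwise AND-NOT. A sketch with illustrative flag values:

#include <stdio.h>

#define QUOTA_USR 0x1
#define QUOTA_GRP 0x2

struct sbq { unsigned loaded, suspended, noquota_inode; };

static unsigned quota_active(const struct sbq *q)
{
	if (q->noquota_inode)
		return 0;		/* mirrors the IS_NOQUOTA() check */
	return q->loaded & ~q->suspended;
}

int main(void)
{
	struct sbq q = { .loaded = QUOTA_USR | QUOTA_GRP,
			 .suspended = QUOTA_GRP };

	printf("active=%#x\n", quota_active(&q));	/* active=0x1 */
	return 0;
}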
@@ -1323,7 +1358,7 @@ static void __dquot_initialize(struct inode *inode, int type)
1323 1358
1324 /* First test before acquiring mutex - solves deadlocks when we 1359 /* First test before acquiring mutex - solves deadlocks when we
1325 * re-enter the quota code and are already holding the mutex */ 1360 * re-enter the quota code and are already holding the mutex */
1326 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) 1361 if (!dquot_active(inode))
1327 return; 1362 return;
1328 1363
1329 /* First get references to structures we might need. */ 1364 /* First get references to structures we might need. */
@@ -1507,7 +1542,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1507 * First test before acquiring mutex - solves deadlocks when we 1542 * First test before acquiring mutex - solves deadlocks when we
1508 * re-enter the quota code and are already holding the mutex 1543 * re-enter the quota code and are already holding the mutex
1509 */ 1544 */
1510 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { 1545 if (!dquot_active(inode)) {
1511 inode_incr_space(inode, number, reserve); 1546 inode_incr_space(inode, number, reserve);
1512 goto out; 1547 goto out;
1513 } 1548 }
@@ -1559,7 +1594,7 @@ int dquot_alloc_inode(const struct inode *inode)
1559 1594
1560 /* First test before acquiring mutex - solves deadlocks when we 1595 /* First test before acquiring mutex - solves deadlocks when we
1561 * re-enter the quota code and are already holding the mutex */ 1596 * re-enter the quota code and are already holding the mutex */
1562 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) 1597 if (!dquot_active(inode))
1563 return 0; 1598 return 0;
1564 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1599 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1565 warntype[cnt] = QUOTA_NL_NOWARN; 1600 warntype[cnt] = QUOTA_NL_NOWARN;
@@ -1596,7 +1631,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
1596{ 1631{
1597 int cnt; 1632 int cnt;
1598 1633
1599 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { 1634 if (!dquot_active(inode)) {
1600 inode_claim_rsv_space(inode, number); 1635 inode_claim_rsv_space(inode, number);
1601 return 0; 1636 return 0;
1602 } 1637 }
@@ -1629,7 +1664,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1629 1664
1630 /* First test before acquiring mutex - solves deadlocks when we 1665 /* First test before acquiring mutex - solves deadlocks when we
1631 * re-enter the quota code and are already holding the mutex */ 1666 * re-enter the quota code and are already holding the mutex */
1632 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { 1667 if (!dquot_active(inode)) {
1633 inode_decr_space(inode, number, reserve); 1668 inode_decr_space(inode, number, reserve);
1634 return; 1669 return;
1635 } 1670 }
@@ -1667,7 +1702,7 @@ void dquot_free_inode(const struct inode *inode)
1667 1702
1668 /* First test before acquiring mutex - solves deadlocks when we 1703 /* First test before acquiring mutex - solves deadlocks when we
1669 * re-enter the quota code and are already holding the mutex */ 1704 * re-enter the quota code and are already holding the mutex */
1670 if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) 1705 if (!dquot_active(inode))
1671 return; 1706 return;
1672 1707
1673 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1708 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1790,7 +1825,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1790 struct super_block *sb = inode->i_sb; 1825 struct super_block *sb = inode->i_sb;
1791 int ret; 1826 int ret;
1792 1827
1793 if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode)) 1828 if (!dquot_active(inode))
1794 return 0; 1829 return 0;
1795 1830
1796 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) 1831 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
@@ -1957,7 +1992,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
1957 truncate_inode_pages(&toputinode[cnt]->i_data, 1992 truncate_inode_pages(&toputinode[cnt]->i_data,
1958 0); 1993 0);
1959 mutex_unlock(&toputinode[cnt]->i_mutex); 1994 mutex_unlock(&toputinode[cnt]->i_mutex);
1960 mark_inode_dirty(toputinode[cnt]); 1995 mark_inode_dirty_sync(toputinode[cnt]);
1961 } 1996 }
1962 mutex_unlock(&dqopt->dqonoff_mutex); 1997 mutex_unlock(&dqopt->dqonoff_mutex);
1963 } 1998 }
@@ -2270,7 +2305,7 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2270 memset(di, 0, sizeof(*di)); 2305 memset(di, 0, sizeof(*di));
2271 di->d_version = FS_DQUOT_VERSION; 2306 di->d_version = FS_DQUOT_VERSION;
2272 di->d_flags = dquot->dq_type == USRQUOTA ? 2307 di->d_flags = dquot->dq_type == USRQUOTA ?
2273 XFS_USER_QUOTA : XFS_GROUP_QUOTA; 2308 FS_USER_QUOTA : FS_GROUP_QUOTA;
2274 di->d_id = dquot->dq_id; 2309 di->d_id = dquot->dq_id;
2275 2310
2276 spin_lock(&dq_data_lock); 2311 spin_lock(&dq_data_lock);
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 24f03407eeb5..9e48874eabcc 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -65,8 +65,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
65 ret = sb->s_op->quota_write(sb, info->dqi_type, buf, 65 ret = sb->s_op->quota_write(sb, info->dqi_type, buf,
66 info->dqi_usable_bs, blk << info->dqi_blocksize_bits); 66 info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
67 if (ret != info->dqi_usable_bs) { 67 if (ret != info->dqi_usable_bs) {
68 q_warn(KERN_WARNING "VFS: dquota write failed on " 68 quota_error(sb, "dquota write failed");
69 "dev %s\n", sb->s_id);
70 if (ret >= 0) 69 if (ret >= 0)
71 ret = -EIO; 70 ret = -EIO;
72 } 71 }
@@ -160,9 +159,8 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
160 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); 159 dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
161 /* No matter whether write succeeds block is out of list */ 160 /* No matter whether write succeeds block is out of list */
162 if (write_blk(info, blk, buf) < 0) 161 if (write_blk(info, blk, buf) < 0)
163 q_warn(KERN_ERR 162 quota_error(info->dqi_sb, "Can't write block (%u) "
164 "VFS: Can't write block (%u) with free entries.\n", 163 "with free entries", blk);
165 blk);
166 return 0; 164 return 0;
167out_buf: 165out_buf:
168 kfree(tmpbuf); 166 kfree(tmpbuf);
@@ -252,9 +250,8 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
252 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { 250 if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
253 *err = remove_free_dqentry(info, buf, blk); 251 *err = remove_free_dqentry(info, buf, blk);
254 if (*err < 0) { 252 if (*err < 0) {
255 q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't " 253 quota_error(dquot->dq_sb, "Can't remove block (%u) "
256 "remove block (%u) from entry free list.\n", 254 "from entry free list", blk);
257 blk);
258 goto out_buf; 255 goto out_buf;
259 } 256 }
260 } 257 }
@@ -268,16 +265,15 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
268 } 265 }
269#ifdef __QUOTA_QT_PARANOIA 266#ifdef __QUOTA_QT_PARANOIA
270 if (i == qtree_dqstr_in_blk(info)) { 267 if (i == qtree_dqstr_in_blk(info)) {
271 printk(KERN_ERR "VFS: find_free_dqentry(): Data block full " 268 quota_error(dquot->dq_sb, "Data block full but it shouldn't");
272 "but it shouldn't.\n");
273 *err = -EIO; 269 *err = -EIO;
274 goto out_buf; 270 goto out_buf;
275 } 271 }
276#endif 272#endif
277 *err = write_blk(info, blk, buf); 273 *err = write_blk(info, blk, buf);
278 if (*err < 0) { 274 if (*err < 0) {
279 q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " 275 quota_error(dquot->dq_sb, "Can't write quota data block %u",
280 "data block %u.\n", blk); 276 blk);
281 goto out_buf; 277 goto out_buf;
282 } 278 }
283 dquot->dq_off = (blk << info->dqi_blocksize_bits) + 279 dquot->dq_off = (blk << info->dqi_blocksize_bits) +
@@ -311,8 +307,8 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
311 } else { 307 } else {
312 ret = read_blk(info, *treeblk, buf); 308 ret = read_blk(info, *treeblk, buf);
313 if (ret < 0) { 309 if (ret < 0) {
314 q_warn(KERN_ERR "VFS: Can't read tree quota block " 310 quota_error(dquot->dq_sb, "Can't read tree quota "
315 "%u.\n", *treeblk); 311 "block %u", *treeblk);
316 goto out_buf; 312 goto out_buf;
317 } 313 }
318 } 314 }
@@ -323,9 +319,9 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
323 if (depth == info->dqi_qtree_depth - 1) { 319 if (depth == info->dqi_qtree_depth - 1) {
324#ifdef __QUOTA_QT_PARANOIA 320#ifdef __QUOTA_QT_PARANOIA
325 if (newblk) { 321 if (newblk) {
326 printk(KERN_ERR "VFS: Inserting already present quota " 322 quota_error(dquot->dq_sb, "Inserting already present "
327 "entry (block %u).\n", 323 "quota entry (block %u)",
328 le32_to_cpu(ref[get_index(info, 324 le32_to_cpu(ref[get_index(info,
329 dquot->dq_id, depth)])); 325 dquot->dq_id, depth)]));
330 ret = -EIO; 326 ret = -EIO;
331 goto out_buf; 327 goto out_buf;
@@ -373,8 +369,8 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
373 if (!dquot->dq_off) { 369 if (!dquot->dq_off) {
374 ret = dq_insert_tree(info, dquot); 370 ret = dq_insert_tree(info, dquot);
375 if (ret < 0) { 371 if (ret < 0) {
376 q_warn(KERN_ERR "VFS: Error %zd occurred while " 372 quota_error(sb, "Error %zd occurred while creating "
377 "creating quota.\n", ret); 373 "quota", ret);
378 kfree(ddquot); 374 kfree(ddquot);
379 return ret; 375 return ret;
380 } 376 }
@@ -385,8 +381,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
385 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, 381 ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
386 dquot->dq_off); 382 dquot->dq_off);
387 if (ret != info->dqi_entry_size) { 383 if (ret != info->dqi_entry_size) {
388 q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n", 384 quota_error(sb, "dquota write failed");
389 sb->s_id);
390 if (ret >= 0) 385 if (ret >= 0)
391 ret = -ENOSPC; 386 ret = -ENOSPC;
392 } else { 387 } else {
@@ -410,14 +405,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
410 if (!buf) 405 if (!buf)
411 return -ENOMEM; 406 return -ENOMEM;
412 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { 407 if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
413 q_warn(KERN_ERR "VFS: Quota structure has offset to other " 408 quota_error(dquot->dq_sb, "Quota structure has offset to "
414 "block (%u) than it should (%u).\n", blk, 409 "other block (%u) than it should (%u)", blk,
415 (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); 410 (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
416 goto out_buf; 411 goto out_buf;
417 } 412 }
418 ret = read_blk(info, blk, buf); 413 ret = read_blk(info, blk, buf);
419 if (ret < 0) { 414 if (ret < 0) {
420 q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk); 415 quota_error(dquot->dq_sb, "Can't read quota data block %u",
416 blk);
421 goto out_buf; 417 goto out_buf;
422 } 418 }
423 dh = (struct qt_disk_dqdbheader *)buf; 419 dh = (struct qt_disk_dqdbheader *)buf;
@@ -427,8 +423,8 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
427 if (ret >= 0) 423 if (ret >= 0)
428 ret = put_free_dqblk(info, buf, blk); 424 ret = put_free_dqblk(info, buf, blk);
429 if (ret < 0) { 425 if (ret < 0) {
430 q_warn(KERN_ERR "VFS: Can't move quota data block (%u) " 426 quota_error(dquot->dq_sb, "Can't move quota data block "
431 "to free list.\n", blk); 427 "(%u) to free list", blk);
432 goto out_buf; 428 goto out_buf;
433 } 429 }
434 } else { 430 } else {
@@ -440,15 +436,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
440 /* Insert will write block itself */ 436 /* Insert will write block itself */
441 ret = insert_free_dqentry(info, buf, blk); 437 ret = insert_free_dqentry(info, buf, blk);
442 if (ret < 0) { 438 if (ret < 0) {
443 q_warn(KERN_ERR "VFS: Can't insert quota data " 439 quota_error(dquot->dq_sb, "Can't insert quota "
444 "block (%u) to free entry list.\n", blk); 440 "data block (%u) to free entry list", blk);
445 goto out_buf; 441 goto out_buf;
446 } 442 }
447 } else { 443 } else {
448 ret = write_blk(info, blk, buf); 444 ret = write_blk(info, blk, buf);
449 if (ret < 0) { 445 if (ret < 0) {
450 q_warn(KERN_ERR "VFS: Can't write quota data " 446 quota_error(dquot->dq_sb, "Can't write quota "
451 "block %u\n", blk); 447 "data block %u", blk);
452 goto out_buf; 448 goto out_buf;
453 } 449 }
454 } 450 }
@@ -472,7 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
472 return -ENOMEM; 468 return -ENOMEM;
473 ret = read_blk(info, *blk, buf); 469 ret = read_blk(info, *blk, buf);
474 if (ret < 0) { 470 if (ret < 0) {
475 q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); 471 quota_error(dquot->dq_sb, "Can't read quota data "
472 "block %u", blk);
476 goto out_buf; 473 goto out_buf;
477 } 474 }
478 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); 475 newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -496,8 +493,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
496 } else { 493 } else {
497 ret = write_blk(info, *blk, buf); 494 ret = write_blk(info, *blk, buf);
498 if (ret < 0) 495 if (ret < 0)
499 q_warn(KERN_ERR "VFS: Can't write quota tree " 496 quota_error(dquot->dq_sb, "Can't write quota "
500 "block %u.\n", *blk); 497 "tree block %u", blk);
501 } 498 }
502 } 499 }
503out_buf: 500out_buf:
@@ -529,7 +526,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
529 return -ENOMEM; 526 return -ENOMEM;
530 ret = read_blk(info, blk, buf); 527 ret = read_blk(info, blk, buf);
531 if (ret < 0) { 528 if (ret < 0) {
532 q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 529 quota_error(dquot->dq_sb, "Can't read quota tree "
530 "block %u", blk);
533 goto out_buf; 531 goto out_buf;
534 } 532 }
535 ddquot = buf + sizeof(struct qt_disk_dqdbheader); 533 ddquot = buf + sizeof(struct qt_disk_dqdbheader);
@@ -539,8 +537,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
539 ddquot += info->dqi_entry_size; 537 ddquot += info->dqi_entry_size;
540 } 538 }
541 if (i == qtree_dqstr_in_blk(info)) { 539 if (i == qtree_dqstr_in_blk(info)) {
542 q_warn(KERN_ERR "VFS: Quota for id %u referenced " 540 quota_error(dquot->dq_sb, "Quota for id %u referenced "
543 "but not present.\n", dquot->dq_id); 541 "but not present", dquot->dq_id);
544 ret = -EIO; 542 ret = -EIO;
545 goto out_buf; 543 goto out_buf;
546 } else { 544 } else {
@@ -564,7 +562,8 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
564 return -ENOMEM; 562 return -ENOMEM;
565 ret = read_blk(info, blk, buf); 563 ret = read_blk(info, blk, buf);
566 if (ret < 0) { 564 if (ret < 0) {
567 q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); 565 quota_error(dquot->dq_sb, "Can't read quota tree block %u",
566 blk);
568 goto out_buf; 567 goto out_buf;
569 } 568 }
570 ret = 0; 569 ret = 0;
@@ -598,7 +597,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
598#ifdef __QUOTA_QT_PARANOIA 597#ifdef __QUOTA_QT_PARANOIA
599 /* Invalidated quota? */ 598 /* Invalidated quota? */
600 if (!sb_dqopt(dquot->dq_sb)->files[type]) { 599 if (!sb_dqopt(dquot->dq_sb)->files[type]) {
601 printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); 600 quota_error(sb, "Quota invalidated while reading!");
602 return -EIO; 601 return -EIO;
603 } 602 }
604#endif 603#endif
@@ -607,8 +606,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
607 offset = find_dqentry(info, dquot); 606 offset = find_dqentry(info, dquot);
608 if (offset <= 0) { /* Entry not present? */ 607 if (offset <= 0) { /* Entry not present? */
609 if (offset < 0) 608 if (offset < 0)
610 q_warn(KERN_ERR "VFS: Can't read quota " 609 quota_error(sb, "Can't read quota structure "
611 "structure for id %u.\n", dquot->dq_id); 610 "for id %u", dquot->dq_id);
612 dquot->dq_off = 0; 611 dquot->dq_off = 0;
613 set_bit(DQ_FAKE_B, &dquot->dq_flags); 612 set_bit(DQ_FAKE_B, &dquot->dq_flags);
614 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); 613 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
@@ -625,8 +624,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
625 if (ret != info->dqi_entry_size) { 624 if (ret != info->dqi_entry_size) {
626 if (ret >= 0) 625 if (ret >= 0)
627 ret = -EIO; 626 ret = -EIO;
628 q_warn(KERN_ERR "VFS: Error while reading quota " 627 quota_error(sb, "Error while reading quota structure for id %u",
629 "structure for id %u.\n", dquot->dq_id); 628 dquot->dq_id);
630 set_bit(DQ_FAKE_B, &dquot->dq_flags); 629 set_bit(DQ_FAKE_B, &dquot->dq_flags);
631 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); 630 memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
632 kfree(ddquot); 631 kfree(ddquot);
diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h
index ccc3e71fb1d8..a1ab8db81a51 100644
--- a/fs/quota/quota_tree.h
+++ b/fs/quota/quota_tree.h
@@ -22,10 +22,4 @@ struct qt_disk_dqdbheader {
22 22
23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */ 23#define QT_TREEOFF 1 /* Offset of tree in file in blocks */
24 24
25#define q_warn(fmt, args...) \
26do { \
27 if (printk_ratelimit()) \
28 printk(fmt, ## args); \
29} while(0)
30
31#endif /* _LINUX_QUOTAIO_TREE_H */ 25#endif /* _LINUX_QUOTAIO_TREE_H */
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 4af344c5852a..34b37a67bb16 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -95,8 +95,7 @@ static int v1_commit_dqblk(struct dquot *dquot)
95 (char *)&dqblk, sizeof(struct v1_disk_dqblk), 95 (char *)&dqblk, sizeof(struct v1_disk_dqblk),
96 v1_dqoff(dquot->dq_id)); 96 v1_dqoff(dquot->dq_id));
97 if (ret != sizeof(struct v1_disk_dqblk)) { 97 if (ret != sizeof(struct v1_disk_dqblk)) {
98 printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", 98 quota_error(dquot->dq_sb, "dquota write failed");
99 dquot->dq_sb->s_id);
100 if (ret >= 0) 99 if (ret >= 0)
101 ret = -EIO; 100 ret = -EIO;
102 goto out; 101 goto out;
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 135206af1458..65444d29406b 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -63,9 +63,8 @@ static int v2_read_header(struct super_block *sb, int type,
63 size = sb->s_op->quota_read(sb, type, (char *)dqhead, 63 size = sb->s_op->quota_read(sb, type, (char *)dqhead,
64 sizeof(struct v2_disk_dqheader), 0); 64 sizeof(struct v2_disk_dqheader), 0);
65 if (size != sizeof(struct v2_disk_dqheader)) { 65 if (size != sizeof(struct v2_disk_dqheader)) {
66 q_warn(KERN_WARNING "quota_v2: Failed header read:" 66 quota_error(sb, "Failed header read: expected=%zd got=%zd",
67 " expected=%zd got=%zd\n", 67 sizeof(struct v2_disk_dqheader), size);
68 sizeof(struct v2_disk_dqheader), size);
69 return 0; 68 return 0;
70 } 69 }
71 return 1; 70 return 1;
@@ -106,8 +105,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
106 size = sb->s_op->quota_read(sb, type, (char *)&dinfo, 105 size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
107 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 106 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
108 if (size != sizeof(struct v2_disk_dqinfo)) { 107 if (size != sizeof(struct v2_disk_dqinfo)) {
109 q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", 108 quota_error(sb, "Can't read info structure");
110 sb->s_id);
111 return -1; 109 return -1;
112 } 110 }
113 info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS); 111 info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
@@ -167,8 +165,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
167 size = sb->s_op->quota_write(sb, type, (char *)&dinfo, 165 size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
168 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); 166 sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
169 if (size != sizeof(struct v2_disk_dqinfo)) { 167 if (size != sizeof(struct v2_disk_dqinfo)) {
170 q_warn(KERN_WARNING "Can't write info structure on device %s.\n", 168 quota_error(sb, "Can't write info structure");
171 sb->s_id);
172 return -1; 169 return -1;
173 } 170 }
174 return 0; 171 return 0;
diff --git a/fs/readdir.c b/fs/readdir.c
index 7723401f8d8b..356f71528ad6 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -4,6 +4,7 @@
4 * Copyright (C) 1995 Linus Torvalds 4 * Copyright (C) 1995 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/stddef.h>
7#include <linux/kernel.h> 8#include <linux/kernel.h>
8#include <linux/module.h> 9#include <linux/module.h>
9#include <linux/time.h> 10#include <linux/time.h>
@@ -54,7 +55,6 @@ EXPORT_SYMBOL(vfs_readdir);
54 * anyway. Thus the special "fillonedir()" function for that 55 * anyway. Thus the special "fillonedir()" function for that
55 * case (the low-level handlers don't need to care about this). 56 * case (the low-level handlers don't need to care about this).
56 */ 57 */
57#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de)))
58 58
59#ifdef __ARCH_WANT_OLD_READDIR 59#ifdef __ARCH_WANT_OLD_READDIR
60 60
@@ -152,7 +152,8 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset,
152 struct linux_dirent __user * dirent; 152 struct linux_dirent __user * dirent;
153 struct getdents_callback * buf = (struct getdents_callback *) __buf; 153 struct getdents_callback * buf = (struct getdents_callback *) __buf;
154 unsigned long d_ino; 154 unsigned long d_ino;
155 int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(long)); 155 int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
156 sizeof(long));
156 157
157 buf->error = -EINVAL; /* only used if we fail.. */ 158 buf->error = -EINVAL; /* only used if we fail.. */
158 if (reclen > buf->count) 159 if (reclen > buf->count)
@@ -237,7 +238,8 @@ static int filldir64(void * __buf, const char * name, int namlen, loff_t offset,
237{ 238{
238 struct linux_dirent64 __user *dirent; 239 struct linux_dirent64 __user *dirent;
239 struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf; 240 struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf;
240 int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 1, sizeof(u64)); 241 int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
242 sizeof(u64));
241 243
242 buf->error = -EINVAL; /* only used if we fail.. */ 244 buf->error = -EINVAL; /* only used if we fail.. */
243 if (reclen > buf->count) 245 if (reclen > buf->count)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1beaa739d0a6..1b27b5688f62 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -593,7 +593,8 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
593 * @mode: file permissions. 593 * @mode: file permissions.
594 * 594 *
595 */ 595 */
596int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) 596int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
597 mode_t mode)
597{ 598{
598 struct sysfs_dirent *sd; 599 struct sysfs_dirent *sd;
599 struct iattr newattrs; 600 struct iattr newattrs;
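
With the const qualifier added to sysfs_chmod_file(), callers can pass attributes declared in read-only tables. A hedged usage sketch (the kobject and attribute names are hypothetical, not from this patch):

#include <linux/kobject.h>
#include <linux/sysfs.h>

static const struct attribute foo_attr = {
	.name = "foo",
	.mode = 0644,
};

/* e.g. after the "foo" file has been created under foo_kobj: */
static int foo_make_readonly(struct kobject *foo_kobj)
{
	return sysfs_chmod_file(foo_kobj, &foo_attr, 0444);
}
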
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 94e06d6bddbd..6e450e01a1bb 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -36,7 +36,6 @@
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/aio.h> 38#include <linux/aio.h>
39#include <linux/smp_lock.h>
40 39
41#include "udf_i.h" 40#include "udf_i.h"
42#include "udf_sb.h" 41#include "udf_sb.h"
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 612d1e2e285a..12bb651e5400 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1579,9 +1579,7 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh,
1579{ 1579{
1580 struct anchorVolDescPtr *anchor; 1580 struct anchorVolDescPtr *anchor;
1581 long main_s, main_e, reserve_s, reserve_e; 1581 long main_s, main_e, reserve_s, reserve_e;
1582 struct udf_sb_info *sbi;
1583 1582
1584 sbi = UDF_SB(sb);
1585 anchor = (struct anchorVolDescPtr *)bh->b_data; 1583 anchor = (struct anchorVolDescPtr *)bh->b_data;
1586 1584
1587 /* Locate the main sequence */ 1585 /* Locate the main sequence */
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index bfd5ac9d1f6f..29b9d642e93d 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -68,15 +68,15 @@ xfs_fs_set_xstate(
68 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) 68 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
69 return -ENOSYS; 69 return -ENOSYS;
70 70
71 if (uflags & XFS_QUOTA_UDQ_ACCT) 71 if (uflags & FS_QUOTA_UDQ_ACCT)
72 flags |= XFS_UQUOTA_ACCT; 72 flags |= XFS_UQUOTA_ACCT;
73 if (uflags & XFS_QUOTA_PDQ_ACCT) 73 if (uflags & FS_QUOTA_PDQ_ACCT)
74 flags |= XFS_PQUOTA_ACCT; 74 flags |= XFS_PQUOTA_ACCT;
75 if (uflags & XFS_QUOTA_GDQ_ACCT) 75 if (uflags & FS_QUOTA_GDQ_ACCT)
76 flags |= XFS_GQUOTA_ACCT; 76 flags |= XFS_GQUOTA_ACCT;
77 if (uflags & XFS_QUOTA_UDQ_ENFD) 77 if (uflags & FS_QUOTA_UDQ_ENFD)
78 flags |= XFS_UQUOTA_ENFD; 78 flags |= XFS_UQUOTA_ENFD;
79 if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD)) 79 if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
80 flags |= XFS_OQUOTA_ENFD; 80 flags |= XFS_OQUOTA_ENFD;
81 81
82 switch (op) { 82 switch (op) {
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index d257eb8557c4..45e5849df238 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -810,9 +810,9 @@ xfs_qm_export_dquot(
810 } 810 }
811 811
812#ifdef DEBUG 812#ifdef DEBUG
813 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == XFS_USER_QUOTA) || 813 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
814 (XFS_IS_OQUOTA_ENFORCED(mp) && 814 (XFS_IS_OQUOTA_ENFORCED(mp) &&
815 (dst->d_flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)))) && 815 (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
816 dst->d_id != 0) { 816 dst->d_id != 0) {
817 if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && 817 if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
818 (dst->d_blk_softlimit > 0)) { 818 (dst->d_blk_softlimit > 0)) {
@@ -833,17 +833,17 @@ xfs_qm_export_qtype_flags(
833 /* 833 /*
834 * Can't be more than one, or none. 834 * Can't be more than one, or none.
835 */ 835 */
836 ASSERT((flags & (XFS_PROJ_QUOTA | XFS_USER_QUOTA)) != 836 ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) !=
837 (XFS_PROJ_QUOTA | XFS_USER_QUOTA)); 837 (FS_PROJ_QUOTA | FS_USER_QUOTA));
838 ASSERT((flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)) != 838 ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) !=
839 (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)); 839 (FS_PROJ_QUOTA | FS_GROUP_QUOTA));
840 ASSERT((flags & (XFS_USER_QUOTA | XFS_GROUP_QUOTA)) != 840 ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) !=
841 (XFS_USER_QUOTA | XFS_GROUP_QUOTA)); 841 (FS_USER_QUOTA | FS_GROUP_QUOTA));
842 ASSERT((flags & (XFS_PROJ_QUOTA|XFS_USER_QUOTA|XFS_GROUP_QUOTA)) != 0); 842 ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0);
843 843
844 return (flags & XFS_DQ_USER) ? 844 return (flags & XFS_DQ_USER) ?
845 XFS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? 845 FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
846 XFS_PROJ_QUOTA : XFS_GROUP_QUOTA; 846 FS_PROJ_QUOTA : FS_GROUP_QUOTA;
847} 847}
848 848
849STATIC uint 849STATIC uint
@@ -854,16 +854,16 @@ xfs_qm_export_flags(
854 854
855 uflags = 0; 855 uflags = 0;
856 if (flags & XFS_UQUOTA_ACCT) 856 if (flags & XFS_UQUOTA_ACCT)
857 uflags |= XFS_QUOTA_UDQ_ACCT; 857 uflags |= FS_QUOTA_UDQ_ACCT;
858 if (flags & XFS_PQUOTA_ACCT) 858 if (flags & XFS_PQUOTA_ACCT)
859 uflags |= XFS_QUOTA_PDQ_ACCT; 859 uflags |= FS_QUOTA_PDQ_ACCT;
860 if (flags & XFS_GQUOTA_ACCT) 860 if (flags & XFS_GQUOTA_ACCT)
861 uflags |= XFS_QUOTA_GDQ_ACCT; 861 uflags |= FS_QUOTA_GDQ_ACCT;
862 if (flags & XFS_UQUOTA_ENFD) 862 if (flags & XFS_UQUOTA_ENFD)
863 uflags |= XFS_QUOTA_UDQ_ENFD; 863 uflags |= FS_QUOTA_UDQ_ENFD;
864 if (flags & (XFS_OQUOTA_ENFD)) { 864 if (flags & (XFS_OQUOTA_ENFD)) {
865 uflags |= (flags & XFS_GQUOTA_ACCT) ? 865 uflags |= (flags & XFS_GQUOTA_ACCT) ?
866 XFS_QUOTA_GDQ_ENFD : XFS_QUOTA_PDQ_ENFD; 866 FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
867 } 867 }
868 return (uflags); 868 return (uflags);
869} 869}
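
These XFS hunks only rename the exported flag constants from XFS_QUOTA_*/XFS_*_QUOTA to the generic FS_QUOTA_*/FS_*_QUOTA names in include/linux/dqblk_xfs.h; the mapping logic itself is unchanged. A runnable userspace model of the flag export in xfs_qm_export_flags() (the internal XFS_* values below are placeholders, and the FS_QUOTA_* values should be checked against dqblk_xfs.h):

#include <stdio.h>

/* placeholder internal flags -- kernel values differ */
#define XFS_UQUOTA_ACCT (1u << 0)
#define XFS_GQUOTA_ACCT (1u << 1)
#define XFS_PQUOTA_ACCT (1u << 2)
#define XFS_UQUOTA_ENFD (1u << 3)
#define XFS_OQUOTA_ENFD (1u << 4)

/* exported names, mirroring include/linux/dqblk_xfs.h */
#define FS_QUOTA_UDQ_ACCT (1u << 0)
#define FS_QUOTA_UDQ_ENFD (1u << 1)
#define FS_QUOTA_GDQ_ACCT (1u << 2)
#define FS_QUOTA_GDQ_ENFD (1u << 3)
#define FS_QUOTA_PDQ_ACCT (1u << 4)
#define FS_QUOTA_PDQ_ENFD (1u << 5)

static unsigned int export_flags(unsigned int flags)
{
	unsigned int uflags = 0;

	if (flags & XFS_UQUOTA_ACCT)
		uflags |= FS_QUOTA_UDQ_ACCT;
	if (flags & XFS_PQUOTA_ACCT)
		uflags |= FS_QUOTA_PDQ_ACCT;
	if (flags & XFS_GQUOTA_ACCT)
		uflags |= FS_QUOTA_GDQ_ACCT;
	if (flags & XFS_UQUOTA_ENFD)
		uflags |= FS_QUOTA_UDQ_ENFD;
	if (flags & XFS_OQUOTA_ENFD)	/* "other" = group or project */
		uflags |= (flags & XFS_GQUOTA_ACCT) ?
			FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
	return uflags;
}

int main(void)
{
	printf("0x%x\n", export_flags(XFS_UQUOTA_ACCT | XFS_UQUOTA_ENFD));
	return 0;
}
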