author     James Morris <jmorris@macbook.(none)>  2009-12-03 01:33:40 -0500
committer  James Morris <jmorris@macbook.(none)>  2009-12-03 01:33:40 -0500
commit     c84d6efd363a3948eb32ec40d46bab6338580454 (patch)
tree       3ba7ac46e6626fe8ac843834588609eb6ccee5c6 /fs
parent     7539cf4b92be4aecc573ea962135f246a7a33401 (diff)
parent     22763c5cf3690a681551162c15d34d935308c8d7 (diff)
Merge branch 'master' into next
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/cache.c | 14
-rw-r--r--  fs/9p/vfs_dir.c | 93
-rw-r--r--  fs/9p/vfs_inode.c | 5
-rw-r--r--  fs/Kconfig | 4
-rw-r--r--  fs/afs/cache.h | 12
-rw-r--r--  fs/afs/file.c | 15
-rw-r--r--  fs/afs/internal.h | 2
-rw-r--r--  fs/anon_inodes.c | 2
-rw-r--r--  fs/bio.c | 77
-rw-r--r--  fs/block_dev.c | 2
-rw-r--r--  fs/btrfs/acl.c | 6
-rw-r--r--  fs/btrfs/async-thread.c | 81
-rw-r--r--  fs/btrfs/async-thread.h | 10
-rw-r--r--  fs/btrfs/btrfs_inode.h | 16
-rw-r--r--  fs/btrfs/ctree.h | 38
-rw-r--r--  fs/btrfs/disk-io.c | 60
-rw-r--r--  fs/btrfs/extent-tree.c | 724
-rw-r--r--  fs/btrfs/extent_io.c | 134
-rw-r--r--  fs/btrfs/extent_io.h | 31
-rw-r--r--  fs/btrfs/extent_map.c | 2
-rw-r--r--  fs/btrfs/file.c | 79
-rw-r--r--  fs/btrfs/free-space-cache.c | 2
-rw-r--r--  fs/btrfs/inode.c | 442
-rw-r--r--  fs/btrfs/ioctl.c | 69
-rw-r--r--  fs/btrfs/ordered-data.c | 99
-rw-r--r--  fs/btrfs/ordered-data.h | 4
-rw-r--r--  fs/btrfs/relocation.c | 4
-rw-r--r--  fs/btrfs/root-tree.c | 2
-rw-r--r--  fs/btrfs/super.c | 9
-rw-r--r--  fs/btrfs/transaction.c | 72
-rw-r--r--  fs/btrfs/transaction.h | 5
-rw-r--r--  fs/btrfs/tree-log.c | 56
-rw-r--r--  fs/btrfs/tree-log.h | 3
-rw-r--r--  fs/btrfs/volumes.c | 4
-rw-r--r--  fs/btrfs/xattr.c | 2
-rw-r--r--  fs/cachefiles/interface.c | 32
-rw-r--r--  fs/cachefiles/namei.c | 187
-rw-r--r--  fs/cachefiles/rdwr.c | 130
-rw-r--r--  fs/cifs/CHANGES | 9
-rw-r--r--  fs/cifs/cifsfs.c | 2
-rw-r--r--  fs/cifs/cifsproto.h | 1
-rw-r--r--  fs/cifs/connect.c | 11
-rw-r--r--  fs/cifs/dir.c | 8
-rw-r--r--  fs/cifs/inode.c | 7
-rw-r--r--  fs/cifs/misc.c | 14
-rw-r--r--  fs/cifs/readdir.c | 7
-rw-r--r--  fs/coda/psdev.c | 1
-rw-r--r--  fs/compat.c | 2
-rw-r--r--  fs/compat_ioctl.c | 4
-rw-r--r--  fs/dlm/lowcomms.c | 36
-rw-r--r--  fs/ecryptfs/Kconfig | 3
-rw-r--r--  fs/ecryptfs/main.c | 7
-rw-r--r--  fs/exec.c | 8
-rw-r--r--  fs/ext3/fsync.c | 36
-rw-r--r--  fs/ext3/inode.c | 36
-rw-r--r--  fs/ext3/super.c | 15
-rw-r--r--  fs/ext4/Kconfig | 14
-rw-r--r--  fs/ext4/ext4.h | 56
-rw-r--r--  fs/ext4/ext4_extents.h | 7
-rw-r--r--  fs/ext4/ext4_jbd2.h | 6
-rw-r--r--  fs/ext4/extents.c | 458
-rw-r--r--  fs/ext4/fsync.c | 5
-rw-r--r--  fs/ext4/inode.c | 586
-rw-r--r--  fs/ext4/mballoc.c | 305
-rw-r--r--  fs/ext4/mballoc.h | 35
-rw-r--r--  fs/ext4/migrate.c | 2
-rw-r--r--  fs/ext4/move_extent.c | 20
-rw-r--r--  fs/ext4/namei.c | 19
-rw-r--r--  fs/ext4/super.c | 150
-rw-r--r--  fs/fat/fat.h | 2
-rw-r--r--  fs/fat/inode.c | 18
-rw-r--r--  fs/fat/misc.c | 8
-rw-r--r--  fs/fat/namei_vfat.c | 15
-rw-r--r--  fs/fcntl.c | 4
-rw-r--r--  fs/file.c | 1
-rw-r--r--  fs/fscache/Kconfig | 7
-rw-r--r--  fs/fscache/Makefile | 1
-rw-r--r--  fs/fscache/cache.c | 5
-rw-r--r--  fs/fscache/cookie.c | 26
-rw-r--r--  fs/fscache/internal.h | 56
-rw-r--r--  fs/fscache/main.c | 6
-rw-r--r--  fs/fscache/object-list.c | 432
-rw-r--r--  fs/fscache/object.c | 104
-rw-r--r--  fs/fscache/operation.c | 120
-rw-r--r--  fs/fscache/page.c | 273
-rw-r--r--  fs/fscache/proc.c | 13
-rw-r--r--  fs/fscache/stats.c | 94
-rw-r--r--  fs/fuse/dir.c | 7
-rw-r--r--  fs/fuse/file.c | 5
-rw-r--r--  fs/gfs2/main.c | 4
-rw-r--r--  fs/gfs2/recovery.c | 2
-rw-r--r--  fs/hfs/btree.c | 5
-rw-r--r--  fs/hfsplus/wrapper.c | 4
-rw-r--r--  fs/ioctl.c | 2
-rw-r--r--  fs/jbd/journal.c | 3
-rw-r--r--  fs/jbd2/checkpoint.c | 7
-rw-r--r--  fs/jbd2/commit.c | 59
-rw-r--r--  fs/jbd2/journal.c | 200
-rw-r--r--  fs/jffs2/read.c | 9
-rw-r--r--  fs/nfs/client.c | 2
-rw-r--r--  fs/nfs/dir.c | 2
-rw-r--r--  fs/nfs/direct.c | 1
-rw-r--r--  fs/nfs/fscache.c | 10
-rw-r--r--  fs/nfs/nfs4namespace.c | 12
-rw-r--r--  fs/nfs/nfs4proc.c | 17
-rw-r--r--  fs/nfs/nfs4renewd.c | 6
-rw-r--r--  fs/nfs/nfs4xdr.c | 1
-rw-r--r--  fs/nfs/super.c | 37
-rw-r--r--  fs/nfsd/nfs3xdr.c | 2
-rw-r--r--  fs/nfsd/nfsctl.c | 2
-rw-r--r--  fs/nilfs2/btnode.c | 5
-rw-r--r--  fs/nilfs2/cpfile.c | 2
-rw-r--r--  fs/nilfs2/dir.c | 2
-rw-r--r--  fs/nilfs2/file.c | 2
-rw-r--r--  fs/nilfs2/inode.c | 2
-rw-r--r--  fs/nilfs2/ioctl.c | 39
-rw-r--r--  fs/nilfs2/mdt.c | 2
-rw-r--r--  fs/nilfs2/nilfs.h | 4
-rw-r--r--  fs/nilfs2/segment.c | 17
-rw-r--r--  fs/nls/nls_base.c | 8
-rw-r--r--  fs/notify/dnotify/dnotify.c | 3
-rw-r--r--  fs/notify/inode_mark.c | 6
-rw-r--r--  fs/notify/notification.c | 2
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r--  fs/ocfs2/cluster/netdebug.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 8
-rw-r--r--  fs/ocfs2/file.c | 3
-rw-r--r--  fs/ocfs2/ocfs2.h | 7
-rw-r--r--  fs/ocfs2/refcounttree.c | 69
-rw-r--r--  fs/ocfs2/super.c | 22
-rw-r--r--  fs/ocfs2/uptodate.c | 5
-rw-r--r--  fs/omfs/dir.c | 2
-rw-r--r--  fs/omfs/file.c | 2
-rw-r--r--  fs/omfs/omfs.h | 4
-rw-r--r--  fs/pipe.c | 41
-rw-r--r--  fs/proc/array.c | 2
-rw-r--r--  fs/proc/base.c | 3
-rw-r--r--  fs/proc/kcore.c | 1
-rw-r--r--  fs/proc/meminfo.c | 2
-rw-r--r--  fs/proc/page.c | 5
-rw-r--r--  fs/romfs/storage.c | 4
-rw-r--r--  fs/select.c | 1
-rw-r--r--  fs/sysfs/dir.c | 7
-rw-r--r--  fs/sysfs/file.c | 14
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 38
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 9
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 41
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 59
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 36
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 1
-rw-r--r--  fs/xfs/xfs_dfrag.c | 8
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 4
-rw-r--r--  fs/xfs/xfs_ialloc.c | 1
-rw-r--r--  fs/xfs/xfs_inode.c | 4
-rw-r--r--  fs/xfs/xfs_inode.h | 2
-rw-r--r--  fs/xfs/xfs_inode_item.c | 18
-rw-r--r--  fs/xfs/xfs_itable.c | 21
-rw-r--r--  fs/xfs/xfs_log_recover.c | 4
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 23
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 6
162 files changed, 4679 insertions, 1885 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 51c94e26a34..e777961939f 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -343,18 +343,7 @@ int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
 
     BUG_ON(!vcookie->fscache);
 
-    if (PageFsCache(page)) {
-        if (fscache_check_page_write(vcookie->fscache, page)) {
-            if (!(gfp & __GFP_WAIT))
-                return 0;
-            fscache_wait_on_page_write(vcookie->fscache, page);
-        }
-
-        fscache_uncache_page(vcookie->fscache, page);
-        ClearPageFsCache(page);
-    }
-
-    return 1;
+    return fscache_maybe_release_page(vcookie->fscache, page, gfp);
 }
 
 void __v9fs_fscache_invalidate_page(struct page *page)
@@ -368,7 +357,6 @@ void __v9fs_fscache_invalidate_page(struct page *page)
         fscache_wait_on_page_write(vcookie->fscache, page);
         BUG_ON(!PageLocked(page));
         fscache_uncache_page(vcookie->fscache, page);
-        ClearPageFsCache(page);
     }
 }
 
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 873cd31baa4..15cce53bf61 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -40,6 +40,24 @@
40#include "fid.h" 40#include "fid.h"
41 41
42/** 42/**
43 * struct p9_rdir - readdir accounting
44 * @mutex: mutex protecting readdir
45 * @head: start offset of current dirread buffer
46 * @tail: end offset of current dirread buffer
47 * @buf: dirread buffer
48 *
49 * private structure for keeping track of readdir
50 * allocated on demand
51 */
52
53struct p9_rdir {
54 struct mutex mutex;
55 int head;
56 int tail;
57 uint8_t *buf;
58};
59
60/**
43 * dt_type - return file type 61 * dt_type - return file type
44 * @mistat: mistat structure 62 * @mistat: mistat structure
45 * 63 *
@@ -70,56 +88,79 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
70{ 88{
71 int over; 89 int over;
72 struct p9_wstat st; 90 struct p9_wstat st;
73 int err; 91 int err = 0;
74 struct p9_fid *fid; 92 struct p9_fid *fid;
75 int buflen; 93 int buflen;
76 char *statbuf; 94 int reclen = 0;
77 int n, i = 0; 95 struct p9_rdir *rdir;
78 96
79 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name); 97 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
80 fid = filp->private_data; 98 fid = filp->private_data;
81 99
82 buflen = fid->clnt->msize - P9_IOHDRSZ; 100 buflen = fid->clnt->msize - P9_IOHDRSZ;
83 statbuf = kmalloc(buflen, GFP_KERNEL); 101
84 if (!statbuf) 102 /* allocate rdir on demand */
85 return -ENOMEM; 103 if (!fid->rdir) {
86 104 rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
87 while (1) { 105
88 err = v9fs_file_readn(filp, statbuf, NULL, buflen, 106 if (rdir == NULL) {
89 fid->rdir_fpos); 107 err = -ENOMEM;
90 if (err <= 0) 108 goto exit;
91 break; 109 }
92 110 spin_lock(&filp->f_dentry->d_lock);
93 n = err; 111 if (!fid->rdir) {
94 while (i < n) { 112 rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir);
95 err = p9stat_read(statbuf + i, buflen-i, &st, 113 mutex_init(&rdir->mutex);
96 fid->clnt->dotu); 114 rdir->head = rdir->tail = 0;
115 fid->rdir = (void *) rdir;
116 rdir = NULL;
117 }
118 spin_unlock(&filp->f_dentry->d_lock);
119 kfree(rdir);
120 }
121 rdir = (struct p9_rdir *) fid->rdir;
122
123 err = mutex_lock_interruptible(&rdir->mutex);
124 while (err == 0) {
125 if (rdir->tail == rdir->head) {
126 err = v9fs_file_readn(filp, rdir->buf, NULL,
127 buflen, filp->f_pos);
128 if (err <= 0)
129 goto unlock_and_exit;
130
131 rdir->head = 0;
132 rdir->tail = err;
133 }
134
135 while (rdir->head < rdir->tail) {
136 err = p9stat_read(rdir->buf + rdir->head,
137 buflen - rdir->head, &st,
138 fid->clnt->dotu);
97 if (err) { 139 if (err) {
98 P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err); 140 P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
99 err = -EIO; 141 err = -EIO;
100 p9stat_free(&st); 142 p9stat_free(&st);
101 goto free_and_exit; 143 goto unlock_and_exit;
102 } 144 }
103 145 reclen = st.size+2;
104 i += st.size+2;
105 fid->rdir_fpos += st.size+2;
106 146
107 over = filldir(dirent, st.name, strlen(st.name), 147 over = filldir(dirent, st.name, strlen(st.name),
108 filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st)); 148 filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
109 149
110 filp->f_pos += st.size+2;
111
112 p9stat_free(&st); 150 p9stat_free(&st);
113 151
114 if (over) { 152 if (over) {
115 err = 0; 153 err = 0;
116 goto free_and_exit; 154 goto unlock_and_exit;
117 } 155 }
156 rdir->head += reclen;
157 filp->f_pos += reclen;
118 } 158 }
119 } 159 }
120 160
121free_and_exit: 161unlock_and_exit:
122 kfree(statbuf); 162 mutex_unlock(&rdir->mutex);
163exit:
123 return err; 164 return err;
124} 165}
125 166
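
Note: the hunk above converts a throwaway per-call stat buffer into a per-fid buffer that persists across readdir() calls, consuming records between rdir->head and rdir->tail and refilling only when the buffer is empty. The following user-space C program is an illustrative sketch of that head/tail pattern only; it is not part of this commit, and fill_buf() is a hypothetical stand-in for v9fs_file_readn()/p9stat_read().

/* Minimal sketch of the head/tail buffering scheme used by struct p9_rdir. */
#include <stdio.h>
#include <string.h>

struct rdir {
    int head;                 /* offset of the next unconsumed record */
    int tail;                 /* end of valid data in buf */
    unsigned char buf[64];
};

/* Stand-in for v9fs_file_readn(): fill the buffer with NUL-terminated names. */
static int fill_buf(struct rdir *r)
{
    static const char *records[] = { "alpha", "beta", "gamma", NULL };
    static int call;
    int n = 0;

    if (records[call] == NULL)
        return 0;             /* no more directory data */
    for (; records[call]; call++) {
        int len = strlen(records[call]) + 1;
        memcpy(r->buf + n, records[call], len);
        n += len;
    }
    return n;
}

int main(void)
{
    struct rdir r = { .head = 0, .tail = 0 };

    for (;;) {
        if (r.head == r.tail) {          /* buffer empty: refill it */
            int n = fill_buf(&r);
            if (n <= 0)
                break;
            r.head = 0;
            r.tail = n;
        }
        while (r.head < r.tail) {        /* consume one record at a time */
            int reclen = strlen((char *)r.buf + r.head) + 1;
            printf("entry: %s\n", r.buf + r.head);
            r.head += reclen;            /* advance, like rdir->head += reclen */
        }
    }
    return 0;
}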
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5947628aefe..18f74ec4dce 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -994,8 +994,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
     P9_DPRINTK(P9_DEBUG_VFS,
         "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer);
 
-    retval = buflen;
-
+    retval = strnlen(buffer, buflen);
 done:
     kfree(st);
     return retval;
@@ -1062,7 +1061,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
             __putname(link);
             link = ERR_PTR(len);
         } else
-            link[len] = 0;
+            link[min(len, PATH_MAX-1)] = 0;
     }
     nd_set_link(nd, link);
 
diff --git a/fs/Kconfig b/fs/Kconfig
index d4bf8caad8d..64d44efad7a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -135,8 +135,8 @@ config TMPFS_POSIX_ACL
 
 config HUGETLBFS
     bool "HugeTLB file system support"
-    depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \
-           (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN
+    depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
+           SYS_SUPPORTS_HUGETLBFS || BROKEN
     help
       hugetlbfs is a filesystem backing for HugeTLB pages, based on
       ramfs. For architectures that support it, say Y here and read
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
deleted file mode 100644
index 5c4f6b499e9..00000000000
--- a/fs/afs/cache.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* AFS local cache management interface
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/fscache.h>
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 681c2a7b013..39b301662f2 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -315,7 +315,6 @@ static void afs_invalidatepage(struct page *page, unsigned long offset)
         struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
         fscache_wait_on_page_write(vnode->cache, page);
         fscache_uncache_page(vnode->cache, page);
-        ClearPageFsCache(page);
     }
 #endif
 
@@ -349,17 +348,9 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
     /* deny if page is being written to the cache and the caller hasn't
      * elected to wait */
 #ifdef CONFIG_AFS_FSCACHE
-    if (PageFsCache(page)) {
-        if (fscache_check_page_write(vnode->cache, page)) {
-            if (!(gfp_flags & __GFP_WAIT)) {
-                _leave(" = F [cache busy]");
-                return 0;
-            }
-            fscache_wait_on_page_write(vnode->cache, page);
-        }
-
-        fscache_uncache_page(vnode->cache, page);
-        ClearPageFsCache(page);
+    if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) {
+        _leave(" = F [cache busy]");
+        return 0;
     }
 #endif
 
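
Note: both the 9p and AFS releasepage hunks above replace the same open-coded sequence with one call to fscache_maybe_release_page(). The sketch below is inferred from the deleted lines, not copied from the FS-Cache implementation; it only illustrates the behaviour the callers now rely on (the page-flag clearing moves inside fscache_uncache_page()), and example_maybe_release_page is a hypothetical name.

static int example_maybe_release_page(struct fscache_cookie *cookie,
                                      struct page *page, gfp_t gfp)
{
    if (PageFsCache(page)) {
        if (fscache_check_page_write(cookie, page)) {
            if (!(gfp & __GFP_WAIT))
                return 0;        /* caller cannot wait: deny the release */
            fscache_wait_on_page_write(cookie, page);
        }
        fscache_uncache_page(cookie, page);
    }
    return 1;                    /* page no longer cached: OK to release */
}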
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 106be66dafd..6ece2a13bf7 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -18,10 +18,10 @@
 #include <linux/key.h>
 #include <linux/workqueue.h>
 #include <linux/sched.h>
+#include <linux/fscache.h>
 
 #include "afs.h"
 #include "afs_vl.h"
-#include "cache.h"
 
 #define AFS_CELL_MAX_ADDRS 15
 
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index d11c51fc2a3..2ca7a7cafdb 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -8,8 +8,10 @@
  *
  */
 
+#include <linux/cred.h>
 #include <linux/file.h>
 #include <linux/poll.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
diff --git a/fs/bio.c b/fs/bio.c
index 76738005c8e..12da5db8682 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -249,6 +249,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
 
     mempool_free(p, bs->bio_pool);
 }
+EXPORT_SYMBOL(bio_free);
 
 void bio_init(struct bio *bio)
 {
@@ -257,6 +258,7 @@ void bio_init(struct bio *bio)
     bio->bi_comp_cpu = -1;
     atomic_set(&bio->bi_cnt, 1);
 }
+EXPORT_SYMBOL(bio_init);
 
 /**
  * bio_alloc_bioset - allocate a bio for I/O
@@ -311,6 +313,7 @@ err_free:
     mempool_free(p, bs->bio_pool);
     return NULL;
 }
+EXPORT_SYMBOL(bio_alloc_bioset);
 
 static void bio_fs_destructor(struct bio *bio)
 {
@@ -322,8 +325,16 @@ static void bio_fs_destructor(struct bio *bio)
  * @gfp_mask: allocation mask to use
  * @nr_iovecs: number of iovecs
  *
- * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask
- * contains __GFP_WAIT, the allocation is guaranteed to succeed.
+ * bio_alloc will allocate a bio and associated bio_vec array that can hold
+ * at least @nr_iovecs entries. Allocations will be done from the
+ * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc.
+ *
+ * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
+ * a bio. This is due to the mempool guarantees. To make this work, callers
+ * must never allocate more than 1 bio at a time from this pool. Callers
+ * that need to allocate more than 1 bio must always submit the previously
+ * allocated bio for IO before attempting to allocate a new one. Failure to
+ * do so can cause livelocks under memory pressure.
  *
  * RETURNS:
  * Pointer to new bio on success, NULL on failure.
@@ -337,6 +348,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 
     return bio;
 }
+EXPORT_SYMBOL(bio_alloc);
 
 static void bio_kmalloc_destructor(struct bio *bio)
 {
@@ -346,21 +358,13 @@ static void bio_kmalloc_destructor(struct bio *bio)
 }
 
 /**
- * bio_alloc - allocate a bio for I/O
+ * bio_kmalloc - allocate a bio for I/O using kmalloc()
  * @gfp_mask: the GFP_ mask given to the slab allocator
  * @nr_iovecs: number of iovecs to pre-allocate
  *
  * Description:
- * bio_alloc will allocate a bio and associated bio_vec array that can hold
- * at least @nr_iovecs entries. Allocations will be done from the
- * fs_bio_set. Also see @bio_alloc_bioset.
- *
- * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
- * a bio. This is due to the mempool guarantees. To make this work, callers
- * must never allocate more than 1 bio at a time from this pool. Callers
- * that need to allocate more than 1 bio must always submit the previously
- * allocated bio for IO before attempting to allocate a new one. Failure to
- * do so can cause livelocks under memory pressure.
+ * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains
+ * %__GFP_WAIT, the allocation is guaranteed to succeed.
  *
  **/
 struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
@@ -380,6 +384,7 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
 
     return bio;
 }
+EXPORT_SYMBOL(bio_kmalloc);
 
 void zero_fill_bio(struct bio *bio)
 {
@@ -402,7 +407,7 @@ EXPORT_SYMBOL(zero_fill_bio);
  *
  * Description:
  *   Put a reference to a &struct bio, either one you have gotten with
- *   bio_alloc or bio_get. The last put of a bio will free it.
+ *   bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
  **/
 void bio_put(struct bio *bio)
 {
@@ -416,6 +421,7 @@ void bio_put(struct bio *bio)
         bio->bi_destructor(bio);
     }
 }
+EXPORT_SYMBOL(bio_put);
 
 inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
 {
@@ -424,6 +430,7 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
 
     return bio->bi_phys_segments;
 }
+EXPORT_SYMBOL(bio_phys_segments);
 
 /**
  * __bio_clone - clone a bio
@@ -451,6 +458,7 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
     bio->bi_size = bio_src->bi_size;
     bio->bi_idx = bio_src->bi_idx;
 }
+EXPORT_SYMBOL(__bio_clone);
 
 /**
  * bio_clone - clone a bio
@@ -482,6 +490,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 
     return b;
 }
+EXPORT_SYMBOL(bio_clone);
 
 /**
  * bio_get_nr_vecs - return approx number of vecs
@@ -505,6 +514,7 @@ int bio_get_nr_vecs(struct block_device *bdev)
 
     return nr_pages;
 }
+EXPORT_SYMBOL(bio_get_nr_vecs);
 
 static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
               *page, unsigned int len, unsigned int offset,
@@ -635,6 +645,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
     return __bio_add_page(q, bio, page, len, offset,
                   queue_max_hw_sectors(q));
 }
+EXPORT_SYMBOL(bio_add_pc_page);
 
 /**
  * bio_add_page - attempt to add page to bio
@@ -655,6 +666,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
     struct request_queue *q = bdev_get_queue(bio->bi_bdev);
     return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
 }
+EXPORT_SYMBOL(bio_add_page);
 
 struct bio_map_data {
     struct bio_vec *iovecs;
@@ -776,6 +788,7 @@ int bio_uncopy_user(struct bio *bio)
     bio_put(bio);
     return ret;
 }
+EXPORT_SYMBOL(bio_uncopy_user);
 
 /**
  * bio_copy_user_iov - copy user data to bio
@@ -920,6 +933,7 @@ struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
 
     return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
 }
+EXPORT_SYMBOL(bio_copy_user);
 
 static struct bio *__bio_map_user_iov(struct request_queue *q,
                       struct block_device *bdev,
@@ -1050,6 +1064,7 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
 
     return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
 }
+EXPORT_SYMBOL(bio_map_user);
 
 /**
  * bio_map_user_iov - map user sg_iovec table into bio
@@ -1117,13 +1132,13 @@ void bio_unmap_user(struct bio *bio)
     __bio_unmap_user(bio);
     bio_put(bio);
 }
+EXPORT_SYMBOL(bio_unmap_user);
 
 static void bio_map_kern_endio(struct bio *bio, int err)
 {
     bio_put(bio);
 }
 
-
 static struct bio *__bio_map_kern(struct request_queue *q, void *data,
                   unsigned int len, gfp_t gfp_mask)
 {
@@ -1189,6 +1204,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
     bio_put(bio);
     return ERR_PTR(-EINVAL);
 }
+EXPORT_SYMBOL(bio_map_kern);
 
 static void bio_copy_kern_endio(struct bio *bio, int err)
 {
@@ -1250,6 +1266,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 
     return bio;
 }
+EXPORT_SYMBOL(bio_copy_kern);
 
 /*
  * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
@@ -1400,6 +1417,7 @@ void bio_endio(struct bio *bio, int error)
     if (bio->bi_end_io)
         bio->bi_end_io(bio, error);
 }
+EXPORT_SYMBOL(bio_endio);
 
 void bio_pair_release(struct bio_pair *bp)
 {
@@ -1410,6 +1428,7 @@ void bio_pair_release(struct bio_pair *bp)
         mempool_free(bp, bp->bio2.bi_private);
     }
 }
+EXPORT_SYMBOL(bio_pair_release);
 
 static void bio_pair_end_1(struct bio *bi, int err)
 {
@@ -1477,6 +1496,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 
     return bp;
 }
+EXPORT_SYMBOL(bio_split);
 
 /**
  * bio_sector_offset - Find hardware sector offset in bio
@@ -1547,6 +1567,7 @@ void bioset_free(struct bio_set *bs)
 
     kfree(bs);
 }
+EXPORT_SYMBOL(bioset_free);
 
 /**
  * bioset_create - Create a bio_set
@@ -1592,6 +1613,7 @@ bad:
     bioset_free(bs);
     return NULL;
 }
+EXPORT_SYMBOL(bioset_create);
 
 static void __init biovec_init_slabs(void)
 {
@@ -1636,29 +1658,4 @@ static int __init init_bio(void)
 
     return 0;
 }
-
 subsys_initcall(init_bio);
-
-EXPORT_SYMBOL(bio_alloc);
-EXPORT_SYMBOL(bio_kmalloc);
-EXPORT_SYMBOL(bio_put);
-EXPORT_SYMBOL(bio_free);
-EXPORT_SYMBOL(bio_endio);
-EXPORT_SYMBOL(bio_init);
-EXPORT_SYMBOL(__bio_clone);
-EXPORT_SYMBOL(bio_clone);
-EXPORT_SYMBOL(bio_phys_segments);
-EXPORT_SYMBOL(bio_add_page);
-EXPORT_SYMBOL(bio_add_pc_page);
-EXPORT_SYMBOL(bio_get_nr_vecs);
-EXPORT_SYMBOL(bio_map_user);
-EXPORT_SYMBOL(bio_unmap_user);
-EXPORT_SYMBOL(bio_map_kern);
-EXPORT_SYMBOL(bio_copy_kern);
-EXPORT_SYMBOL(bio_pair_release);
-EXPORT_SYMBOL(bio_split);
-EXPORT_SYMBOL(bio_copy_user);
-EXPORT_SYMBOL(bio_uncopy_user);
-EXPORT_SYMBOL(bioset_create);
-EXPORT_SYMBOL(bioset_free);
-EXPORT_SYMBOL(bio_alloc_bioset);
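
Note: the relocated bio_alloc() comment above spells out the fs_bio_set mempool rule: with __GFP_WAIT the allocation always succeeds, but only if each caller holds at most one such bio at a time and submits it before allocating the next. The fragment below is an illustrative sketch of that usage pattern, not code from this commit; write_chunks() and chunk_end_io() are hypothetical helpers using the bio API of this kernel generation.

#include <linux/bio.h>

/* completion handler: drop our reference once the write finishes */
static void chunk_end_io(struct bio *bio, int err)
{
    bio_put(bio);
}

static void write_chunks(struct block_device *bdev, sector_t sector,
                         struct page **pages, int nr)
{
    int i;

    for (i = 0; i < nr; i++) {
        /* GFP_NOIO includes __GFP_WAIT, so this cannot fail */
        struct bio *bio = bio_alloc(GFP_NOIO, 1);

        bio->bi_bdev = bdev;
        bio->bi_sector = sector + i * (PAGE_SIZE >> 9);
        bio->bi_end_io = chunk_end_io;
        bio_add_page(bio, pages[i], PAGE_SIZE, 0);

        /* submit before looping: never hold two fs_bio_set bios at once */
        submit_bio(WRITE, bio);
    }
}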
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9cf4b926f8e..8bed0557d88 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1248,8 +1248,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
             bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
         }
     } else {
-        put_disk(disk);
         module_put(disk->fops->owner);
+        put_disk(disk);
         disk = NULL;
         if (bdev->bd_contains == bdev) {
             if (bdev->bd_disk->fops->open) {
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f128427b995..36160424427 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
 #include "btrfs_inode.h"
 #include "xattr.h"
 
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
 
 static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 {
@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
     .set = btrfs_xattr_acl_access_set,
 };
 
-#else /* CONFIG_FS_POSIX_ACL */
+#else /* CONFIG_BTRFS_FS_POSIX_ACL */
 
 int btrfs_acl_chmod(struct inode *inode)
 {
@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
     return 0;
 }
 
-#endif /* CONFIG_FS_POSIX_ACL */
+#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
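
Note: the hunk above switches btrfs ACL support from the generic CONFIG_FS_POSIX_ACL symbol to a btrfs-specific CONFIG_BTRFS_FS_POSIX_ACL option. The option itself is defined outside this diff (in fs/btrfs/Kconfig); the entry below is only a sketch of what such a Kconfig option presumably looks like, and its wording is an assumption rather than text from this commit.

config BTRFS_FS_POSIX_ACL
    bool "Btrfs POSIX Access Control Lists"
    depends on BTRFS_FS
    select FS_POSIX_ACL
    help
      POSIX Access Control Lists (ACLs) support additional access rights
      for users and groups beyond the standard owner/group/world scheme.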
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 282ca085c2f..c0861e781cd 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,51 @@ struct btrfs_worker_thread {
 };
 
 /*
+ * btrfs_start_workers uses kthread_run, which can block waiting for memory
+ * for a very long time.  It will actually throttle on page writeback,
+ * and so it may not make progress until after our btrfs worker threads
+ * process all of the pending work structs in their queue
+ *
+ * This means we can't use btrfs_start_workers from inside a btrfs worker
+ * thread that is used as part of cleaning dirty memory, which pretty much
+ * involves all of the worker threads.
+ *
+ * Instead we have a helper queue who never has more than one thread
+ * where we scheduler thread start operations.  This worker_start struct
+ * is used to contain the work and hold a pointer to the queue that needs
+ * another worker.
+ */
+struct worker_start {
+    struct btrfs_work work;
+    struct btrfs_workers *queue;
+};
+
+static void start_new_worker_func(struct btrfs_work *work)
+{
+    struct worker_start *start;
+    start = container_of(work, struct worker_start, work);
+    btrfs_start_workers(start->queue, 1);
+    kfree(start);
+}
+
+static int start_new_worker(struct btrfs_workers *queue)
+{
+    struct worker_start *start;
+    int ret;
+
+    start = kzalloc(sizeof(*start), GFP_NOFS);
+    if (!start)
+        return -ENOMEM;
+
+    start->work.func = start_new_worker_func;
+    start->queue = queue;
+    ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
+    if (ret)
+        kfree(start);
+    return ret;
+}
+
+/*
  * helper function to move a thread onto the idle list after it
  * has finished some requests.
  */
@@ -118,11 +163,13 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
         goto out;
 
     workers->atomic_start_pending = 0;
-    if (workers->num_workers >= workers->max_workers)
+    if (workers->num_workers + workers->num_workers_starting >=
+        workers->max_workers)
         goto out;
 
+    workers->num_workers_starting += 1;
     spin_unlock_irqrestore(&workers->lock, flags);
-    btrfs_start_workers(workers, 1);
+    start_new_worker(workers);
     return;
 
 out:
@@ -390,9 +437,11 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 /*
  * simple init on struct btrfs_workers
  */
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+            struct btrfs_workers *async_helper)
 {
     workers->num_workers = 0;
+    workers->num_workers_starting = 0;
     INIT_LIST_HEAD(&workers->worker_list);
     INIT_LIST_HEAD(&workers->idle_list);
     INIT_LIST_HEAD(&workers->order_list);
@@ -404,14 +453,15 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
     workers->name = name;
     workers->ordered = 0;
     workers->atomic_start_pending = 0;
-    workers->atomic_worker_start = 0;
+    workers->atomic_worker_start = async_helper;
 }
 
 /*
  * starts new worker threads.  This does not enforce the max worker
  * count in case you need to temporarily go past it.
  */
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers,
+                 int num_workers)
 {
     struct btrfs_worker_thread *worker;
     int ret = 0;
@@ -444,6 +494,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
         list_add_tail(&worker->worker_list, &workers->idle_list);
         worker->idle = 1;
         workers->num_workers++;
+        workers->num_workers_starting--;
+        WARN_ON(workers->num_workers_starting < 0);
         spin_unlock_irq(&workers->lock);
     }
     return 0;
@@ -452,6 +504,14 @@ fail:
     return ret;
 }
 
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+    spin_lock_irq(&workers->lock);
+    workers->num_workers_starting += num_workers;
+    spin_unlock_irq(&workers->lock);
+    return __btrfs_start_workers(workers, num_workers);
+}
+
 /*
  * run through the list and find a worker thread that doesn't have a lot
  * to do right now.  This can return null if we aren't yet at the thread
@@ -461,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 {
     struct btrfs_worker_thread *worker;
     struct list_head *next;
-    int enforce_min = workers->num_workers < workers->max_workers;
+    int enforce_min;
+
+    enforce_min = (workers->num_workers + workers->num_workers_starting) <
+        workers->max_workers;
 
     /*
      * if we find an idle thread, don't move it to the end of the
@@ -509,15 +572,17 @@ again:
     worker = next_worker(workers);
 
     if (!worker) {
-        if (workers->num_workers >= workers->max_workers) {
+        if (workers->num_workers + workers->num_workers_starting >=
+            workers->max_workers) {
             goto fallback;
         } else if (workers->atomic_worker_start) {
             workers->atomic_start_pending = 1;
             goto fallback;
         } else {
+            workers->num_workers_starting++;
             spin_unlock_irqrestore(&workers->lock, flags);
             /* we're below the limit, start another worker */
-            btrfs_start_workers(workers, 1);
+            __btrfs_start_workers(workers, 1);
             goto again;
         }
     }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index fc089b95ec1..5077746cf85 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ struct btrfs_workers {
     /* current number of running workers */
     int num_workers;
 
+    int num_workers_starting;
+
     /* max number of workers allowed.  changed by btrfs_start_workers */
     int max_workers;
 
@@ -78,9 +80,10 @@ struct btrfs_workers {
 
     /*
      * are we allowed to sleep while starting workers or are we required
-     * to start them at a later time?
+     * to start them at a later time?  If we can't sleep, this indicates
+     * which queue we need to use to schedule thread creation.
      */
-    int atomic_worker_start;
+    struct btrfs_workers *atomic_worker_start;
 
     /* list with all the work threads.  The workers on the idle thread
      * may be actively servicing jobs, but they haven't yet hit the
@@ -109,7 +112,8 @@ struct btrfs_workers {
 int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
 int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+            struct btrfs_workers *async_starter);
 int btrfs_requeue_work(struct btrfs_work *work);
 void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
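
Note: with the new signature, every pool names a helper queue that spawns its threads from process context instead of from the pool's own (possibly atomic) workers. The fragment below is a condensed usage sketch distilled from the open_ctree() hunks further down in this diff; it adds nothing beyond what those hunks already show.

    /* the generic helper has no async starter of its own */
    btrfs_init_workers(&fs_info->generic_worker, "genwork", 1, NULL);

    /* other pools defer thread creation to the generic helper */
    btrfs_init_workers(&fs_info->endio_workers, "endio",
                       fs_info->thread_pool_size,
                       &fs_info->generic_worker);

    btrfs_start_workers(&fs_info->generic_worker, 1);
    btrfs_start_workers(&fs_info->endio_workers, 1);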
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 82ee56bba29..f6783a42f01 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -86,6 +86,12 @@ struct btrfs_inode {
      * transid of the trans_handle that last modified this inode
      */
     u64 last_trans;
+
+    /*
+     * log transid when this inode was last modified
+     */
+    u64 last_sub_trans;
+
     /*
      * transid that last logged this inode
      */
@@ -128,6 +134,16 @@ struct btrfs_inode {
     u64 last_unlink_trans;
 
     /*
+     * Counters to keep track of the number of extent item's we may use due
+     * to delalloc and such.  outstanding_extents is the number of extent
+     * items we think we'll end up using, and reserved_extents is the number
+     * of extent items we've reserved metadata for.
+     */
+    spinlock_t accounting_lock;
+    int reserved_extents;
+    int outstanding_extents;
+
+    /*
      * ordered_data_close is set by truncate when a file that used
      * to have good data has been truncated to zero. When it is set
      * the btrfs file release call will add this inode to the
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 80599b4e42b..444b3e9b92a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -675,21 +675,28 @@ struct btrfs_space_info {
                    current allocations */
     u64 bytes_readonly;    /* total bytes that are read only */
     u64 bytes_super;       /* total bytes reserved for the super blocks */
-
-    /* delalloc accounting */
-    u64 bytes_delalloc;    /* number of bytes reserved for allocation,
-                  this space is not necessarily reserved yet
-                  by the allocator */
+    u64 bytes_root;        /* the number of bytes needed to commit a
+                  transaction */
     u64 bytes_may_use;     /* number of bytes that may be used for
-                  delalloc */
+                  delalloc/allocations */
+    u64 bytes_delalloc;    /* number of bytes currently reserved for
+                  delayed allocation */
 
     int full;              /* indicates that we cannot allocate any more
                   chunks for this space */
     int force_alloc;       /* set if we need to force a chunk alloc for
                   this space */
+    int force_delalloc;    /* make people start doing filemap_flush until
+                  we're under a threshold */
 
     struct list_head list;
 
+    /* for controlling how we free up space for allocations */
+    wait_queue_head_t allocate_wait;
+    wait_queue_head_t flush_wait;
+    int allocating_chunk;
+    int flushing;
+
     /* for block groups in our same type */
     struct list_head block_groups;
     spinlock_t lock;
@@ -903,6 +910,7 @@ struct btrfs_fs_info {
      * A third pool does submit_bio to avoid deadlocking with the other
      * two
      */
+    struct btrfs_workers generic_worker;
     struct btrfs_workers workers;
     struct btrfs_workers delalloc_workers;
     struct btrfs_workers endio_workers;
@@ -910,6 +918,7 @@ struct btrfs_fs_info {
     struct btrfs_workers endio_meta_write_workers;
     struct btrfs_workers endio_write_workers;
     struct btrfs_workers submit_workers;
+    struct btrfs_workers enospc_workers;
     /*
      * fixup workers take dirty pages that didn't properly go through
      * the cow mechanism and make them safe to write.  It happens
@@ -1000,7 +1009,10 @@ struct btrfs_root {
     atomic_t log_writers;
     atomic_t log_commit[2];
     unsigned long log_transid;
+    unsigned long last_log_commit;
     unsigned long log_batch;
+    pid_t log_start_pid;
+    bool log_multiple_pids;
 
     u64 objectid;
     u64 last_trans;
@@ -1141,6 +1153,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_FLUSHONCOMMIT       (1 << 7)
 #define BTRFS_MOUNT_SSD_SPREAD          (1 << 8)
 #define BTRFS_MOUNT_NOSSD               (1 << 9)
+#define BTRFS_MOUNT_DISCARD             (1 << 10)
 
 #define btrfs_clear_opt(o, opt)         ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)           ((o) |= BTRFS_MOUNT_##opt)
@@ -2022,7 +2035,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 
-int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+                    struct inode *inode, int num_items);
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+                    struct inode *inode, int num_items);
 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
                 u64 bytes);
 void btrfs_free_reserved_data_space(struct btrfs_root *root,
@@ -2314,7 +2332,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
 int btrfs_invalidate_inodes(struct btrfs_root *root);
-extern struct dentry_operations btrfs_dentry_operations;
+extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -2326,7 +2344,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                 int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
-extern struct file_operations btrfs_file_operations;
+extern const struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
                struct btrfs_root *root, struct inode *inode,
                u64 start, u64 end, u64 locked_end,
@@ -2357,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
 /* acl.c */
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
 int btrfs_check_acl(struct inode *inode, int mask);
 #else
 #define btrfs_check_acl NULL
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 644e796fd64..02b6afbd745 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -822,14 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 
 int btrfs_write_tree_block(struct extent_buffer *buf)
 {
-    return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
-                  buf->start + buf->len - 1, WB_SYNC_ALL);
+    return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+                  buf->start + buf->len - 1);
 }
 
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
-    return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
+    return filemap_fdatawait_range(buf->first_page->mapping,
                   buf->start, buf->start + buf->len - 1);
 }
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -917,6 +917,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
     atomic_set(&root->log_writers, 0);
     root->log_batch = 0;
     root->log_transid = 0;
+    root->last_log_commit = 0;
     extent_io_tree_init(&root->dirty_log_pages,
                 fs_info->btree_inode->i_mapping, GFP_NOFS);
 
@@ -1087,6 +1088,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
     WARN_ON(root->log_root);
     root->log_root = log_root;
     root->log_transid = 0;
+    root->last_log_commit = 0;
     return 0;
 }
 
@@ -1630,7 +1632,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
     fs_info->sb = sb;
     fs_info->max_extent = (u64)-1;
     fs_info->max_inline = 8192 * 1024;
-    fs_info->metadata_ratio = 8;
+    fs_info->metadata_ratio = 0;
 
     fs_info->thread_pool_size = min_t(unsigned long,
                       num_online_cpus() + 2, 8);
@@ -1746,21 +1748,25 @@ struct btrfs_root *open_ctree(struct super_block *sb,
         err = -EINVAL;
         goto fail_iput;
     }
-printk("thread pool is %d\n", fs_info->thread_pool_size);
-    /*
-     * we need to start all the end_io workers up front because the
-     * queue work function gets called at interrupt time, and so it
-     * cannot dynamically grow.
-     */
+
+    btrfs_init_workers(&fs_info->generic_worker,
+               "genwork", 1, NULL);
+
     btrfs_init_workers(&fs_info->workers, "worker",
-               fs_info->thread_pool_size);
+               fs_info->thread_pool_size,
+               &fs_info->generic_worker);
 
     btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
-               fs_info->thread_pool_size);
+               fs_info->thread_pool_size,
+               &fs_info->generic_worker);
 
     btrfs_init_workers(&fs_info->submit_workers, "submit",
                min_t(u64, fs_devices->num_devices,
-               fs_info->thread_pool_size));
+               fs_info->thread_pool_size),
+               &fs_info->generic_worker);
+    btrfs_init_workers(&fs_info->enospc_workers, "enospc",
+               fs_info->thread_pool_size,
+               &fs_info->generic_worker);
 
     /* a higher idle thresh on the submit workers makes it much more
      * likely that bios will be send down in a sane order to the
@@ -1774,15 +1780,20 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
     fs_info->delalloc_workers.idle_thresh = 2;
     fs_info->delalloc_workers.ordered = 1;
 
-    btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
+    btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
+               &fs_info->generic_worker);
     btrfs_init_workers(&fs_info->endio_workers, "endio",
-               fs_info->thread_pool_size);
+               fs_info->thread_pool_size,
+               &fs_info->generic_worker);
     btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
-               fs_info->thread_pool_size);
+               fs_info->thread_pool_size,
+               &fs_info->generic_worker);
     btrfs_init_workers(&fs_info->endio_meta_write_workers,
-               "endio-meta-write", fs_info->thread_pool_size);
+               "endio-meta-write", fs_info->thread_pool_size,
+               &fs_info->generic_worker);
     btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
-               fs_info->thread_pool_size);
+               fs_info->thread_pool_size,
+               &fs_info->generic_worker);
 
     /*
      * endios are largely parallel and should have a very
@@ -1794,12 +1805,8 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
     fs_info->endio_write_workers.idle_thresh = 2;
     fs_info->endio_meta_write_workers.idle_thresh = 2;
 
-    fs_info->endio_workers.atomic_worker_start = 1;
-    fs_info->endio_meta_workers.atomic_worker_start = 1;
-    fs_info->endio_write_workers.atomic_worker_start = 1;
-    fs_info->endio_meta_write_workers.atomic_worker_start = 1;
-
     btrfs_start_workers(&fs_info->workers, 1);
+    btrfs_start_workers(&fs_info->generic_worker, 1);
     btrfs_start_workers(&fs_info->submit_workers, 1);
     btrfs_start_workers(&fs_info->delalloc_workers, 1);
     btrfs_start_workers(&fs_info->fixup_workers, 1);
@@ -1807,6 +1814,7 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
     btrfs_start_workers(&fs_info->endio_meta_workers, 1);
     btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
     btrfs_start_workers(&fs_info->endio_write_workers, 1);
+    btrfs_start_workers(&fs_info->enospc_workers, 1);
 
     fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
     fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2012,6 +2020,7 @@ fail_chunk_root:
     free_extent_buffer(chunk_root->node);
     free_extent_buffer(chunk_root->commit_root);
 fail_sb_buffer:
+    btrfs_stop_workers(&fs_info->generic_worker);
     btrfs_stop_workers(&fs_info->fixup_workers);
     btrfs_stop_workers(&fs_info->delalloc_workers);
     btrfs_stop_workers(&fs_info->workers);
@@ -2020,6 +2029,7 @@ fail_sb_buffer:
     btrfs_stop_workers(&fs_info->endio_meta_write_workers);
     btrfs_stop_workers(&fs_info->endio_write_workers);
     btrfs_stop_workers(&fs_info->submit_workers);
+    btrfs_stop_workers(&fs_info->enospc_workers);
 fail_iput:
     invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
     iput(fs_info->btree_inode);
@@ -2437,6 +2447,7 @@ int close_ctree(struct btrfs_root *root)
 
     iput(fs_info->btree_inode);
 
+    btrfs_stop_workers(&fs_info->generic_worker);
     btrfs_stop_workers(&fs_info->fixup_workers);
     btrfs_stop_workers(&fs_info->delalloc_workers);
     btrfs_stop_workers(&fs_info->workers);
@@ -2445,6 +2456,7 @@ int close_ctree(struct btrfs_root *root)
     btrfs_stop_workers(&fs_info->endio_meta_write_workers);
     btrfs_stop_workers(&fs_info->endio_write_workers);
     btrfs_stop_workers(&fs_info->submit_workers);
+    btrfs_stop_workers(&fs_info->enospc_workers);
 
     btrfs_close_devices(fs_info->fs_devices);
     btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 993f93ff7ba..94627c4cc19 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -68,6 +68,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
68 struct extent_buffer **must_clean); 68 struct extent_buffer **must_clean);
69static int find_next_key(struct btrfs_path *path, int level, 69static int find_next_key(struct btrfs_path *path, int level,
70 struct btrfs_key *key); 70 struct btrfs_key *key);
71static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
72 int dump_block_groups);
71 73
72static noinline int 74static noinline int
73block_group_cache_done(struct btrfs_block_group_cache *cache) 75block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -1566,23 +1568,23 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1566 return ret; 1568 return ret;
1567} 1569}
1568 1570
1569#ifdef BIO_RW_DISCARD
1570static void btrfs_issue_discard(struct block_device *bdev, 1571static void btrfs_issue_discard(struct block_device *bdev,
1571 u64 start, u64 len) 1572 u64 start, u64 len)
1572{ 1573{
1573 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1574 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1574 DISCARD_FL_BARRIER); 1575 DISCARD_FL_BARRIER);
1575} 1576}
1576#endif
1577 1577
1578static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1578static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1579 u64 num_bytes) 1579 u64 num_bytes)
1580{ 1580{
1581#ifdef BIO_RW_DISCARD
1582 int ret; 1581 int ret;
1583 u64 map_length = num_bytes; 1582 u64 map_length = num_bytes;
1584 struct btrfs_multi_bio *multi = NULL; 1583 struct btrfs_multi_bio *multi = NULL;
1585 1584
1585 if (!btrfs_test_opt(root, DISCARD))
1586 return 0;
1587
1586 /* Tell the block device(s) that the sectors can be discarded */ 1588 /* Tell the block device(s) that the sectors can be discarded */
1587 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, 1589 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
1588 bytenr, &map_length, &multi, 0); 1590 bytenr, &map_length, &multi, 0);
@@ -1602,9 +1604,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1602 } 1604 }
1603 1605
1604 return ret; 1606 return ret;
1605#else
1606 return 0;
1607#endif
1608} 1607}
1609 1608
1610int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1609int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@@ -2765,67 +2764,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2765 alloc_target); 2764 alloc_target);
2766} 2765}
2767 2766
2767static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2768{
2769 u64 num_bytes;
2770 int level;
2771
2772 level = BTRFS_MAX_LEVEL - 2;
2773 /*
2774 * NOTE: these calculations are absolutely the worst possible case.
2775 * This assumes that _every_ item we insert will require a new leaf, and
2776 * that the tree has grown to its maximum level size.
2777 */
2778
2779 /*
2780	 * for every item we insert we could insert both an extent item and an
2781	 * extent ref item. Then for every item we insert, we will need to cow
2782 * both the original leaf, plus the leaf to the left and right of it.
2783 *
2784 * Unless we are talking about the extent root, then we just want the
2785 * number of items * 2, since we just need the extent item plus its ref.
2786 */
2787 if (root == root->fs_info->extent_root)
2788 num_bytes = num_items * 2;
2789 else
2790 num_bytes = (num_items + (2 * num_items)) * 3;
2791
2792 /*
2793 * num_bytes is total number of leaves we could need times the leaf
2794 * size, and then for every leaf we could end up cow'ing 2 nodes per
2795 * level, down to the leaf level.
2796 */
2797 num_bytes = (num_bytes * root->leafsize) +
2798 (num_bytes * (level * 2)) * root->nodesize;
2799
2800 return num_bytes;
2801}
2802
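For a concrete sense of scale, the worst-case arithmetic in calculate_bytes_needed() above can be checked with a small standalone program. The 4 KiB leaf/node sizes and the assumption BTRFS_MAX_LEVEL == 8 below are illustrative only, not values read from the patch.

/* sketch: reproduce the worst-case reservation math for one inserted item */
#include <stdio.h>
#include <stdint.h>

#define BTRFS_MAX_LEVEL 8                     /* assumed, matches mainline */

static uint64_t worst_case_bytes(int num_items, int is_extent_root,
                                 uint64_t leafsize, uint64_t nodesize)
{
        int level = BTRFS_MAX_LEVEL - 2;      /* interior levels we may cow */
        uint64_t leaves;

        if (is_extent_root)
                leaves = (uint64_t)num_items * 2;              /* extent item + ref */
        else
                leaves = (uint64_t)(num_items + 2 * num_items) * 3; /* leaf + neighbors, x3 */

        /* every leaf, plus two cow'd nodes per interior level per leaf */
        return leaves * leafsize + leaves * (level * 2) * nodesize;
}

int main(void)
{
        printf("1 item, fs tree:     %llu bytes\n",
               (unsigned long long)worst_case_bytes(1, 0, 4096, 4096));
        printf("1 item, extent root: %llu bytes\n",
               (unsigned long long)worst_case_bytes(1, 1, 4096, 4096));
        return 0;
}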
2768/* 2803/*
2769 * for now this just makes sure we have at least 5% of our metadata space free 2804 * Unreserve metadata space for delalloc. If we have less reserved credits than
2770 * for use. 2805 * we have extents, this function does nothing.
2771 */ 2806 */
2772int btrfs_check_metadata_free_space(struct btrfs_root *root) 2807int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2808 struct inode *inode, int num_items)
2773{ 2809{
2774 struct btrfs_fs_info *info = root->fs_info; 2810 struct btrfs_fs_info *info = root->fs_info;
2775 struct btrfs_space_info *meta_sinfo; 2811 struct btrfs_space_info *meta_sinfo;
2776 u64 alloc_target, thresh; 2812 u64 num_bytes;
2777 int committed = 0, ret; 2813 u64 alloc_target;
2814 bool bug = false;
2778 2815
2779 /* get the space info for where the metadata will live */ 2816 /* get the space info for where the metadata will live */
2780 alloc_target = btrfs_get_alloc_profile(root, 0); 2817 alloc_target = btrfs_get_alloc_profile(root, 0);
2781 meta_sinfo = __find_space_info(info, alloc_target); 2818 meta_sinfo = __find_space_info(info, alloc_target);
2782 if (!meta_sinfo)
2783 goto alloc;
2784 2819
2785again: 2820 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2821 num_items);
2822
2786 spin_lock(&meta_sinfo->lock); 2823 spin_lock(&meta_sinfo->lock);
2787 if (!meta_sinfo->full) 2824 spin_lock(&BTRFS_I(inode)->accounting_lock);
2788 thresh = meta_sinfo->total_bytes * 80; 2825 if (BTRFS_I(inode)->reserved_extents <=
2789 else 2826 BTRFS_I(inode)->outstanding_extents) {
2790 thresh = meta_sinfo->total_bytes * 95; 2827 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2828 spin_unlock(&meta_sinfo->lock);
2829 return 0;
2830 }
2831 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2832
2833 BTRFS_I(inode)->reserved_extents--;
2834 BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2835
2836 if (meta_sinfo->bytes_delalloc < num_bytes) {
2837 bug = true;
2838 meta_sinfo->bytes_delalloc = 0;
2839 } else {
2840 meta_sinfo->bytes_delalloc -= num_bytes;
2841 }
2842 spin_unlock(&meta_sinfo->lock);
2843
2844 BUG_ON(bug);
2791 2845
2846 return 0;
2847}
2848
2849static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2850{
2851 u64 thresh;
2852
2853 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2854 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2855 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2856 meta_sinfo->bytes_may_use;
2857
2858 thresh = meta_sinfo->total_bytes - thresh;
2859 thresh *= 80;
2792 do_div(thresh, 100); 2860 do_div(thresh, 100);
2861 if (thresh <= meta_sinfo->bytes_delalloc)
2862 meta_sinfo->force_delalloc = 1;
2863 else
2864 meta_sinfo->force_delalloc = 0;
2865}
2793 2866
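A quick standalone check of the force_delalloc test above: once delalloc reservations reach 80% of the metadata space not already claimed by the other counters, new reservations start forcing writeback. The byte figures below are invented for illustration.

/* sketch of check_force_delalloc(): made-up sizes, same comparison */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t total     = 1024ULL << 20;   /* 1 GiB of metadata space */
        uint64_t committed = 300ULL  << 20;   /* used+reserved+pinned+readonly+super+root+may_use */
        uint64_t delalloc  = 600ULL  << 20;   /* outstanding delalloc reservations */

        uint64_t thresh = (total - committed) * 80 / 100;

        printf("threshold %llu MiB, delalloc %llu MiB -> force_delalloc=%d\n",
               (unsigned long long)(thresh >> 20),
               (unsigned long long)(delalloc >> 20),
               delalloc >= thresh);
        return 0;
}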
2794 if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2867struct async_flush {
2795 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2868 struct btrfs_root *root;
2796 meta_sinfo->bytes_super > thresh) { 2869 struct btrfs_space_info *info;
2797 struct btrfs_trans_handle *trans; 2870 struct btrfs_work work;
2798 if (!meta_sinfo->full) { 2871};
2799 meta_sinfo->force_alloc = 1; 2872
2873static noinline void flush_delalloc_async(struct btrfs_work *work)
2874{
2875 struct async_flush *async;
2876 struct btrfs_root *root;
2877 struct btrfs_space_info *info;
2878
2879 async = container_of(work, struct async_flush, work);
2880 root = async->root;
2881 info = async->info;
2882
2883 btrfs_start_delalloc_inodes(root);
2884 wake_up(&info->flush_wait);
2885 btrfs_wait_ordered_extents(root, 0);
2886
2887 spin_lock(&info->lock);
2888 info->flushing = 0;
2889 spin_unlock(&info->lock);
2890 wake_up(&info->flush_wait);
2891
2892 kfree(async);
2893}
2894
2895static void wait_on_flush(struct btrfs_space_info *info)
2896{
2897 DEFINE_WAIT(wait);
2898 u64 used;
2899
2900 while (1) {
2901 prepare_to_wait(&info->flush_wait, &wait,
2902 TASK_UNINTERRUPTIBLE);
2903 spin_lock(&info->lock);
2904 if (!info->flushing) {
2905 spin_unlock(&info->lock);
2906 break;
2907 }
2908
2909 used = info->bytes_used + info->bytes_reserved +
2910 info->bytes_pinned + info->bytes_readonly +
2911 info->bytes_super + info->bytes_root +
2912 info->bytes_may_use + info->bytes_delalloc;
2913 if (used < info->total_bytes) {
2914 spin_unlock(&info->lock);
2915 break;
2916 }
2917 spin_unlock(&info->lock);
2918 schedule();
2919 }
2920 finish_wait(&info->flush_wait, &wait);
2921}
2922
2923static void flush_delalloc(struct btrfs_root *root,
2924 struct btrfs_space_info *info)
2925{
2926 struct async_flush *async;
2927 bool wait = false;
2928
2929 spin_lock(&info->lock);
2930
2931 if (!info->flushing) {
2932 info->flushing = 1;
2933 init_waitqueue_head(&info->flush_wait);
2934 } else {
2935 wait = true;
2936 }
2937
2938 spin_unlock(&info->lock);
2939
2940 if (wait) {
2941 wait_on_flush(info);
2942 return;
2943 }
2944
2945 async = kzalloc(sizeof(*async), GFP_NOFS);
2946 if (!async)
2947 goto flush;
2948
2949 async->root = root;
2950 async->info = info;
2951 async->work.func = flush_delalloc_async;
2952
2953 btrfs_queue_worker(&root->fs_info->enospc_workers,
2954 &async->work);
2955 wait_on_flush(info);
2956 return;
2957
2958flush:
2959 btrfs_start_delalloc_inodes(root);
2960 btrfs_wait_ordered_extents(root, 0);
2961
2962 spin_lock(&info->lock);
2963 info->flushing = 0;
2964 spin_unlock(&info->lock);
2965 wake_up(&info->flush_wait);
2966}
2967
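The flush_delalloc()/wait_on_flush() pair above is a single-flusher handshake: the first caller performs (or queues) the flush, every later caller just sleeps until the flushing flag clears. A minimal userspace analogue with pthreads, sketching only the synchronization shape; it drops the async worker and the early exit wait_on_flush() takes once enough space has been freed.

/* userspace sketch of the flush/wait handshake, not btrfs code */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  flush_wait = PTHREAD_COND_INITIALIZER;
static int flushing;

static void do_flush_work(void)
{
        sleep(1);                      /* stand-in for writing out delalloc */
}

static void flush_delalloc_analogue(void)
{
        pthread_mutex_lock(&lock);
        if (flushing) {                /* someone else is already flushing */
                while (flushing)
                        pthread_cond_wait(&flush_wait, &lock);
                pthread_mutex_unlock(&lock);
                return;
        }
        flushing = 1;                  /* we become the flusher */
        pthread_mutex_unlock(&lock);

        do_flush_work();

        pthread_mutex_lock(&lock);
        flushing = 0;
        pthread_cond_broadcast(&flush_wait);   /* wake every waiter */
        pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
        flush_delalloc_analogue();
        printf("thread %ld saw flush complete\n", (long)arg);
        return NULL;
}

int main(void)
{
        pthread_t t[3];
        for (long i = 0; i < 3; i++)
                pthread_create(&t[i], NULL, worker, (void *)i);
        for (int i = 0; i < 3; i++)
                pthread_join(t[i], NULL);
        return 0;
}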
2968static int maybe_allocate_chunk(struct btrfs_root *root,
2969 struct btrfs_space_info *info)
2970{
2971 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2972 struct btrfs_trans_handle *trans;
2973 bool wait = false;
2974 int ret = 0;
2975 u64 min_metadata;
2976 u64 free_space;
2977
2978 free_space = btrfs_super_total_bytes(disk_super);
2979 /*
2980 * we allow the metadata to grow to a max of either 10gb or 5% of the
2981 * space in the volume.
2982 */
2983 min_metadata = min((u64)10 * 1024 * 1024 * 1024,
2984 div64_u64(free_space * 5, 100));
2985 if (info->total_bytes >= min_metadata) {
2986 spin_unlock(&info->lock);
2987 return 0;
2988 }
2989
2990 if (info->full) {
2991 spin_unlock(&info->lock);
2992 return 0;
2993 }
2994
2995 if (!info->allocating_chunk) {
2996 info->force_alloc = 1;
2997 info->allocating_chunk = 1;
2998 init_waitqueue_head(&info->allocate_wait);
2999 } else {
3000 wait = true;
3001 }
3002
3003 spin_unlock(&info->lock);
3004
3005 if (wait) {
3006 wait_event(info->allocate_wait,
3007 !info->allocating_chunk);
3008 return 1;
3009 }
3010
3011 trans = btrfs_start_transaction(root, 1);
3012 if (!trans) {
3013 ret = -ENOMEM;
3014 goto out;
3015 }
3016
3017 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3018 4096 + 2 * 1024 * 1024,
3019 info->flags, 0);
3020 btrfs_end_transaction(trans, root);
3021 if (ret)
3022 goto out;
3023out:
3024 spin_lock(&info->lock);
3025 info->allocating_chunk = 0;
3026 spin_unlock(&info->lock);
3027 wake_up(&info->allocate_wait);
3028
3029 if (ret)
3030 return 0;
3031 return 1;
3032}
3033
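maybe_allocate_chunk() above caps metadata growth at the smaller of 10 GiB or 5% of the volume before it will allocate another chunk. A small standalone check of that cap for a few hypothetical volume sizes:

/* sketch of the min_metadata cap; sample volume sizes are arbitrary */
#include <stdio.h>
#include <stdint.h>

static uint64_t min_metadata(uint64_t volume_bytes)
{
        uint64_t cap = 10ULL * 1024 * 1024 * 1024;   /* 10 GiB hard cap */
        uint64_t five_percent = volume_bytes * 5 / 100;
        return five_percent < cap ? five_percent : cap;
}

int main(void)
{
        uint64_t sizes[] = { 50ULL << 30, 500ULL << 30, 4ULL << 40 };
        for (int i = 0; i < 3; i++)
                printf("%5llu GiB volume -> metadata may grow to %llu GiB\n",
                       (unsigned long long)(sizes[i] >> 30),
                       (unsigned long long)(min_metadata(sizes[i]) >> 30));
        return 0;
}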
3034/*
3035 * Reserve metadata space for delalloc.
3036 */
3037int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3038 struct inode *inode, int num_items)
3039{
3040 struct btrfs_fs_info *info = root->fs_info;
3041 struct btrfs_space_info *meta_sinfo;
3042 u64 num_bytes;
3043 u64 used;
3044 u64 alloc_target;
3045 int flushed = 0;
3046 int force_delalloc;
3047
3048 /* get the space info for where the metadata will live */
3049 alloc_target = btrfs_get_alloc_profile(root, 0);
3050 meta_sinfo = __find_space_info(info, alloc_target);
3051
3052 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3053 num_items);
3054again:
3055 spin_lock(&meta_sinfo->lock);
3056
3057 force_delalloc = meta_sinfo->force_delalloc;
3058
3059 if (unlikely(!meta_sinfo->bytes_root))
3060 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3061
3062 if (!flushed)
3063 meta_sinfo->bytes_delalloc += num_bytes;
3064
3065 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3066 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3067 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3068 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3069
3070 if (used > meta_sinfo->total_bytes) {
3071 flushed++;
3072
3073 if (flushed == 1) {
3074 if (maybe_allocate_chunk(root, meta_sinfo))
3075 goto again;
3076 flushed++;
3077 } else {
2800 spin_unlock(&meta_sinfo->lock); 3078 spin_unlock(&meta_sinfo->lock);
2801alloc: 3079 }
2802 trans = btrfs_start_transaction(root, 1);
2803 if (!trans)
2804 return -ENOMEM;
2805 3080
2806 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3081 if (flushed == 2) {
2807 2 * 1024 * 1024, alloc_target, 0); 3082 filemap_flush(inode->i_mapping);
2808 btrfs_end_transaction(trans, root); 3083 goto again;
2809 if (!meta_sinfo) { 3084 } else if (flushed == 3) {
2810 meta_sinfo = __find_space_info(info, 3085 flush_delalloc(root, meta_sinfo);
2811 alloc_target);
2812 }
2813 goto again; 3086 goto again;
2814 } 3087 }
3088 spin_lock(&meta_sinfo->lock);
3089 meta_sinfo->bytes_delalloc -= num_bytes;
2815 spin_unlock(&meta_sinfo->lock); 3090 spin_unlock(&meta_sinfo->lock);
3091 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3092 BTRFS_I(inode)->outstanding_extents,
3093 BTRFS_I(inode)->reserved_extents);
3094 dump_space_info(meta_sinfo, 0, 0);
3095 return -ENOSPC;
3096 }
2816 3097
2817 if (!committed) { 3098 BTRFS_I(inode)->reserved_extents++;
2818 committed = 1; 3099 check_force_delalloc(meta_sinfo);
2819 trans = btrfs_join_transaction(root, 1); 3100 spin_unlock(&meta_sinfo->lock);
2820 if (!trans) 3101
2821 return -ENOMEM; 3102 if (!flushed && force_delalloc)
2822 ret = btrfs_commit_transaction(trans, root); 3103 filemap_flush(inode->i_mapping);
2823 if (ret) 3104
2824 return ret; 3105 return 0;
3106}
3107
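The delalloc reservation path above escalates through progressively heavier recovery steps before giving up with -ENOSPC: try to allocate a metadata chunk, then flush just this inode's dirty pages, then flush all delalloc and ordered extents. A compact standalone sketch of that ladder, with stub helpers standing in for the real allocation and flushing calls:

/* sketch of the flushed == 1/2/3 escalation; stubs replace real btrfs calls */
#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

static bool fits(void)               { return false; }  /* pretend space never frees up */
static bool allocate_chunk(void)     { puts("step 1: allocate a metadata chunk"); return true; }
static void flush_one_inode(void)    { puts("step 2: flush this inode's dirty pages"); }
static void flush_all_delalloc(void) { puts("step 3: flush all delalloc + ordered extents"); }

static int reserve_metadata(void)
{
        int flushed = 0;

        while (!fits()) {
                flushed++;
                if (flushed == 1) {
                        if (allocate_chunk())
                                continue;          /* retry with the new chunk */
                        flushed++;                 /* chunk alloc skipped, fall through */
                }
                if (flushed == 2) {
                        flush_one_inode();
                        continue;
                }
                if (flushed == 3) {
                        flush_all_delalloc();
                        continue;
                }
                return -ENOSPC;                    /* nothing left to try */
        }
        return 0;
}

int main(void)
{
        printf("reserve_metadata() -> %d\n", reserve_metadata());
        return 0;
}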
3108/*
3109 * unreserve num_items number of items worth of metadata space. This needs to
3110 * be paired with btrfs_reserve_metadata_space.
3111 *
3112 * NOTE: if you have the option, run this _AFTER_ you do a
3113 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3114 * operations which will result in more used metadata, so we want to make sure we
3115 * can do that without issue.
3116 */
3117int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3118{
3119 struct btrfs_fs_info *info = root->fs_info;
3120 struct btrfs_space_info *meta_sinfo;
3121 u64 num_bytes;
3122 u64 alloc_target;
3123 bool bug = false;
3124
3125 /* get the space info for where the metadata will live */
3126 alloc_target = btrfs_get_alloc_profile(root, 0);
3127 meta_sinfo = __find_space_info(info, alloc_target);
3128
3129 num_bytes = calculate_bytes_needed(root, num_items);
3130
3131 spin_lock(&meta_sinfo->lock);
3132 if (meta_sinfo->bytes_may_use < num_bytes) {
3133 bug = true;
3134 meta_sinfo->bytes_may_use = 0;
3135 } else {
3136 meta_sinfo->bytes_may_use -= num_bytes;
3137 }
3138 spin_unlock(&meta_sinfo->lock);
3139
3140 BUG_ON(bug);
3141
3142 return 0;
3143}
3144
3145/*
3146 * Reserve some metadata space for use. We'll calculate the worst case number
3147 * of bytes that would be needed to modify num_items number of items. If we
3148 * have space, fantastic, if not, you get -ENOSPC. Please call
3149 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3150 * items you reserved, since whatever metadata you needed should have already
3151 * been allocated.
3152 *
3153 * This will commit the transaction to make more space if we don't have enough
3154 * metadata space. The only time we don't do this is if we're reserving space
3155 * inside of a transaction, then we will just return -ENOSPC and it is the
3156 * caller's responsibility to handle it properly.
3157 */
3158int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3159{
3160 struct btrfs_fs_info *info = root->fs_info;
3161 struct btrfs_space_info *meta_sinfo;
3162 u64 num_bytes;
3163 u64 used;
3164 u64 alloc_target;
3165 int retries = 0;
3166
3167 /* get the space info for where the metadata will live */
3168 alloc_target = btrfs_get_alloc_profile(root, 0);
3169 meta_sinfo = __find_space_info(info, alloc_target);
3170
3171 num_bytes = calculate_bytes_needed(root, num_items);
3172again:
3173 spin_lock(&meta_sinfo->lock);
3174
3175 if (unlikely(!meta_sinfo->bytes_root))
3176 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3177
3178 if (!retries)
3179 meta_sinfo->bytes_may_use += num_bytes;
3180
3181 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3182 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3183 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3184 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3185
3186 if (used > meta_sinfo->total_bytes) {
3187 retries++;
3188 if (retries == 1) {
3189 if (maybe_allocate_chunk(root, meta_sinfo))
3190 goto again;
3191 retries++;
3192 } else {
3193 spin_unlock(&meta_sinfo->lock);
3194 }
3195
3196 if (retries == 2) {
3197 flush_delalloc(root, meta_sinfo);
2825 goto again; 3198 goto again;
2826 } 3199 }
3200 spin_lock(&meta_sinfo->lock);
3201 meta_sinfo->bytes_may_use -= num_bytes;
3202 spin_unlock(&meta_sinfo->lock);
3203
3204 dump_space_info(meta_sinfo, 0, 0);
2827 return -ENOSPC; 3205 return -ENOSPC;
2828 } 3206 }
3207
3208 check_force_delalloc(meta_sinfo);
2829 spin_unlock(&meta_sinfo->lock); 3209 spin_unlock(&meta_sinfo->lock);
2830 3210
2831 return 0; 3211 return 0;
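The comment above stresses that every btrfs_reserve_metadata_space() call must be paired with a btrfs_unreserve_metadata_space() for the same item count, preferably after btrfs_end_transaction() so delayed refs have already taken what they need. A toy standalone model of the bytes_may_use bookkeeping that pairing protects; the per-item figure and the assert are stand-ins for the real counters and the BUG_ON in the patch.

/* toy model of paired reserve/unreserve accounting, not btrfs code */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t bytes_may_use;             /* models meta_sinfo->bytes_may_use */

static void reserve_items(uint64_t per_item_bytes, int items)
{
        bytes_may_use += per_item_bytes * items;
}

static void unreserve_items(uint64_t per_item_bytes, int items)
{
        uint64_t bytes = per_item_bytes * items;

        assert(bytes_may_use >= bytes);    /* mirrors the BUG_ON(bug) above */
        bytes_may_use -= bytes;
}

int main(void)
{
        uint64_t per_item = 479232;        /* worst case for one fs-tree item, 4 KiB blocks */

        reserve_items(per_item, 3);        /* before starting the transaction */
        /* ... modify the items, end the transaction ... */
        unreserve_items(per_item, 3);      /* paired release, same item count */

        printf("bytes_may_use after paired calls: %llu\n",
               (unsigned long long)bytes_may_use);
        return 0;
}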
@@ -2888,7 +3268,7 @@ alloc:
2888 spin_unlock(&data_sinfo->lock); 3268 spin_unlock(&data_sinfo->lock);
2889 3269
2890 /* commit the current transaction and try again */ 3270 /* commit the current transaction and try again */
2891 if (!committed) { 3271 if (!committed && !root->fs_info->open_ioctl_trans) {
2892 committed = 1; 3272 committed = 1;
2893 trans = btrfs_join_transaction(root, 1); 3273 trans = btrfs_join_transaction(root, 1);
2894 if (!trans) 3274 if (!trans)
@@ -2916,7 +3296,7 @@ alloc:
2916 BTRFS_I(inode)->reserved_bytes += bytes; 3296 BTRFS_I(inode)->reserved_bytes += bytes;
2917 spin_unlock(&data_sinfo->lock); 3297 spin_unlock(&data_sinfo->lock);
2918 3298
2919 return btrfs_check_metadata_free_space(root); 3299 return 0;
2920} 3300}
2921 3301
2922/* 3302/*
@@ -3015,17 +3395,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3015 BUG_ON(!space_info); 3395 BUG_ON(!space_info);
3016 3396
3017 spin_lock(&space_info->lock); 3397 spin_lock(&space_info->lock);
3018 if (space_info->force_alloc) { 3398 if (space_info->force_alloc)
3019 force = 1; 3399 force = 1;
3020 space_info->force_alloc = 0;
3021 }
3022 if (space_info->full) { 3400 if (space_info->full) {
3023 spin_unlock(&space_info->lock); 3401 spin_unlock(&space_info->lock);
3024 goto out; 3402 goto out;
3025 } 3403 }
3026 3404
3027 thresh = space_info->total_bytes - space_info->bytes_readonly; 3405 thresh = space_info->total_bytes - space_info->bytes_readonly;
3028 thresh = div_factor(thresh, 6); 3406 thresh = div_factor(thresh, 8);
3029 if (!force && 3407 if (!force &&
3030 (space_info->bytes_used + space_info->bytes_pinned + 3408 (space_info->bytes_used + space_info->bytes_pinned +
3031 space_info->bytes_reserved + alloc_bytes) < thresh) { 3409 space_info->bytes_reserved + alloc_bytes) < thresh) {
@@ -3039,7 +3417,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3039 * we keep a reasonable number of metadata chunks allocated in the 3417 * we keep a reasonable number of metadata chunks allocated in the
3040 * FS as well. 3418 * FS as well.
3041 */ 3419 */
3042 if (flags & BTRFS_BLOCK_GROUP_DATA) { 3420 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3043 fs_info->data_chunk_allocations++; 3421 fs_info->data_chunk_allocations++;
3044 if (!(fs_info->data_chunk_allocations % 3422 if (!(fs_info->data_chunk_allocations %
3045 fs_info->metadata_ratio)) 3423 fs_info->metadata_ratio))
@@ -3047,8 +3425,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3047 } 3425 }
3048 3426
3049 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3427 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3428 spin_lock(&space_info->lock);
3050 if (ret) 3429 if (ret)
3051 space_info->full = 1; 3430 space_info->full = 1;
3431 space_info->force_alloc = 0;
3432 spin_unlock(&space_info->lock);
3052out: 3433out:
3053 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3434 mutex_unlock(&extent_root->fs_info->chunk_mutex);
3054 return ret; 3435 return ret;
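The do_chunk_alloc() hunks above also raise the pre-allocation threshold from div_factor(..., 6) to div_factor(..., 8), so a new chunk is only considered once roughly 80% (rather than 60%) of the writable space in the existing chunks is committed. A small standalone check, assuming div_factor(num, factor) behaves as num * factor / 10, which is its usual btrfs definition:

/* sketch of the raised chunk-allocation threshold; sizes are invented */
#include <stdio.h>
#include <stdint.h>

static uint64_t div_factor(uint64_t num, int factor)
{
        return num * factor / 10;          /* assumed btrfs helper semantics */
}

int main(void)
{
        uint64_t total     = 100ULL << 30; /* 100 GiB in this profile */
        uint64_t readonly  = 0;
        uint64_t committed = 75ULL << 30;  /* used + pinned + reserved */
        uint64_t alloc     = 2ULL << 20;   /* allocation being attempted */

        uint64_t old_thresh = div_factor(total - readonly, 6);  /* 60 GiB */
        uint64_t new_thresh = div_factor(total - readonly, 8);  /* 80 GiB */

        printf("old threshold: chunk alloc %s\n",
               committed + alloc < old_thresh ? "skipped" : "attempted");
        printf("new threshold: chunk alloc %s\n",
               committed + alloc < new_thresh ? "skipped" : "attempted");
        return 0;
}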
@@ -3306,6 +3687,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
3306 if (is_data) 3687 if (is_data)
3307 goto pinit; 3688 goto pinit;
3308 3689
3690 /*
3691 * discard is sloooow, and so triggering discards on
3692 * individual btree blocks isn't a good plan. Just
3693 * pin everything in discard mode.
3694 */
3695 if (btrfs_test_opt(root, DISCARD))
3696 goto pinit;
3697
3309 buf = btrfs_find_tree_block(root, bytenr, num_bytes); 3698 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3310 if (!buf) 3699 if (!buf)
3311 goto pinit; 3700 goto pinit;
@@ -3713,7 +4102,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
3713} 4102}
3714 4103
3715enum btrfs_loop_type { 4104enum btrfs_loop_type {
3716 LOOP_CACHED_ONLY = 0, 4105 LOOP_FIND_IDEAL = 0,
3717 LOOP_CACHING_NOWAIT = 1, 4106 LOOP_CACHING_NOWAIT = 1,
3718 LOOP_CACHING_WAIT = 2, 4107 LOOP_CACHING_WAIT = 2,
3719 LOOP_ALLOC_CHUNK = 3, 4108 LOOP_ALLOC_CHUNK = 3,
@@ -3742,11 +4131,15 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3742 struct btrfs_block_group_cache *block_group = NULL; 4131 struct btrfs_block_group_cache *block_group = NULL;
3743 int empty_cluster = 2 * 1024 * 1024; 4132 int empty_cluster = 2 * 1024 * 1024;
3744 int allowed_chunk_alloc = 0; 4133 int allowed_chunk_alloc = 0;
4134 int done_chunk_alloc = 0;
3745 struct btrfs_space_info *space_info; 4135 struct btrfs_space_info *space_info;
3746 int last_ptr_loop = 0; 4136 int last_ptr_loop = 0;
3747 int loop = 0; 4137 int loop = 0;
3748 bool found_uncached_bg = false; 4138 bool found_uncached_bg = false;
3749 bool failed_cluster_refill = false; 4139 bool failed_cluster_refill = false;
4140 bool failed_alloc = false;
4141 u64 ideal_cache_percent = 0;
4142 u64 ideal_cache_offset = 0;
3750 4143
3751 WARN_ON(num_bytes < root->sectorsize); 4144 WARN_ON(num_bytes < root->sectorsize);
3752 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 4145 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3782,14 +4175,19 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3782 empty_cluster = 0; 4175 empty_cluster = 0;
3783 4176
3784 if (search_start == hint_byte) { 4177 if (search_start == hint_byte) {
4178ideal_cache:
3785 block_group = btrfs_lookup_block_group(root->fs_info, 4179 block_group = btrfs_lookup_block_group(root->fs_info,
3786 search_start); 4180 search_start);
3787 /* 4181 /*
3788 * we don't want to use the block group if it doesn't match our 4182 * we don't want to use the block group if it doesn't match our
3789 * allocation bits, or if it's not cached. 4183 * allocation bits, or if it's not cached.
4184 *
4185 * However if we are re-searching with an ideal block group
4186 * picked out then we don't care that the block group is cached.
3790 */ 4187 */
3791 if (block_group && block_group_bits(block_group, data) && 4188 if (block_group && block_group_bits(block_group, data) &&
3792 block_group_cache_done(block_group)) { 4189 (block_group->cached != BTRFS_CACHE_NO ||
4190 search_start == ideal_cache_offset)) {
3793 down_read(&space_info->groups_sem); 4191 down_read(&space_info->groups_sem);
3794 if (list_empty(&block_group->list) || 4192 if (list_empty(&block_group->list) ||
3795 block_group->ro) { 4193 block_group->ro) {
@@ -3801,13 +4199,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3801 */ 4199 */
3802 btrfs_put_block_group(block_group); 4200 btrfs_put_block_group(block_group);
3803 up_read(&space_info->groups_sem); 4201 up_read(&space_info->groups_sem);
3804 } else 4202 } else {
3805 goto have_block_group; 4203 goto have_block_group;
4204 }
3806 } else if (block_group) { 4205 } else if (block_group) {
3807 btrfs_put_block_group(block_group); 4206 btrfs_put_block_group(block_group);
3808 } 4207 }
3809 } 4208 }
3810
3811search: 4209search:
3812 down_read(&space_info->groups_sem); 4210 down_read(&space_info->groups_sem);
3813 list_for_each_entry(block_group, &space_info->block_groups, list) { 4211 list_for_each_entry(block_group, &space_info->block_groups, list) {
@@ -3819,28 +4217,45 @@ search:
3819 4217
3820have_block_group: 4218have_block_group:
3821 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4219 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4220 u64 free_percent;
4221
4222 free_percent = btrfs_block_group_used(&block_group->item);
4223 free_percent *= 100;
4224 free_percent = div64_u64(free_percent,
4225 block_group->key.offset);
4226 free_percent = 100 - free_percent;
4227 if (free_percent > ideal_cache_percent &&
4228 likely(!block_group->ro)) {
4229 ideal_cache_offset = block_group->key.objectid;
4230 ideal_cache_percent = free_percent;
4231 }
4232
3822 /* 4233 /*
3823 * we want to start caching kthreads, but not too many 4234 * We only want to start kthread caching if we are at
3824 * right off the bat so we don't overwhelm the system, 4235 * the point where we will wait for caching to make
3825 * so only start them if there are less than 2 and we're 4236 * progress, or if our ideal search is over and we've
3826 * in the initial allocation phase. 4237 * found somebody to start caching.
3827 */ 4238 */
3828 if (loop > LOOP_CACHING_NOWAIT || 4239 if (loop > LOOP_CACHING_NOWAIT ||
3829 atomic_read(&space_info->caching_threads) < 2) { 4240 (loop > LOOP_FIND_IDEAL &&
4241 atomic_read(&space_info->caching_threads) < 2)) {
3830 ret = cache_block_group(block_group); 4242 ret = cache_block_group(block_group);
3831 BUG_ON(ret); 4243 BUG_ON(ret);
3832 } 4244 }
3833 }
3834
3835 cached = block_group_cache_done(block_group);
3836 if (unlikely(!cached)) {
3837 found_uncached_bg = true; 4245 found_uncached_bg = true;
3838 4246
3839 /* if we only want cached bgs, loop */ 4247 /*
3840 if (loop == LOOP_CACHED_ONLY) 4248 * If loop is set for cached only, try the next block
4249 * group.
4250 */
4251 if (loop == LOOP_FIND_IDEAL)
3841 goto loop; 4252 goto loop;
3842 } 4253 }
3843 4254
4255 cached = block_group_cache_done(block_group);
4256 if (unlikely(!cached))
4257 found_uncached_bg = true;
4258
3844 if (unlikely(block_group->ro)) 4259 if (unlikely(block_group->ro))
3845 goto loop; 4260 goto loop;
3846 4261
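When an uncached block group is encountered, the new code above remembers the candidate with the largest free percentage as the ideal group to cache on a later pass. The percentage arithmetic in standalone form, with invented block group sizes and usage:

/* sketch of the free_percent calculation used to pick the ideal caching candidate */
#include <stdio.h>
#include <stdint.h>

static uint64_t free_percent(uint64_t used, uint64_t size)
{
        return 100 - used * 100 / size;
}

int main(void)
{
        uint64_t size = 1ULL << 30;        /* two 1 GiB candidate block groups */

        printf("bg A: %llu%% free\n",
               (unsigned long long)free_percent(900ULL << 20, size));
        printf("bg B: %llu%% free\n",
               (unsigned long long)free_percent(200ULL << 20, size));
        /* bg B has the higher free percentage and would be kept as the ideal candidate */
        return 0;
}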
@@ -3951,14 +4366,23 @@ refill_cluster:
3951 4366
3952 offset = btrfs_find_space_for_alloc(block_group, search_start, 4367 offset = btrfs_find_space_for_alloc(block_group, search_start,
3953 num_bytes, empty_size); 4368 num_bytes, empty_size);
3954 if (!offset && (cached || (!cached && 4369 /*
3955 loop == LOOP_CACHING_NOWAIT))) { 4370 * If we didn't find a chunk, and we haven't failed on this
3956 goto loop; 4371 * block group before, and this block group is in the middle of
3957 } else if (!offset && (!cached && 4372 * caching and we are ok with waiting, then go ahead and wait
3958 loop > LOOP_CACHING_NOWAIT)) { 4373 * for progress to be made, and set failed_alloc to true.
4374 *
4375 * If failed_alloc is true then we've already waited on this
4376 * block group once and should move on to the next block group.
4377 */
4378 if (!offset && !failed_alloc && !cached &&
4379 loop > LOOP_CACHING_NOWAIT) {
3959 wait_block_group_cache_progress(block_group, 4380 wait_block_group_cache_progress(block_group,
3960 num_bytes + empty_size); 4381 num_bytes + empty_size);
4382 failed_alloc = true;
3961 goto have_block_group; 4383 goto have_block_group;
4384 } else if (!offset) {
4385 goto loop;
3962 } 4386 }
3963checks: 4387checks:
3964 search_start = stripe_align(root, offset); 4388 search_start = stripe_align(root, offset);
@@ -4006,13 +4430,16 @@ checks:
4006 break; 4430 break;
4007loop: 4431loop:
4008 failed_cluster_refill = false; 4432 failed_cluster_refill = false;
4433 failed_alloc = false;
4009 btrfs_put_block_group(block_group); 4434 btrfs_put_block_group(block_group);
4010 } 4435 }
4011 up_read(&space_info->groups_sem); 4436 up_read(&space_info->groups_sem);
4012 4437
4013 /* LOOP_CACHED_ONLY, only search fully cached block groups 4438 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
4014 * LOOP_CACHING_NOWAIT, search partially cached block groups, but 4439 * them to make caching progress. Also
4015 * dont wait foR them to finish caching 4440 * determine the best possible bg to cache
4441 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
4442 * caching kthreads as we move along
4016 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 4443 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
4017 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 4444 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
4018 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 4445 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
@@ -4021,12 +4448,47 @@ loop:
4021 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4448 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4022 (found_uncached_bg || empty_size || empty_cluster || 4449 (found_uncached_bg || empty_size || empty_cluster ||
4023 allowed_chunk_alloc)) { 4450 allowed_chunk_alloc)) {
4024 if (found_uncached_bg) { 4451 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4025 found_uncached_bg = false; 4452 found_uncached_bg = false;
4026 if (loop < LOOP_CACHING_WAIT) { 4453 loop++;
4027 loop++; 4454 if (!ideal_cache_percent &&
4455 atomic_read(&space_info->caching_threads))
4028 goto search; 4456 goto search;
4029 } 4457
4458 /*
4459 * 1 of the following 2 things have happened so far
4460 *
4461 * 1) We found an ideal block group for caching that
4462 * is mostly full and will cache quickly, so we might
4463 * as well wait for it.
4464 *
4465 * 2) We searched for cached only and we didn't find
4466 * anything, and we didn't start any caching kthreads
4467 * either, so chances are we will loop through and
4468 * start a couple caching kthreads, and then come back
4469 * around and just wait for them. This will be slower
4470 * because we will have 2 caching kthreads reading at
4471 * the same time when we could have just started one
4472 * and waited for it to get far enough to give us an
4473 * allocation, so go ahead and go to the wait caching
4474 * loop.
4475 */
4476 loop = LOOP_CACHING_WAIT;
4477 search_start = ideal_cache_offset;
4478 ideal_cache_percent = 0;
4479 goto ideal_cache;
4480 } else if (loop == LOOP_FIND_IDEAL) {
4481 /*
4482 * Didn't find an uncached bg, wait on anything we find
4483 * next.
4484 */
4485 loop = LOOP_CACHING_WAIT;
4486 goto search;
4487 }
4488
4489 if (loop < LOOP_CACHING_WAIT) {
4490 loop++;
4491 goto search;
4030 } 4492 }
4031 4493
4032 if (loop == LOOP_ALLOC_CHUNK) { 4494 if (loop == LOOP_ALLOC_CHUNK) {
@@ -4038,7 +4500,8 @@ loop:
4038 ret = do_chunk_alloc(trans, root, num_bytes + 4500 ret = do_chunk_alloc(trans, root, num_bytes +
4039 2 * 1024 * 1024, data, 1); 4501 2 * 1024 * 1024, data, 1);
4040 allowed_chunk_alloc = 0; 4502 allowed_chunk_alloc = 0;
4041 } else { 4503 done_chunk_alloc = 1;
4504 } else if (!done_chunk_alloc) {
4042 space_info->force_alloc = 1; 4505 space_info->force_alloc = 1;
4043 } 4506 }
4044 4507
@@ -4063,21 +4526,32 @@ loop:
4063 return ret; 4526 return ret;
4064} 4527}
4065 4528
4066static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 4529static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4530 int dump_block_groups)
4067{ 4531{
4068 struct btrfs_block_group_cache *cache; 4532 struct btrfs_block_group_cache *cache;
4069 4533
4534 spin_lock(&info->lock);
4070 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4535 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4071 (unsigned long long)(info->total_bytes - info->bytes_used - 4536 (unsigned long long)(info->total_bytes - info->bytes_used -
4072 info->bytes_pinned - info->bytes_reserved), 4537 info->bytes_pinned - info->bytes_reserved -
4538 info->bytes_super),
4073 (info->full) ? "" : "not "); 4539 (info->full) ? "" : "not ");
4074 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4540 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
4075 " may_use=%llu, used=%llu\n", 4541 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
4542 "\n",
4076 (unsigned long long)info->total_bytes, 4543 (unsigned long long)info->total_bytes,
4077 (unsigned long long)info->bytes_pinned, 4544 (unsigned long long)info->bytes_pinned,
4078 (unsigned long long)info->bytes_delalloc, 4545 (unsigned long long)info->bytes_delalloc,
4079 (unsigned long long)info->bytes_may_use, 4546 (unsigned long long)info->bytes_may_use,
4080 (unsigned long long)info->bytes_used); 4547 (unsigned long long)info->bytes_used,
4548 (unsigned long long)info->bytes_root,
4549 (unsigned long long)info->bytes_super,
4550 (unsigned long long)info->bytes_reserved);
4551 spin_unlock(&info->lock);
4552
4553 if (!dump_block_groups)
4554 return;
4081 4555
4082 down_read(&info->groups_sem); 4556 down_read(&info->groups_sem);
4083 list_for_each_entry(cache, &info->block_groups, list) { 4557 list_for_each_entry(cache, &info->block_groups, list) {
@@ -4145,7 +4619,7 @@ again:
4145 printk(KERN_ERR "btrfs allocation failed flags %llu, " 4619 printk(KERN_ERR "btrfs allocation failed flags %llu, "
4146 "wanted %llu\n", (unsigned long long)data, 4620 "wanted %llu\n", (unsigned long long)data,
4147 (unsigned long long)num_bytes); 4621 (unsigned long long)num_bytes);
4148 dump_space_info(sinfo, num_bytes); 4622 dump_space_info(sinfo, num_bytes, 1);
4149 } 4623 }
4150 4624
4151 return ret; 4625 return ret;
@@ -4506,6 +4980,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4506 u64 bytenr; 4980 u64 bytenr;
4507 u64 generation; 4981 u64 generation;
4508 u64 refs; 4982 u64 refs;
4983 u64 flags;
4509 u64 last = 0; 4984 u64 last = 0;
4510 u32 nritems; 4985 u32 nritems;
4511 u32 blocksize; 4986 u32 blocksize;
@@ -4543,15 +5018,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4543 generation <= root->root_key.offset) 5018 generation <= root->root_key.offset)
4544 continue; 5019 continue;
4545 5020
5021 /* We don't lock the tree block, it's OK to be racy here */
5022 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5023 &refs, &flags);
5024 BUG_ON(ret);
5025 BUG_ON(refs == 0);
5026
4546 if (wc->stage == DROP_REFERENCE) { 5027 if (wc->stage == DROP_REFERENCE) {
4547 ret = btrfs_lookup_extent_info(trans, root,
4548 bytenr, blocksize,
4549 &refs, NULL);
4550 BUG_ON(ret);
4551 BUG_ON(refs == 0);
4552 if (refs == 1) 5028 if (refs == 1)
4553 goto reada; 5029 goto reada;
4554 5030
5031 if (wc->level == 1 &&
5032 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5033 continue;
4555 if (!wc->update_ref || 5034 if (!wc->update_ref ||
4556 generation <= root->root_key.offset) 5035 generation <= root->root_key.offset)
4557 continue; 5036 continue;
@@ -4560,6 +5039,10 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4560 &wc->update_progress); 5039 &wc->update_progress);
4561 if (ret < 0) 5040 if (ret < 0)
4562 continue; 5041 continue;
5042 } else {
5043 if (wc->level == 1 &&
5044 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5045 continue;
4563 } 5046 }
4564reada: 5047reada:
4565 ret = readahead_tree_block(root, bytenr, blocksize, 5048 ret = readahead_tree_block(root, bytenr, blocksize,
@@ -4583,7 +5066,7 @@ reada:
4583static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 5066static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4584 struct btrfs_root *root, 5067 struct btrfs_root *root,
4585 struct btrfs_path *path, 5068 struct btrfs_path *path,
4586 struct walk_control *wc) 5069 struct walk_control *wc, int lookup_info)
4587{ 5070{
4588 int level = wc->level; 5071 int level = wc->level;
4589 struct extent_buffer *eb = path->nodes[level]; 5072 struct extent_buffer *eb = path->nodes[level];
@@ -4598,8 +5081,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4598 * when reference count of tree block is 1, it won't increase 5081 * when reference count of tree block is 1, it won't increase
4599 * again. once full backref flag is set, we never clear it. 5082 * again. once full backref flag is set, we never clear it.
4600 */ 5083 */
4601 if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 5084 if (lookup_info &&
4602 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { 5085 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5086 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
4603 BUG_ON(!path->locks[level]); 5087 BUG_ON(!path->locks[level]);
4604 ret = btrfs_lookup_extent_info(trans, root, 5088 ret = btrfs_lookup_extent_info(trans, root,
4605 eb->start, eb->len, 5089 eb->start, eb->len,
@@ -4660,7 +5144,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4660static noinline int do_walk_down(struct btrfs_trans_handle *trans, 5144static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4661 struct btrfs_root *root, 5145 struct btrfs_root *root,
4662 struct btrfs_path *path, 5146 struct btrfs_path *path,
4663 struct walk_control *wc) 5147 struct walk_control *wc, int *lookup_info)
4664{ 5148{
4665 u64 bytenr; 5149 u64 bytenr;
4666 u64 generation; 5150 u64 generation;
@@ -4680,8 +5164,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4680 * for the subtree 5164 * for the subtree
4681 */ 5165 */
4682 if (wc->stage == UPDATE_BACKREF && 5166 if (wc->stage == UPDATE_BACKREF &&
4683 generation <= root->root_key.offset) 5167 generation <= root->root_key.offset) {
5168 *lookup_info = 1;
4684 return 1; 5169 return 1;
5170 }
4685 5171
4686 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 5172 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
4687 blocksize = btrfs_level_size(root, level - 1); 5173 blocksize = btrfs_level_size(root, level - 1);
@@ -4694,14 +5180,19 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4694 btrfs_tree_lock(next); 5180 btrfs_tree_lock(next);
4695 btrfs_set_lock_blocking(next); 5181 btrfs_set_lock_blocking(next);
4696 5182
4697 if (wc->stage == DROP_REFERENCE) { 5183 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
4698 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, 5184 &wc->refs[level - 1],
4699 &wc->refs[level - 1], 5185 &wc->flags[level - 1]);
4700 &wc->flags[level - 1]); 5186 BUG_ON(ret);
4701 BUG_ON(ret); 5187 BUG_ON(wc->refs[level - 1] == 0);
4702 BUG_ON(wc->refs[level - 1] == 0); 5188 *lookup_info = 0;
4703 5189
5190 if (wc->stage == DROP_REFERENCE) {
4704 if (wc->refs[level - 1] > 1) { 5191 if (wc->refs[level - 1] > 1) {
5192 if (level == 1 &&
5193 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5194 goto skip;
5195
4705 if (!wc->update_ref || 5196 if (!wc->update_ref ||
4706 generation <= root->root_key.offset) 5197 generation <= root->root_key.offset)
4707 goto skip; 5198 goto skip;
@@ -4715,12 +5206,17 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4715 wc->stage = UPDATE_BACKREF; 5206 wc->stage = UPDATE_BACKREF;
4716 wc->shared_level = level - 1; 5207 wc->shared_level = level - 1;
4717 } 5208 }
5209 } else {
5210 if (level == 1 &&
5211 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5212 goto skip;
4718 } 5213 }
4719 5214
4720 if (!btrfs_buffer_uptodate(next, generation)) { 5215 if (!btrfs_buffer_uptodate(next, generation)) {
4721 btrfs_tree_unlock(next); 5216 btrfs_tree_unlock(next);
4722 free_extent_buffer(next); 5217 free_extent_buffer(next);
4723 next = NULL; 5218 next = NULL;
5219 *lookup_info = 1;
4724 } 5220 }
4725 5221
4726 if (!next) { 5222 if (!next) {
@@ -4743,21 +5239,22 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4743skip: 5239skip:
4744 wc->refs[level - 1] = 0; 5240 wc->refs[level - 1] = 0;
4745 wc->flags[level - 1] = 0; 5241 wc->flags[level - 1] = 0;
5242 if (wc->stage == DROP_REFERENCE) {
5243 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5244 parent = path->nodes[level]->start;
5245 } else {
5246 BUG_ON(root->root_key.objectid !=
5247 btrfs_header_owner(path->nodes[level]));
5248 parent = 0;
5249 }
4746 5250
4747 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 5251 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
4748 parent = path->nodes[level]->start; 5252 root->root_key.objectid, level - 1, 0);
4749 } else { 5253 BUG_ON(ret);
4750 BUG_ON(root->root_key.objectid !=
4751 btrfs_header_owner(path->nodes[level]));
4752 parent = 0;
4753 } 5254 }
4754
4755 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
4756 root->root_key.objectid, level - 1, 0);
4757 BUG_ON(ret);
4758
4759 btrfs_tree_unlock(next); 5255 btrfs_tree_unlock(next);
4760 free_extent_buffer(next); 5256 free_extent_buffer(next);
5257 *lookup_info = 1;
4761 return 1; 5258 return 1;
4762} 5259}
4763 5260
@@ -4871,6 +5368,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4871 struct walk_control *wc) 5368 struct walk_control *wc)
4872{ 5369{
4873 int level = wc->level; 5370 int level = wc->level;
5371 int lookup_info = 1;
4874 int ret; 5372 int ret;
4875 5373
4876 while (level >= 0) { 5374 while (level >= 0) {
@@ -4878,14 +5376,14 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4878 btrfs_header_nritems(path->nodes[level])) 5376 btrfs_header_nritems(path->nodes[level]))
4879 break; 5377 break;
4880 5378
4881 ret = walk_down_proc(trans, root, path, wc); 5379 ret = walk_down_proc(trans, root, path, wc, lookup_info);
4882 if (ret > 0) 5380 if (ret > 0)
4883 break; 5381 break;
4884 5382
4885 if (level == 0) 5383 if (level == 0)
4886 break; 5384 break;
4887 5385
4888 ret = do_walk_down(trans, root, path, wc); 5386 ret = do_walk_down(trans, root, path, wc, &lookup_info);
4889 if (ret > 0) { 5387 if (ret > 0) {
4890 path->slots[level]++; 5388 path->slots[level]++;
4891 continue; 5389 continue;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0cb88f8146e..96577e8bf9f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
280 return NULL; 280 return NULL;
281} 281}
282 282
283static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
284 struct extent_state *other)
285{
286 if (tree->ops && tree->ops->merge_extent_hook)
287 tree->ops->merge_extent_hook(tree->mapping->host, new,
288 other);
289}
290
283/* 291/*
284 * utility function to look for merge candidates inside a given range. 292 * utility function to look for merge candidates inside a given range.
285 * Any extents with matching state are merged together into a single 293 * Any extents with matching state are merged together into a single
@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
303 other = rb_entry(other_node, struct extent_state, rb_node); 311 other = rb_entry(other_node, struct extent_state, rb_node);
304 if (other->end == state->start - 1 && 312 if (other->end == state->start - 1 &&
305 other->state == state->state) { 313 other->state == state->state) {
314 merge_cb(tree, state, other);
306 state->start = other->start; 315 state->start = other->start;
307 other->tree = NULL; 316 other->tree = NULL;
308 rb_erase(&other->rb_node, &tree->state); 317 rb_erase(&other->rb_node, &tree->state);
@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
314 other = rb_entry(other_node, struct extent_state, rb_node); 323 other = rb_entry(other_node, struct extent_state, rb_node);
315 if (other->start == state->end + 1 && 324 if (other->start == state->end + 1 &&
316 other->state == state->state) { 325 other->state == state->state) {
326 merge_cb(tree, state, other);
317 other->start = state->start; 327 other->start = state->start;
318 state->tree = NULL; 328 state->tree = NULL;
319 rb_erase(&state->rb_node, &tree->state); 329 rb_erase(&state->rb_node, &tree->state);
320 free_extent_state(state); 330 free_extent_state(state);
331 state = NULL;
321 } 332 }
322 } 333 }
334
323 return 0; 335 return 0;
324} 336}
325 337
326static void set_state_cb(struct extent_io_tree *tree, 338static int set_state_cb(struct extent_io_tree *tree,
327 struct extent_state *state, 339 struct extent_state *state,
328 unsigned long bits) 340 unsigned long bits)
329{ 341{
330 if (tree->ops && tree->ops->set_bit_hook) { 342 if (tree->ops && tree->ops->set_bit_hook) {
331 tree->ops->set_bit_hook(tree->mapping->host, state->start, 343 return tree->ops->set_bit_hook(tree->mapping->host,
332 state->end, state->state, bits); 344 state->start, state->end,
345 state->state, bits);
333 } 346 }
347
348 return 0;
334} 349}
335 350
336static void clear_state_cb(struct extent_io_tree *tree, 351static void clear_state_cb(struct extent_io_tree *tree,
337 struct extent_state *state, 352 struct extent_state *state,
338 unsigned long bits) 353 unsigned long bits)
339{ 354{
340 if (tree->ops && tree->ops->clear_bit_hook) { 355 if (tree->ops && tree->ops->clear_bit_hook)
341 tree->ops->clear_bit_hook(tree->mapping->host, state->start, 356 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
342 state->end, state->state, bits);
343 }
344} 357}
345 358
346/* 359/*
@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
358 int bits) 371 int bits)
359{ 372{
360 struct rb_node *node; 373 struct rb_node *node;
374 int ret;
361 375
362 if (end < start) { 376 if (end < start) {
363 printk(KERN_ERR "btrfs end < start %llu %llu\n", 377 printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -365,11 +379,14 @@ static int insert_state(struct extent_io_tree *tree,
365 (unsigned long long)start); 379 (unsigned long long)start);
366 WARN_ON(1); 380 WARN_ON(1);
367 } 381 }
368 if (bits & EXTENT_DIRTY)
369 tree->dirty_bytes += end - start + 1;
370 state->start = start; 382 state->start = start;
371 state->end = end; 383 state->end = end;
372 set_state_cb(tree, state, bits); 384 ret = set_state_cb(tree, state, bits);
385 if (ret)
386 return ret;
387
388 if (bits & EXTENT_DIRTY)
389 tree->dirty_bytes += end - start + 1;
373 state->state |= bits; 390 state->state |= bits;
374 node = tree_insert(&tree->state, end, &state->rb_node); 391 node = tree_insert(&tree->state, end, &state->rb_node);
375 if (node) { 392 if (node) {
@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
387 return 0; 404 return 0;
388} 405}
389 406
407static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
408 u64 split)
409{
410 if (tree->ops && tree->ops->split_extent_hook)
411 return tree->ops->split_extent_hook(tree->mapping->host,
412 orig, split);
413 return 0;
414}
415
390/* 416/*
391 * split a given extent state struct in two, inserting the preallocated 417 * split a given extent state struct in two, inserting the preallocated
392 * struct 'prealloc' as the newly created second half. 'split' indicates an 418 * struct 'prealloc' as the newly created second half. 'split' indicates an
@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
405 struct extent_state *prealloc, u64 split) 431 struct extent_state *prealloc, u64 split)
406{ 432{
407 struct rb_node *node; 433 struct rb_node *node;
434
435 split_cb(tree, orig, split);
436
408 prealloc->start = orig->start; 437 prealloc->start = orig->start;
409 prealloc->end = split - 1; 438 prealloc->end = split - 1;
410 prealloc->state = orig->state; 439 prealloc->state = orig->state;
@@ -431,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
431 struct extent_state *state, int bits, int wake, 460 struct extent_state *state, int bits, int wake,
432 int delete) 461 int delete)
433{ 462{
434 int ret = state->state & bits; 463 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
464 int ret = state->state & bits_to_clear;
435 465
436 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 466 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
437 u64 range = state->end - state->start + 1; 467 u64 range = state->end - state->start + 1;
@@ -439,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
439 tree->dirty_bytes -= range; 469 tree->dirty_bytes -= range;
440 } 470 }
441 clear_state_cb(tree, state, bits); 471 clear_state_cb(tree, state, bits);
442 state->state &= ~bits; 472 state->state &= ~bits_to_clear;
443 if (wake) 473 if (wake)
444 wake_up(&state->wq); 474 wake_up(&state->wq);
445 if (delete || state->state == 0) { 475 if (delete || state->state == 0) {
@@ -542,8 +572,8 @@ hit_next:
542 if (err) 572 if (err)
543 goto out; 573 goto out;
544 if (state->end <= end) { 574 if (state->end <= end) {
545 set |= clear_state_bit(tree, state, bits, 575 set |= clear_state_bit(tree, state, bits, wake,
546 wake, delete); 576 delete);
547 if (last_end == (u64)-1) 577 if (last_end == (u64)-1)
548 goto out; 578 goto out;
549 start = last_end + 1; 579 start = last_end + 1;
@@ -561,12 +591,11 @@ hit_next:
561 prealloc = alloc_extent_state(GFP_ATOMIC); 591 prealloc = alloc_extent_state(GFP_ATOMIC);
562 err = split_state(tree, state, prealloc, end + 1); 592 err = split_state(tree, state, prealloc, end + 1);
563 BUG_ON(err == -EEXIST); 593 BUG_ON(err == -EEXIST);
564
565 if (wake) 594 if (wake)
566 wake_up(&state->wq); 595 wake_up(&state->wq);
567 596
568 set |= clear_state_bit(tree, prealloc, bits, 597 set |= clear_state_bit(tree, prealloc, bits, wake, delete);
569 wake, delete); 598
570 prealloc = NULL; 599 prealloc = NULL;
571 goto out; 600 goto out;
572 } 601 }
@@ -667,16 +696,23 @@ out:
667 return 0; 696 return 0;
668} 697}
669 698
670static void set_state_bits(struct extent_io_tree *tree, 699static int set_state_bits(struct extent_io_tree *tree,
671 struct extent_state *state, 700 struct extent_state *state,
672 int bits) 701 int bits)
673{ 702{
703 int ret;
704
705 ret = set_state_cb(tree, state, bits);
706 if (ret)
707 return ret;
708
674 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 709 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
675 u64 range = state->end - state->start + 1; 710 u64 range = state->end - state->start + 1;
676 tree->dirty_bytes += range; 711 tree->dirty_bytes += range;
677 } 712 }
678 set_state_cb(tree, state, bits);
679 state->state |= bits; 713 state->state |= bits;
714
715 return 0;
680} 716}
681 717
682static void cache_state(struct extent_state *state, 718static void cache_state(struct extent_state *state,
@@ -758,7 +794,10 @@ hit_next:
758 goto out; 794 goto out;
759 } 795 }
760 796
761 set_state_bits(tree, state, bits); 797 err = set_state_bits(tree, state, bits);
798 if (err)
799 goto out;
800
762 cache_state(state, cached_state); 801 cache_state(state, cached_state);
763 merge_state(tree, state); 802 merge_state(tree, state);
764 if (last_end == (u64)-1) 803 if (last_end == (u64)-1)
@@ -805,7 +844,9 @@ hit_next:
805 if (err) 844 if (err)
806 goto out; 845 goto out;
807 if (state->end <= end) { 846 if (state->end <= end) {
808 set_state_bits(tree, state, bits); 847 err = set_state_bits(tree, state, bits);
848 if (err)
849 goto out;
809 cache_state(state, cached_state); 850 cache_state(state, cached_state);
810 merge_state(tree, state); 851 merge_state(tree, state);
811 if (last_end == (u64)-1) 852 if (last_end == (u64)-1)
@@ -829,11 +870,13 @@ hit_next:
829 this_end = last_start - 1; 870 this_end = last_start - 1;
830 err = insert_state(tree, prealloc, start, this_end, 871 err = insert_state(tree, prealloc, start, this_end,
831 bits); 872 bits);
832 cache_state(prealloc, cached_state);
833 prealloc = NULL;
834 BUG_ON(err == -EEXIST); 873 BUG_ON(err == -EEXIST);
835 if (err) 874 if (err) {
875 prealloc = NULL;
836 goto out; 876 goto out;
877 }
878 cache_state(prealloc, cached_state);
879 prealloc = NULL;
837 start = this_end + 1; 880 start = this_end + 1;
838 goto search_again; 881 goto search_again;
839 } 882 }
@@ -852,7 +895,11 @@ hit_next:
852 err = split_state(tree, state, prealloc, end + 1); 895 err = split_state(tree, state, prealloc, end + 1);
853 BUG_ON(err == -EEXIST); 896 BUG_ON(err == -EEXIST);
854 897
855 set_state_bits(tree, prealloc, bits); 898 err = set_state_bits(tree, prealloc, bits);
899 if (err) {
900 prealloc = NULL;
901 goto out;
902 }
856 cache_state(prealloc, cached_state); 903 cache_state(prealloc, cached_state);
857 merge_state(tree, prealloc); 904 merge_state(tree, prealloc);
858 prealloc = NULL; 905 prealloc = NULL;
@@ -910,7 +957,8 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
910 gfp_t mask) 957 gfp_t mask)
911{ 958{
912 return clear_extent_bit(tree, start, end, 959 return clear_extent_bit(tree, start, end,
913 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, 960 EXTENT_DIRTY | EXTENT_DELALLOC |
961 EXTENT_DO_ACCOUNTING, 0, 0,
914 NULL, mask); 962 NULL, mask);
915} 963}
916 964
@@ -1355,12 +1403,7 @@ out_failed:
1355int extent_clear_unlock_delalloc(struct inode *inode, 1403int extent_clear_unlock_delalloc(struct inode *inode,
1356 struct extent_io_tree *tree, 1404 struct extent_io_tree *tree,
1357 u64 start, u64 end, struct page *locked_page, 1405 u64 start, u64 end, struct page *locked_page,
1358 int unlock_pages, 1406 unsigned long op)
1359 int clear_unlock,
1360 int clear_delalloc, int clear_dirty,
1361 int set_writeback,
1362 int end_writeback,
1363 int set_private2)
1364{ 1407{
1365 int ret; 1408 int ret;
1366 struct page *pages[16]; 1409 struct page *pages[16];
@@ -1370,17 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1370 int i; 1413 int i;
1371 int clear_bits = 0; 1414 int clear_bits = 0;
1372 1415
1373 if (clear_unlock) 1416 if (op & EXTENT_CLEAR_UNLOCK)
1374 clear_bits |= EXTENT_LOCKED; 1417 clear_bits |= EXTENT_LOCKED;
1375 if (clear_dirty) 1418 if (op & EXTENT_CLEAR_DIRTY)
1376 clear_bits |= EXTENT_DIRTY; 1419 clear_bits |= EXTENT_DIRTY;
1377 1420
1378 if (clear_delalloc) 1421 if (op & EXTENT_CLEAR_DELALLOC)
1379 clear_bits |= EXTENT_DELALLOC; 1422 clear_bits |= EXTENT_DELALLOC;
1380 1423
1424 if (op & EXTENT_CLEAR_ACCOUNTING)
1425 clear_bits |= EXTENT_DO_ACCOUNTING;
1426
1381 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1427 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1382 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback || 1428 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1383 set_private2)) 1429 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1430 EXTENT_SET_PRIVATE2)))
1384 return 0; 1431 return 0;
1385 1432
1386 while (nr_pages > 0) { 1433 while (nr_pages > 0) {
@@ -1389,20 +1436,20 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1389 nr_pages, ARRAY_SIZE(pages)), pages); 1436 nr_pages, ARRAY_SIZE(pages)), pages);
1390 for (i = 0; i < ret; i++) { 1437 for (i = 0; i < ret; i++) {
1391 1438
1392 if (set_private2) 1439 if (op & EXTENT_SET_PRIVATE2)
1393 SetPagePrivate2(pages[i]); 1440 SetPagePrivate2(pages[i]);
1394 1441
1395 if (pages[i] == locked_page) { 1442 if (pages[i] == locked_page) {
1396 page_cache_release(pages[i]); 1443 page_cache_release(pages[i]);
1397 continue; 1444 continue;
1398 } 1445 }
1399 if (clear_dirty) 1446 if (op & EXTENT_CLEAR_DIRTY)
1400 clear_page_dirty_for_io(pages[i]); 1447 clear_page_dirty_for_io(pages[i]);
1401 if (set_writeback) 1448 if (op & EXTENT_SET_WRITEBACK)
1402 set_page_writeback(pages[i]); 1449 set_page_writeback(pages[i]);
1403 if (end_writeback) 1450 if (op & EXTENT_END_WRITEBACK)
1404 end_page_writeback(pages[i]); 1451 end_page_writeback(pages[i]);
1405 if (unlock_pages) 1452 if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1406 unlock_page(pages[i]); 1453 unlock_page(pages[i]);
1407 page_cache_release(pages[i]); 1454 page_cache_release(pages[i]);
1408 } 1455 }
@@ -2668,7 +2715,8 @@ int extent_invalidatepage(struct extent_io_tree *tree,
2668 lock_extent(tree, start, end, GFP_NOFS); 2715 lock_extent(tree, start, end, GFP_NOFS);
2669 wait_on_page_writeback(page); 2716 wait_on_page_writeback(page);
2670 clear_extent_bit(tree, start, end, 2717 clear_extent_bit(tree, start, end,
2671 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 2718 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2719 EXTENT_DO_ACCOUNTING,
2672 1, 1, NULL, GFP_NOFS); 2720 1, 1, NULL, GFP_NOFS);
2673 return 0; 2721 return 0;
2674} 2722}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 14ed16fd862..36de250a7b2 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -15,6 +15,7 @@
15#define EXTENT_BUFFER_FILLED (1 << 8) 15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_BOUNDARY (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11)
18#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
19 20
20/* flags for bio submission */ 21/* flags for bio submission */
@@ -25,6 +26,16 @@
25#define EXTENT_BUFFER_BLOCKING 1 26#define EXTENT_BUFFER_BLOCKING 1
26#define EXTENT_BUFFER_DIRTY 2 27#define EXTENT_BUFFER_DIRTY 2
27 28
29/* these are flags for extent_clear_unlock_delalloc */
30#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
31#define EXTENT_CLEAR_UNLOCK 0x2
32#define EXTENT_CLEAR_DELALLOC 0x4
33#define EXTENT_CLEAR_DIRTY 0x8
34#define EXTENT_SET_WRITEBACK 0x10
35#define EXTENT_END_WRITEBACK 0x20
36#define EXTENT_SET_PRIVATE2 0x40
37#define EXTENT_CLEAR_ACCOUNTING 0x80
38
28/* 39/*
29 * page->private values. Every page that is controlled by the extent 40 * page->private values. Every page that is controlled by the extent
30 * map has page->private set to one. 41 * map has page->private set to one.
@@ -60,8 +71,13 @@ struct extent_io_ops {
60 struct extent_state *state, int uptodate); 71 struct extent_state *state, int uptodate);
61 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 72 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
62 unsigned long old, unsigned long bits); 73 unsigned long old, unsigned long bits);
63 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, 74 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
64 unsigned long old, unsigned long bits); 75 unsigned long bits);
76 int (*merge_extent_hook)(struct inode *inode,
77 struct extent_state *new,
78 struct extent_state *other);
79 int (*split_extent_hook)(struct inode *inode,
80 struct extent_state *orig, u64 split);
65 int (*write_cache_pages_lock_hook)(struct page *page); 81 int (*write_cache_pages_lock_hook)(struct page *page);
66}; 82};
67 83
@@ -79,10 +95,14 @@ struct extent_state {
79 u64 start; 95 u64 start;
80 u64 end; /* inclusive */ 96 u64 end; /* inclusive */
81 struct rb_node rb_node; 97 struct rb_node rb_node;
98
99 /* ADD NEW ELEMENTS AFTER THIS */
82 struct extent_io_tree *tree; 100 struct extent_io_tree *tree;
83 wait_queue_head_t wq; 101 wait_queue_head_t wq;
84 atomic_t refs; 102 atomic_t refs;
85 unsigned long state; 103 unsigned long state;
104 u64 split_start;
105 u64 split_end;
86 106
87 /* for use by the FS */ 107 /* for use by the FS */
88 u64 private; 108 u64 private;
@@ -279,10 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree,
279int extent_clear_unlock_delalloc(struct inode *inode, 299int extent_clear_unlock_delalloc(struct inode *inode,
280 struct extent_io_tree *tree, 300 struct extent_io_tree *tree,
281 u64 start, u64 end, struct page *locked_page, 301 u64 start, u64 end, struct page *locked_page,
282 int unlock_page, 302 unsigned long op);
283 int clear_unlock,
284 int clear_delalloc, int clear_dirty,
285 int set_writeback,
286 int end_writeback,
287 int set_private2);
288#endif 303#endif
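The extent_io.c and extent_io.h hunks above replace the seven int arguments of extent_clear_unlock_delalloc() with a single bitmask built from the new EXTENT_CLEAR_*/EXTENT_SET_* flags. A minimal sketch of a caller under the new interface; the range and page values are placeholders, and the flag combination mirrors the inline-extent path later in this patch:

	static void finish_range_sketch(struct inode *inode, u64 start, u64 end,
					struct page *locked_page)
	{
		/* one bitmask instead of seven int parameters */
		unsigned long op = EXTENT_CLEAR_UNLOCK_PAGE |
				   EXTENT_CLEAR_DELALLOC |
				   EXTENT_CLEAR_DIRTY |
				   EXTENT_SET_WRITEBACK |
				   EXTENT_END_WRITEBACK;

		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
					     start, end, locked_page, op);
	}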
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2c726b7b9fa..ccbdcb54ec5 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -208,7 +208,7 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
208 write_lock(&tree->lock); 208 write_lock(&tree->lock);
209 em = lookup_extent_mapping(tree, start, len); 209 em = lookup_extent_mapping(tree, start, len);
210 210
211 WARN_ON(em->start != start || !em); 211 WARN_ON(!em || em->start != start);
212 212
213 if (!em) 213 if (!em)
214 goto out; 214 goto out;
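The one-line extent_map.c change is an operand-ordering fix: || evaluates left to right, so the old form dereferenced em before the NULL test could short-circuit. Illustration only, em being whatever lookup_extent_mapping() returned:

	WARN_ON(em->start != start || !em);   /* old: oops when em == NULL  */
	WARN_ON(!em || em->start != start);   /* new: NULL check runs first */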
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a3492a3ad96..06550affbd2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -123,7 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
124 124
125 end_of_last_block = start_pos + num_bytes - 1; 125 end_of_last_block = start_pos + num_bytes - 1;
126 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); 126 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
127 if (err)
128 return err;
129
127 for (i = 0; i < num_pages; i++) { 130 for (i = 0; i < num_pages; i++) {
128 struct page *p = pages[i]; 131 struct page *p = pages[i];
129 SetPageUptodate(p); 132 SetPageUptodate(p);
@@ -875,7 +878,8 @@ again:
875 btrfs_put_ordered_extent(ordered); 878 btrfs_put_ordered_extent(ordered);
876 879
877 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, 880 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
878 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, 881 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
882 EXTENT_DO_ACCOUNTING,
879 GFP_NOFS); 883 GFP_NOFS);
880 unlock_extent(&BTRFS_I(inode)->io_tree, 884 unlock_extent(&BTRFS_I(inode)->io_tree,
881 start_pos, last_pos - 1, GFP_NOFS); 885 start_pos, last_pos - 1, GFP_NOFS);
@@ -917,21 +921,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
917 start_pos = pos; 921 start_pos = pos;
918 922
919 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 923 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
924
925 /* do the reserve before the mutex lock in case we have to do some
926 * flushing. We wouldn't deadlock, but this is more polite.
927 */
928 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
929 if (err)
930 goto out_nolock;
931
932 mutex_lock(&inode->i_mutex);
933
920 current->backing_dev_info = inode->i_mapping->backing_dev_info; 934 current->backing_dev_info = inode->i_mapping->backing_dev_info;
921 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 935 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
922 if (err) 936 if (err)
923 goto out_nolock; 937 goto out;
938
924 if (count == 0) 939 if (count == 0)
925 goto out_nolock; 940 goto out;
926 941
927 err = file_remove_suid(file); 942 err = file_remove_suid(file);
928 if (err) 943 if (err)
929 goto out_nolock; 944 goto out;
945
930 file_update_time(file); 946 file_update_time(file);
931 947
932 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 948 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
933 949
934 mutex_lock(&inode->i_mutex); 950 /* generic_write_checks can change our pos */
951 start_pos = pos;
952
935 BTRFS_I(inode)->sequence++; 953 BTRFS_I(inode)->sequence++;
936 first_index = pos >> PAGE_CACHE_SHIFT; 954 first_index = pos >> PAGE_CACHE_SHIFT;
937 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 955 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
@@ -1005,9 +1023,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1005 } 1023 }
1006 1024
1007 if (will_write) { 1025 if (will_write) {
1008 btrfs_fdatawrite_range(inode->i_mapping, pos, 1026 filemap_fdatawrite_range(inode->i_mapping, pos,
1009 pos + write_bytes - 1, 1027 pos + write_bytes - 1);
1010 WB_SYNC_ALL);
1011 } else { 1028 } else {
1012 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1029 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1013 num_pages); 1030 num_pages);
@@ -1028,6 +1045,7 @@ out:
1028 mutex_unlock(&inode->i_mutex); 1045 mutex_unlock(&inode->i_mutex);
1029 if (ret) 1046 if (ret)
1030 err = ret; 1047 err = ret;
1048 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1031 1049
1032out_nolock: 1050out_nolock:
1033 kfree(pages); 1051 kfree(pages);
@@ -1068,8 +1086,10 @@ out_nolock:
1068 btrfs_end_transaction(trans, root); 1086 btrfs_end_transaction(trans, root);
1069 else 1087 else
1070 btrfs_commit_transaction(trans, root); 1088 btrfs_commit_transaction(trans, root);
1071 } else { 1089 } else if (ret != BTRFS_NO_LOG_SYNC) {
1072 btrfs_commit_transaction(trans, root); 1090 btrfs_commit_transaction(trans, root);
1091 } else {
1092 btrfs_end_transaction(trans, root);
1073 } 1093 }
1074 } 1094 }
1075 if (file->f_flags & O_DIRECT) { 1095 if (file->f_flags & O_DIRECT) {
@@ -1119,6 +1139,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1119 int ret = 0; 1139 int ret = 0;
1120 struct btrfs_trans_handle *trans; 1140 struct btrfs_trans_handle *trans;
1121 1141
1142
1143 /* we wait first, since the writeback may change the inode */
1144 root->log_batch++;
1145 /* the VFS called filemap_fdatawrite for us */
1146 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1147 root->log_batch++;
1148
1122 /* 1149 /*
1123 * check the transaction that last modified this inode 1150 * check the transaction that last modified this inode
 1124 * and see if it's already been committed 1151
@@ -1126,6 +1153,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1126 if (!BTRFS_I(inode)->last_trans) 1153 if (!BTRFS_I(inode)->last_trans)
1127 goto out; 1154 goto out;
1128 1155
1156 /*
1157 * if the last transaction that changed this file was before
1158 * the current transaction, we can bail out now without any
1159 * syncing
1160 */
1129 mutex_lock(&root->fs_info->trans_mutex); 1161 mutex_lock(&root->fs_info->trans_mutex);
1130 if (BTRFS_I(inode)->last_trans <= 1162 if (BTRFS_I(inode)->last_trans <=
1131 root->fs_info->last_trans_committed) { 1163 root->fs_info->last_trans_committed) {
@@ -1135,13 +1167,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1135 } 1167 }
1136 mutex_unlock(&root->fs_info->trans_mutex); 1168 mutex_unlock(&root->fs_info->trans_mutex);
1137 1169
1138 root->log_batch++;
1139 filemap_fdatawrite(inode->i_mapping);
1140 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1141 root->log_batch++;
1142
1143 if (datasync && !(inode->i_state & I_DIRTY_PAGES))
1144 goto out;
1145 /* 1170 /*
1146 * ok we haven't committed the transaction yet, lets do a commit 1171 * ok we haven't committed the transaction yet, lets do a commit
1147 */ 1172 */
@@ -1170,14 +1195,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1170 */ 1195 */
1171 mutex_unlock(&dentry->d_inode->i_mutex); 1196 mutex_unlock(&dentry->d_inode->i_mutex);
1172 1197
1173 if (ret > 0) { 1198 if (ret != BTRFS_NO_LOG_SYNC) {
1174 ret = btrfs_commit_transaction(trans, root); 1199 if (ret > 0) {
1175 } else {
1176 ret = btrfs_sync_log(trans, root);
1177 if (ret == 0)
1178 ret = btrfs_end_transaction(trans, root);
1179 else
1180 ret = btrfs_commit_transaction(trans, root); 1200 ret = btrfs_commit_transaction(trans, root);
1201 } else {
1202 ret = btrfs_sync_log(trans, root);
1203 if (ret == 0)
1204 ret = btrfs_end_transaction(trans, root);
1205 else
1206 ret = btrfs_commit_transaction(trans, root);
1207 }
1208 } else {
1209 ret = btrfs_end_transaction(trans, root);
1181 } 1210 }
1182 mutex_lock(&dentry->d_inode->i_mutex); 1211 mutex_lock(&dentry->d_inode->i_mutex);
1183out: 1212out:
@@ -1196,7 +1225,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1196 return 0; 1225 return 0;
1197} 1226}
1198 1227
1199struct file_operations btrfs_file_operations = { 1228const struct file_operations btrfs_file_operations = {
1200 .llseek = generic_file_llseek, 1229 .llseek = generic_file_llseek,
1201 .read = do_sync_read, 1230 .read = do_sync_read,
1202 .aio_read = generic_file_aio_read, 1231 .aio_read = generic_file_aio_read,
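In btrfs_file_write() the patch moves the delalloc metadata reservation in front of i_mutex, so any flushing the reservation triggers runs without the inode lock held, and drops the reservation once the write path is done. A condensed sketch of that ordering, with the real function's error labels simplified away:

	err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
	if (err)
		return err;		/* nothing locked yet */

	mutex_lock(&inode->i_mutex);
	/* generic_write_checks(), prepare_pages(), dirty_and_release_pages() */
	mutex_unlock(&inode->i_mutex);

	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);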
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5c2caad7621..cb2849f0325 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1296,7 +1296,7 @@ again:
1296 window_start = entry->offset; 1296 window_start = entry->offset;
1297 window_free = entry->bytes; 1297 window_free = entry->bytes;
1298 last = entry; 1298 last = entry;
1299 max_extent = 0; 1299 max_extent = entry->bytes;
1300 } else { 1300 } else {
1301 last = next; 1301 last = next;
1302 window_free += next->bytes; 1302 window_free += next->bytes;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e9b76bcd1c1..b3ad168a0bf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -62,7 +62,7 @@ static const struct inode_operations btrfs_special_inode_operations;
62static const struct inode_operations btrfs_file_inode_operations; 62static const struct inode_operations btrfs_file_inode_operations;
63static const struct address_space_operations btrfs_aops; 63static const struct address_space_operations btrfs_aops;
64static const struct address_space_operations btrfs_symlink_aops; 64static const struct address_space_operations btrfs_symlink_aops;
65static struct file_operations btrfs_dir_file_operations; 65static const struct file_operations btrfs_dir_file_operations;
66static struct extent_io_ops btrfs_extent_io_ops; 66static struct extent_io_ops btrfs_extent_io_ops;
67 67
68static struct kmem_cache *btrfs_inode_cachep; 68static struct kmem_cache *btrfs_inode_cachep;
@@ -424,9 +424,12 @@ again:
424 * and free up our temp pages. 424 * and free up our temp pages.
425 */ 425 */
426 extent_clear_unlock_delalloc(inode, 426 extent_clear_unlock_delalloc(inode,
427 &BTRFS_I(inode)->io_tree, 427 &BTRFS_I(inode)->io_tree,
428 start, end, NULL, 1, 0, 428 start, end, NULL,
429 0, 1, 1, 1, 0); 429 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
430 EXTENT_CLEAR_DELALLOC |
431 EXTENT_CLEAR_ACCOUNTING |
432 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
430 ret = 0; 433 ret = 0;
431 goto free_pages_out; 434 goto free_pages_out;
432 } 435 }
@@ -535,7 +538,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
535 struct btrfs_root *root = BTRFS_I(inode)->root; 538 struct btrfs_root *root = BTRFS_I(inode)->root;
536 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 539 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
537 struct extent_io_tree *io_tree; 540 struct extent_io_tree *io_tree;
538 int ret; 541 int ret = 0;
539 542
540 if (list_empty(&async_cow->extents)) 543 if (list_empty(&async_cow->extents))
541 return 0; 544 return 0;
@@ -549,6 +552,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
549 552
550 io_tree = &BTRFS_I(inode)->io_tree; 553 io_tree = &BTRFS_I(inode)->io_tree;
551 554
555retry:
552 /* did the compression code fall back to uncompressed IO? */ 556 /* did the compression code fall back to uncompressed IO? */
553 if (!async_extent->pages) { 557 if (!async_extent->pages) {
554 int page_started = 0; 558 int page_started = 0;
@@ -559,11 +563,11 @@ static noinline int submit_compressed_extents(struct inode *inode,
559 async_extent->ram_size - 1, GFP_NOFS); 563 async_extent->ram_size - 1, GFP_NOFS);
560 564
561 /* allocate blocks */ 565 /* allocate blocks */
562 cow_file_range(inode, async_cow->locked_page, 566 ret = cow_file_range(inode, async_cow->locked_page,
563 async_extent->start, 567 async_extent->start,
564 async_extent->start + 568 async_extent->start +
565 async_extent->ram_size - 1, 569 async_extent->ram_size - 1,
566 &page_started, &nr_written, 0); 570 &page_started, &nr_written, 0);
567 571
568 /* 572 /*
569 * if page_started, cow_file_range inserted an 573 * if page_started, cow_file_range inserted an
@@ -571,7 +575,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
571 * and IO for us. Otherwise, we need to submit 575 * and IO for us. Otherwise, we need to submit
572 * all those pages down to the drive. 576 * all those pages down to the drive.
573 */ 577 */
574 if (!page_started) 578 if (!page_started && !ret)
575 extent_write_locked_range(io_tree, 579 extent_write_locked_range(io_tree,
576 inode, async_extent->start, 580 inode, async_extent->start,
577 async_extent->start + 581 async_extent->start +
@@ -599,7 +603,21 @@ static noinline int submit_compressed_extents(struct inode *inode,
599 async_extent->compressed_size, 603 async_extent->compressed_size,
600 0, alloc_hint, 604 0, alloc_hint,
601 (u64)-1, &ins, 1); 605 (u64)-1, &ins, 1);
602 BUG_ON(ret); 606 if (ret) {
607 int i;
608 for (i = 0; i < async_extent->nr_pages; i++) {
609 WARN_ON(async_extent->pages[i]->mapping);
610 page_cache_release(async_extent->pages[i]);
611 }
612 kfree(async_extent->pages);
613 async_extent->nr_pages = 0;
614 async_extent->pages = NULL;
615 unlock_extent(io_tree, async_extent->start,
616 async_extent->start +
617 async_extent->ram_size - 1, GFP_NOFS);
618 goto retry;
619 }
620
603 em = alloc_extent_map(GFP_NOFS); 621 em = alloc_extent_map(GFP_NOFS);
604 em->start = async_extent->start; 622 em->start = async_extent->start;
605 em->len = async_extent->ram_size; 623 em->len = async_extent->ram_size;
@@ -637,11 +655,14 @@ static noinline int submit_compressed_extents(struct inode *inode,
637 * clear dirty, set writeback and unlock the pages. 655 * clear dirty, set writeback and unlock the pages.
638 */ 656 */
639 extent_clear_unlock_delalloc(inode, 657 extent_clear_unlock_delalloc(inode,
640 &BTRFS_I(inode)->io_tree, 658 &BTRFS_I(inode)->io_tree,
641 async_extent->start, 659 async_extent->start,
642 async_extent->start + 660 async_extent->start +
643 async_extent->ram_size - 1, 661 async_extent->ram_size - 1,
644 NULL, 1, 1, 0, 1, 1, 0, 0); 662 NULL, EXTENT_CLEAR_UNLOCK_PAGE |
663 EXTENT_CLEAR_UNLOCK |
664 EXTENT_CLEAR_DELALLOC |
665 EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
645 666
646 ret = btrfs_submit_compressed_write(inode, 667 ret = btrfs_submit_compressed_write(inode,
647 async_extent->start, 668 async_extent->start,
@@ -712,9 +733,15 @@ static noinline int cow_file_range(struct inode *inode,
712 start, end, 0, NULL); 733 start, end, 0, NULL);
713 if (ret == 0) { 734 if (ret == 0) {
714 extent_clear_unlock_delalloc(inode, 735 extent_clear_unlock_delalloc(inode,
715 &BTRFS_I(inode)->io_tree, 736 &BTRFS_I(inode)->io_tree,
716 start, end, NULL, 1, 1, 737 start, end, NULL,
717 1, 1, 1, 1, 0); 738 EXTENT_CLEAR_UNLOCK_PAGE |
739 EXTENT_CLEAR_UNLOCK |
740 EXTENT_CLEAR_DELALLOC |
741 EXTENT_CLEAR_ACCOUNTING |
742 EXTENT_CLEAR_DIRTY |
743 EXTENT_SET_WRITEBACK |
744 EXTENT_END_WRITEBACK);
718 *nr_written = *nr_written + 745 *nr_written = *nr_written +
719 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; 746 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
720 *page_started = 1; 747 *page_started = 1;
@@ -731,13 +758,29 @@ static noinline int cow_file_range(struct inode *inode,
731 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, 758 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
732 start, num_bytes); 759 start, num_bytes);
733 if (em) { 760 if (em) {
734 alloc_hint = em->block_start; 761 /*
735 free_extent_map(em); 762 * if block start isn't an actual block number then find the
763 * first block in this inode and use that as a hint. If that
764 * block is also bogus then just don't worry about it.
765 */
766 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
767 free_extent_map(em);
768 em = search_extent_mapping(em_tree, 0, 0);
769 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
770 alloc_hint = em->block_start;
771 if (em)
772 free_extent_map(em);
773 } else {
774 alloc_hint = em->block_start;
775 free_extent_map(em);
776 }
736 } 777 }
737 read_unlock(&BTRFS_I(inode)->extent_tree.lock); 778 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
738 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 779 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
739 780
740 while (disk_num_bytes > 0) { 781 while (disk_num_bytes > 0) {
782 unsigned long op;
783
741 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 784 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
742 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 785 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
743 root->sectorsize, 0, alloc_hint, 786 root->sectorsize, 0, alloc_hint,
@@ -789,10 +832,13 @@ static noinline int cow_file_range(struct inode *inode,
789 * Do set the Private2 bit so we know this page was properly 832 * Do set the Private2 bit so we know this page was properly
790 * setup for writepage 833 * setup for writepage
791 */ 834 */
835 op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
836 op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
837 EXTENT_SET_PRIVATE2;
838
792 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 839 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
793 start, start + ram_size - 1, 840 start, start + ram_size - 1,
794 locked_page, unlock, 1, 841 locked_page, op);
795 1, 0, 0, 0, 1);
796 disk_num_bytes -= cur_alloc_size; 842 disk_num_bytes -= cur_alloc_size;
797 num_bytes -= cur_alloc_size; 843 num_bytes -= cur_alloc_size;
798 alloc_hint = ins.objectid + ins.offset; 844 alloc_hint = ins.objectid + ins.offset;
@@ -864,8 +910,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
864 u64 cur_end; 910 u64 cur_end;
865 int limit = 10 * 1024 * 1042; 911 int limit = 10 * 1024 * 1042;
866 912
867 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | 913 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
868 EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS); 914 1, 0, NULL, GFP_NOFS);
869 while (start < end) { 915 while (start < end) {
870 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); 916 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
871 async_cow->inode = inode; 917 async_cow->inode = inode;
@@ -1006,6 +1052,7 @@ next_slot:
1006 1052
1007 if (found_key.offset > cur_offset) { 1053 if (found_key.offset > cur_offset) {
1008 extent_end = found_key.offset; 1054 extent_end = found_key.offset;
1055 extent_type = 0;
1009 goto out_check; 1056 goto out_check;
1010 } 1057 }
1011 1058
@@ -1112,8 +1159,10 @@ out_check:
1112 BUG_ON(ret); 1159 BUG_ON(ret);
1113 1160
1114 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1161 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1115 cur_offset, cur_offset + num_bytes - 1, 1162 cur_offset, cur_offset + num_bytes - 1,
1116 locked_page, 1, 1, 1, 0, 0, 0, 1); 1163 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
1164 EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
1165 EXTENT_SET_PRIVATE2);
1117 cur_offset = extent_end; 1166 cur_offset = extent_end;
1118 if (cur_offset > end) 1167 if (cur_offset > end)
1119 break; 1168 break;
@@ -1159,6 +1208,89 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1159 return ret; 1208 return ret;
1160} 1209}
1161 1210
1211static int btrfs_split_extent_hook(struct inode *inode,
1212 struct extent_state *orig, u64 split)
1213{
1214 struct btrfs_root *root = BTRFS_I(inode)->root;
1215 u64 size;
1216
1217 if (!(orig->state & EXTENT_DELALLOC))
1218 return 0;
1219
1220 size = orig->end - orig->start + 1;
1221 if (size > root->fs_info->max_extent) {
1222 u64 num_extents;
1223 u64 new_size;
1224
1225 new_size = orig->end - split + 1;
1226 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1227 root->fs_info->max_extent);
1228
1229 /*
 1230 * if we break a large extent up then leave outstanding_extents
1231 * be, since we've already accounted for the large extent.
1232 */
1233 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1234 root->fs_info->max_extent) < num_extents)
1235 return 0;
1236 }
1237
1238 spin_lock(&BTRFS_I(inode)->accounting_lock);
1239 BTRFS_I(inode)->outstanding_extents++;
1240 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1241
1242 return 0;
1243}
1244
1245/*
1246 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1247 * extents so we can keep track of new extents that are just merged onto old
1248 * extents, such as when we are doing sequential writes, so we can properly
1249 * account for the metadata space we'll need.
1250 */
1251static int btrfs_merge_extent_hook(struct inode *inode,
1252 struct extent_state *new,
1253 struct extent_state *other)
1254{
1255 struct btrfs_root *root = BTRFS_I(inode)->root;
1256 u64 new_size, old_size;
1257 u64 num_extents;
1258
1259 /* not delalloc, ignore it */
1260 if (!(other->state & EXTENT_DELALLOC))
1261 return 0;
1262
1263 old_size = other->end - other->start + 1;
1264 if (new->start < other->start)
1265 new_size = other->end - new->start + 1;
1266 else
1267 new_size = new->end - other->start + 1;
1268
1269 /* we're not bigger than the max, unreserve the space and go */
1270 if (new_size <= root->fs_info->max_extent) {
1271 spin_lock(&BTRFS_I(inode)->accounting_lock);
1272 BTRFS_I(inode)->outstanding_extents--;
1273 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1274 return 0;
1275 }
1276
1277 /*
1278 * If we grew by another max_extent, just return, we want to keep that
1279 * reserved amount.
1280 */
1281 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1282 root->fs_info->max_extent);
1283 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1284 root->fs_info->max_extent) > num_extents)
1285 return 0;
1286
1287 spin_lock(&BTRFS_I(inode)->accounting_lock);
1288 BTRFS_I(inode)->outstanding_extents--;
1289 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1290
1291 return 0;
1292}
1293
1162/* 1294/*
1163 * extent_io.c set_bit_hook, used to track delayed allocation 1295 * extent_io.c set_bit_hook, used to track delayed allocation
1164 * bytes in this file, and to maintain the list of inodes that 1296 * bytes in this file, and to maintain the list of inodes that
@@ -1167,6 +1299,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1167static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1299static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1168 unsigned long old, unsigned long bits) 1300 unsigned long old, unsigned long bits)
1169{ 1301{
1302
1170 /* 1303 /*
1171 * set_bit and clear bit hooks normally require _irqsave/restore 1304 * set_bit and clear bit hooks normally require _irqsave/restore
 1172 * but in this case, we are only testing for the DELALLOC 1305
@@ -1174,6 +1307,10 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1174 */ 1307 */
1175 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1308 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1176 struct btrfs_root *root = BTRFS_I(inode)->root; 1309 struct btrfs_root *root = BTRFS_I(inode)->root;
1310
1311 spin_lock(&BTRFS_I(inode)->accounting_lock);
1312 BTRFS_I(inode)->outstanding_extents++;
1313 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1177 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1314 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1178 spin_lock(&root->fs_info->delalloc_lock); 1315 spin_lock(&root->fs_info->delalloc_lock);
1179 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1316 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1190,22 +1327,31 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1190/* 1327/*
1191 * extent_io.c clear_bit_hook, see set_bit_hook for why 1328 * extent_io.c clear_bit_hook, see set_bit_hook for why
1192 */ 1329 */
1193static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, 1330static int btrfs_clear_bit_hook(struct inode *inode,
1194 unsigned long old, unsigned long bits) 1331 struct extent_state *state, unsigned long bits)
1195{ 1332{
1196 /* 1333 /*
1197 * set_bit and clear bit hooks normally require _irqsave/restore 1334 * set_bit and clear bit hooks normally require _irqsave/restore
 1198 * but in this case, we are only testing for the DELALLOC 1335
1199 * bit, which is only set or cleared with irqs on 1336 * bit, which is only set or cleared with irqs on
1200 */ 1337 */
1201 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1338 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1202 struct btrfs_root *root = BTRFS_I(inode)->root; 1339 struct btrfs_root *root = BTRFS_I(inode)->root;
1203 1340
1341 if (bits & EXTENT_DO_ACCOUNTING) {
1342 spin_lock(&BTRFS_I(inode)->accounting_lock);
1343 BTRFS_I(inode)->outstanding_extents--;
1344 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1345 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1346 }
1347
1204 spin_lock(&root->fs_info->delalloc_lock); 1348 spin_lock(&root->fs_info->delalloc_lock);
1205 if (end - start + 1 > root->fs_info->delalloc_bytes) { 1349 if (state->end - state->start + 1 >
1350 root->fs_info->delalloc_bytes) {
1206 printk(KERN_INFO "btrfs warning: delalloc account " 1351 printk(KERN_INFO "btrfs warning: delalloc account "
1207 "%llu %llu\n", 1352 "%llu %llu\n",
1208 (unsigned long long)end - start + 1, 1353 (unsigned long long)
1354 state->end - state->start + 1,
1209 (unsigned long long) 1355 (unsigned long long)
1210 root->fs_info->delalloc_bytes); 1356 root->fs_info->delalloc_bytes);
1211 btrfs_delalloc_free_space(root, inode, (u64)-1); 1357 btrfs_delalloc_free_space(root, inode, (u64)-1);
@@ -1213,9 +1359,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1213 BTRFS_I(inode)->delalloc_bytes = 0; 1359 BTRFS_I(inode)->delalloc_bytes = 0;
1214 } else { 1360 } else {
1215 btrfs_delalloc_free_space(root, inode, 1361 btrfs_delalloc_free_space(root, inode,
1216 end - start + 1); 1362 state->end -
1217 root->fs_info->delalloc_bytes -= end - start + 1; 1363 state->start + 1);
1218 BTRFS_I(inode)->delalloc_bytes -= end - start + 1; 1364 root->fs_info->delalloc_bytes -= state->end -
1365 state->start + 1;
1366 BTRFS_I(inode)->delalloc_bytes -= state->end -
1367 state->start + 1;
1219 } 1368 }
1220 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1369 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1221 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1370 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
@@ -2354,7 +2503,19 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2354 2503
2355 root = BTRFS_I(dir)->root; 2504 root = BTRFS_I(dir)->root;
2356 2505
2506 /*
2507 * 5 items for unlink inode
2508 * 1 for orphan
2509 */
2510 ret = btrfs_reserve_metadata_space(root, 6);
2511 if (ret)
2512 return ret;
2513
2357 trans = btrfs_start_transaction(root, 1); 2514 trans = btrfs_start_transaction(root, 1);
2515 if (IS_ERR(trans)) {
2516 btrfs_unreserve_metadata_space(root, 6);
2517 return PTR_ERR(trans);
2518 }
2358 2519
2359 btrfs_set_trans_block_group(trans, dir); 2520 btrfs_set_trans_block_group(trans, dir);
2360 2521
@@ -2369,6 +2530,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2369 nr = trans->blocks_used; 2530 nr = trans->blocks_used;
2370 2531
2371 btrfs_end_transaction_throttle(trans, root); 2532 btrfs_end_transaction_throttle(trans, root);
2533 btrfs_unreserve_metadata_space(root, 6);
2372 btrfs_btree_balance_dirty(root, nr); 2534 btrfs_btree_balance_dirty(root, nr);
2373 return ret; 2535 return ret;
2374} 2536}
@@ -2449,7 +2611,16 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2449 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2611 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2450 return -ENOTEMPTY; 2612 return -ENOTEMPTY;
2451 2613
2614 ret = btrfs_reserve_metadata_space(root, 5);
2615 if (ret)
2616 return ret;
2617
2452 trans = btrfs_start_transaction(root, 1); 2618 trans = btrfs_start_transaction(root, 1);
2619 if (IS_ERR(trans)) {
2620 btrfs_unreserve_metadata_space(root, 5);
2621 return PTR_ERR(trans);
2622 }
2623
2453 btrfs_set_trans_block_group(trans, dir); 2624 btrfs_set_trans_block_group(trans, dir);
2454 2625
2455 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 2626 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
@@ -2472,6 +2643,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2472out: 2643out:
2473 nr = trans->blocks_used; 2644 nr = trans->blocks_used;
2474 ret = btrfs_end_transaction_throttle(trans, root); 2645 ret = btrfs_end_transaction_throttle(trans, root);
2646 btrfs_unreserve_metadata_space(root, 5);
2475 btrfs_btree_balance_dirty(root, nr); 2647 btrfs_btree_balance_dirty(root, nr);
2476 2648
2477 if (ret && !err) 2649 if (ret && !err)
@@ -2912,12 +3084,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
2912 3084
2913 if ((offset & (blocksize - 1)) == 0) 3085 if ((offset & (blocksize - 1)) == 0)
2914 goto out; 3086 goto out;
3087 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
3088 if (ret)
3089 goto out;
3090
3091 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3092 if (ret)
3093 goto out;
2915 3094
2916 ret = -ENOMEM; 3095 ret = -ENOMEM;
2917again: 3096again:
2918 page = grab_cache_page(mapping, index); 3097 page = grab_cache_page(mapping, index);
2919 if (!page) 3098 if (!page) {
3099 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
3100 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
2920 goto out; 3101 goto out;
3102 }
2921 3103
2922 page_start = page_offset(page); 3104 page_start = page_offset(page);
2923 page_end = page_start + PAGE_CACHE_SIZE - 1; 3105 page_end = page_start + PAGE_CACHE_SIZE - 1;
@@ -2950,7 +3132,16 @@ again:
2950 goto again; 3132 goto again;
2951 } 3133 }
2952 3134
2953 btrfs_set_extent_delalloc(inode, page_start, page_end); 3135 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
3136 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
3137 GFP_NOFS);
3138
3139 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
3140 if (ret) {
3141 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3142 goto out_unlock;
3143 }
3144
2954 ret = 0; 3145 ret = 0;
2955 if (offset != PAGE_CACHE_SIZE) { 3146 if (offset != PAGE_CACHE_SIZE) {
2956 kaddr = kmap(page); 3147 kaddr = kmap(page);
@@ -2963,6 +3154,9 @@ again:
2963 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3154 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2964 3155
2965out_unlock: 3156out_unlock:
3157 if (ret)
3158 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
3159 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
2966 unlock_page(page); 3160 unlock_page(page);
2967 page_cache_release(page); 3161 page_cache_release(page);
2968out: 3162out:
@@ -2981,17 +3175,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2981 u64 last_byte; 3175 u64 last_byte;
2982 u64 cur_offset; 3176 u64 cur_offset;
2983 u64 hole_size; 3177 u64 hole_size;
2984 int err; 3178 int err = 0;
2985 3179
2986 if (size <= hole_start) 3180 if (size <= hole_start)
2987 return 0; 3181 return 0;
2988 3182
2989 err = btrfs_check_metadata_free_space(root); 3183 err = btrfs_truncate_page(inode->i_mapping, inode->i_size);
2990 if (err) 3184 if (err)
2991 return err; 3185 return err;
2992 3186
2993 btrfs_truncate_page(inode->i_mapping, inode->i_size);
2994
2995 while (1) { 3187 while (1) {
2996 struct btrfs_ordered_extent *ordered; 3188 struct btrfs_ordered_extent *ordered;
2997 btrfs_wait_ordered_range(inode, hole_start, 3189 btrfs_wait_ordered_range(inode, hole_start,
@@ -3024,12 +3216,18 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3024 cur_offset, &hint_byte, 1); 3216 cur_offset, &hint_byte, 1);
3025 if (err) 3217 if (err)
3026 break; 3218 break;
3219
3220 err = btrfs_reserve_metadata_space(root, 1);
3221 if (err)
3222 break;
3223
3027 err = btrfs_insert_file_extent(trans, root, 3224 err = btrfs_insert_file_extent(trans, root,
3028 inode->i_ino, cur_offset, 0, 3225 inode->i_ino, cur_offset, 0,
3029 0, hole_size, 0, hole_size, 3226 0, hole_size, 0, hole_size,
3030 0, 0, 0); 3227 0, 0, 0);
3031 btrfs_drop_extent_cache(inode, hole_start, 3228 btrfs_drop_extent_cache(inode, hole_start,
3032 last_byte - 1, 0); 3229 last_byte - 1, 0);
3230 btrfs_unreserve_metadata_space(root, 1);
3033 } 3231 }
3034 free_extent_map(em); 3232 free_extent_map(em);
3035 cur_offset = last_byte; 3233 cur_offset = last_byte;
@@ -3353,6 +3551,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3353 bi->generation = 0; 3551 bi->generation = 0;
3354 bi->sequence = 0; 3552 bi->sequence = 0;
3355 bi->last_trans = 0; 3553 bi->last_trans = 0;
3554 bi->last_sub_trans = 0;
3356 bi->logged_trans = 0; 3555 bi->logged_trans = 0;
3357 bi->delalloc_bytes = 0; 3556 bi->delalloc_bytes = 0;
3358 bi->reserved_bytes = 0; 3557 bi->reserved_bytes = 0;
@@ -3503,12 +3702,14 @@ static int btrfs_dentry_delete(struct dentry *dentry)
3503{ 3702{
3504 struct btrfs_root *root; 3703 struct btrfs_root *root;
3505 3704
3506 if (!dentry->d_inode) 3705 if (!dentry->d_inode && !IS_ROOT(dentry))
3507 return 0; 3706 dentry = dentry->d_parent;
3508 3707
3509 root = BTRFS_I(dentry->d_inode)->root; 3708 if (dentry->d_inode) {
3510 if (btrfs_root_refs(&root->root_item) == 0) 3709 root = BTRFS_I(dentry->d_inode)->root;
3511 return 1; 3710 if (btrfs_root_refs(&root->root_item) == 0)
3711 return 1;
3712 }
3512 return 0; 3713 return 0;
3513} 3714}
3514 3715
@@ -3990,11 +4191,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3990 if (!new_valid_dev(rdev)) 4191 if (!new_valid_dev(rdev))
3991 return -EINVAL; 4192 return -EINVAL;
3992 4193
3993 err = btrfs_check_metadata_free_space(root); 4194 /*
4195 * 2 for inode item and ref
4196 * 2 for dir items
4197 * 1 for xattr if selinux is on
4198 */
4199 err = btrfs_reserve_metadata_space(root, 5);
3994 if (err) 4200 if (err)
3995 goto fail; 4201 return err;
3996 4202
3997 trans = btrfs_start_transaction(root, 1); 4203 trans = btrfs_start_transaction(root, 1);
4204 if (!trans)
4205 goto fail;
3998 btrfs_set_trans_block_group(trans, dir); 4206 btrfs_set_trans_block_group(trans, dir);
3999 4207
4000 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4208 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -4032,6 +4240,7 @@ out_unlock:
4032 nr = trans->blocks_used; 4240 nr = trans->blocks_used;
4033 btrfs_end_transaction_throttle(trans, root); 4241 btrfs_end_transaction_throttle(trans, root);
4034fail: 4242fail:
4243 btrfs_unreserve_metadata_space(root, 5);
4035 if (drop_inode) { 4244 if (drop_inode) {
4036 inode_dec_link_count(inode); 4245 inode_dec_link_count(inode);
4037 iput(inode); 4246 iput(inode);
@@ -4052,10 +4261,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4052 u64 objectid; 4261 u64 objectid;
4053 u64 index = 0; 4262 u64 index = 0;
4054 4263
4055 err = btrfs_check_metadata_free_space(root); 4264 /*
4265 * 2 for inode item and ref
4266 * 2 for dir items
4267 * 1 for xattr if selinux is on
4268 */
4269 err = btrfs_reserve_metadata_space(root, 5);
4056 if (err) 4270 if (err)
4057 goto fail; 4271 return err;
4272
4058 trans = btrfs_start_transaction(root, 1); 4273 trans = btrfs_start_transaction(root, 1);
4274 if (!trans)
4275 goto fail;
4059 btrfs_set_trans_block_group(trans, dir); 4276 btrfs_set_trans_block_group(trans, dir);
4060 4277
4061 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4278 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -4096,6 +4313,7 @@ out_unlock:
4096 nr = trans->blocks_used; 4313 nr = trans->blocks_used;
4097 btrfs_end_transaction_throttle(trans, root); 4314 btrfs_end_transaction_throttle(trans, root);
4098fail: 4315fail:
4316 btrfs_unreserve_metadata_space(root, 5);
4099 if (drop_inode) { 4317 if (drop_inode) {
4100 inode_dec_link_count(inode); 4318 inode_dec_link_count(inode);
4101 iput(inode); 4319 iput(inode);
@@ -4118,10 +4336,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4118 if (inode->i_nlink == 0) 4336 if (inode->i_nlink == 0)
4119 return -ENOENT; 4337 return -ENOENT;
4120 4338
4121 btrfs_inc_nlink(inode); 4339 /*
4122 err = btrfs_check_metadata_free_space(root); 4340 * 1 item for inode ref
4341 * 2 items for dir items
4342 */
4343 err = btrfs_reserve_metadata_space(root, 3);
4123 if (err) 4344 if (err)
4124 goto fail; 4345 return err;
4346
4347 btrfs_inc_nlink(inode);
4348
4125 err = btrfs_set_inode_index(dir, &index); 4349 err = btrfs_set_inode_index(dir, &index);
4126 if (err) 4350 if (err)
4127 goto fail; 4351 goto fail;
@@ -4145,6 +4369,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4145 nr = trans->blocks_used; 4369 nr = trans->blocks_used;
4146 btrfs_end_transaction_throttle(trans, root); 4370 btrfs_end_transaction_throttle(trans, root);
4147fail: 4371fail:
4372 btrfs_unreserve_metadata_space(root, 3);
4148 if (drop_inode) { 4373 if (drop_inode) {
4149 inode_dec_link_count(inode); 4374 inode_dec_link_count(inode);
4150 iput(inode); 4375 iput(inode);
@@ -4164,17 +4389,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4164 u64 index = 0; 4389 u64 index = 0;
4165 unsigned long nr = 1; 4390 unsigned long nr = 1;
4166 4391
4167 err = btrfs_check_metadata_free_space(root); 4392 /*
4393 * 2 items for inode and ref
4394 * 2 items for dir items
4395 * 1 for xattr if selinux is on
4396 */
4397 err = btrfs_reserve_metadata_space(root, 5);
4168 if (err) 4398 if (err)
4169 goto out_unlock; 4399 return err;
4170 4400
4171 trans = btrfs_start_transaction(root, 1); 4401 trans = btrfs_start_transaction(root, 1);
4172 btrfs_set_trans_block_group(trans, dir); 4402 if (!trans) {
4173 4403 err = -ENOMEM;
4174 if (IS_ERR(trans)) {
4175 err = PTR_ERR(trans);
4176 goto out_unlock; 4404 goto out_unlock;
4177 } 4405 }
4406 btrfs_set_trans_block_group(trans, dir);
4178 4407
4179 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4408 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4180 if (err) { 4409 if (err) {
@@ -4223,6 +4452,7 @@ out_fail:
4223 btrfs_end_transaction_throttle(trans, root); 4452 btrfs_end_transaction_throttle(trans, root);
4224 4453
4225out_unlock: 4454out_unlock:
4455 btrfs_unreserve_metadata_space(root, 5);
4226 if (drop_on_err) 4456 if (drop_on_err)
4227 iput(inode); 4457 iput(inode);
4228 btrfs_btree_balance_dirty(root, nr); 4458 btrfs_btree_balance_dirty(root, nr);
@@ -4684,7 +4914,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4684 */ 4914 */
4685 clear_extent_bit(tree, page_start, page_end, 4915 clear_extent_bit(tree, page_start, page_end,
4686 EXTENT_DIRTY | EXTENT_DELALLOC | 4916 EXTENT_DIRTY | EXTENT_DELALLOC |
4687 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 4917 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
4918 NULL, GFP_NOFS);
4688 /* 4919 /*
4689 * whoever cleared the private bit is responsible 4920 * whoever cleared the private bit is responsible
4690 * for the finish_ordered_io 4921 * for the finish_ordered_io
@@ -4697,8 +4928,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4697 lock_extent(tree, page_start, page_end, GFP_NOFS); 4928 lock_extent(tree, page_start, page_end, GFP_NOFS);
4698 } 4929 }
4699 clear_extent_bit(tree, page_start, page_end, 4930 clear_extent_bit(tree, page_start, page_end,
4700 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 4931 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4701 1, 1, NULL, GFP_NOFS); 4932 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
4702 __btrfs_releasepage(page, GFP_NOFS); 4933 __btrfs_releasepage(page, GFP_NOFS);
4703 4934
4704 ClearPageChecked(page); 4935 ClearPageChecked(page);
@@ -4747,6 +4978,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4747 goto out; 4978 goto out;
4748 } 4979 }
4749 4980
4981 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
4982 if (ret) {
4983 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4984 ret = VM_FAULT_SIGBUS;
4985 goto out;
4986 }
4987
4750 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 4988 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
4751again: 4989again:
4752 lock_page(page); 4990 lock_page(page);
@@ -4778,7 +5016,24 @@ again:
4778 goto again; 5016 goto again;
4779 } 5017 }
4780 5018
4781 btrfs_set_extent_delalloc(inode, page_start, page_end); 5019 /*
5020 * XXX - page_mkwrite gets called every time the page is dirtied, even
5021 * if it was already dirty, so for space accounting reasons we need to
5022 * clear any delalloc bits for the range we are fixing to save. There
5023 * is probably a better way to do this, but for now keep consistent with
5024 * prepare_pages in the normal write path.
5025 */
5026 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
5027 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
5028 GFP_NOFS);
5029
5030 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
5031 if (ret) {
5032 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
5033 ret = VM_FAULT_SIGBUS;
5034 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5035 goto out_unlock;
5036 }
4782 ret = 0; 5037 ret = 0;
4783 5038
4784 /* page is wholly or partially inside EOF */ 5039 /* page is wholly or partially inside EOF */
@@ -4797,10 +5052,13 @@ again:
4797 set_page_dirty(page); 5052 set_page_dirty(page);
4798 SetPageUptodate(page); 5053 SetPageUptodate(page);
4799 5054
4800 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 5055 BTRFS_I(inode)->last_trans = root->fs_info->generation;
5056 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
5057
4801 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5058 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4802 5059
4803out_unlock: 5060out_unlock:
5061 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
4804 if (!ret) 5062 if (!ret)
4805 return VM_FAULT_LOCKED; 5063 return VM_FAULT_LOCKED;
4806 unlock_page(page); 5064 unlock_page(page);
@@ -4821,7 +5079,9 @@ static void btrfs_truncate(struct inode *inode)
4821 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 5079 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4822 return; 5080 return;
4823 5081
4824 btrfs_truncate_page(inode->i_mapping, inode->i_size); 5082 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
5083 if (ret)
5084 return;
4825 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 5085 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4826 5086
4827 trans = btrfs_start_transaction(root, 1); 5087 trans = btrfs_start_transaction(root, 1);
@@ -4916,7 +5176,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4916 if (!ei) 5176 if (!ei)
4917 return NULL; 5177 return NULL;
4918 ei->last_trans = 0; 5178 ei->last_trans = 0;
5179 ei->last_sub_trans = 0;
4919 ei->logged_trans = 0; 5180 ei->logged_trans = 0;
5181 ei->outstanding_extents = 0;
5182 ei->reserved_extents = 0;
5183 ei->root = NULL;
5184 spin_lock_init(&ei->accounting_lock);
4920 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 5185 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4921 INIT_LIST_HEAD(&ei->i_orphan); 5186 INIT_LIST_HEAD(&ei->i_orphan);
4922 INIT_LIST_HEAD(&ei->ordered_operations); 5187 INIT_LIST_HEAD(&ei->ordered_operations);
@@ -4932,6 +5197,14 @@ void btrfs_destroy_inode(struct inode *inode)
4932 WARN_ON(inode->i_data.nrpages); 5197 WARN_ON(inode->i_data.nrpages);
4933 5198
4934 /* 5199 /*
5200 * This can happen where we create an inode, but somebody else also
5201 * created the same inode and we need to destroy the one we already
5202 * created.
5203 */
5204 if (!root)
5205 goto free;
5206
5207 /*
4935 * Make sure we're properly removed from the ordered operation 5208 * Make sure we're properly removed from the ordered operation
4936 * lists. 5209 * lists.
4937 */ 5210 */
@@ -4966,6 +5239,7 @@ void btrfs_destroy_inode(struct inode *inode)
4966 } 5239 }
4967 inode_tree_del(inode); 5240 inode_tree_del(inode);
4968 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 5241 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
5242free:
4969 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 5243 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4970} 5244}
4971 5245
@@ -5070,7 +5344,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5070 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 5344 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
5071 return -ENOTEMPTY; 5345 return -ENOTEMPTY;
5072 5346
5073 ret = btrfs_check_metadata_free_space(root); 5347 /*
5348 * We want to reserve the absolute worst case amount of items. So if
5349 * both inodes are subvols and we need to unlink them then that would
5350 * require 4 item modifications, but if they are both normal inodes it
 5351 * would require 5 item modifications, so we'll assume they're normal
5352 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
5353 * should cover the worst case number of items we'll modify.
5354 */
5355 ret = btrfs_reserve_metadata_space(root, 11);
5074 if (ret) 5356 if (ret)
5075 return ret; 5357 return ret;
5076 5358
@@ -5185,6 +5467,8 @@ out_fail:
5185 5467
5186 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 5468 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5187 up_read(&root->fs_info->subvol_sem); 5469 up_read(&root->fs_info->subvol_sem);
5470
5471 btrfs_unreserve_metadata_space(root, 11);
5188 return ret; 5472 return ret;
5189} 5473}
5190 5474
@@ -5256,11 +5540,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5256 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 5540 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
5257 return -ENAMETOOLONG; 5541 return -ENAMETOOLONG;
5258 5542
5259 err = btrfs_check_metadata_free_space(root); 5543 /*
5544 * 2 items for inode item and ref
5545 * 2 items for dir items
5546 * 1 item for xattr if selinux is on
5547 */
5548 err = btrfs_reserve_metadata_space(root, 5);
5260 if (err) 5549 if (err)
5261 goto out_fail; 5550 return err;
5262 5551
5263 trans = btrfs_start_transaction(root, 1); 5552 trans = btrfs_start_transaction(root, 1);
5553 if (!trans)
5554 goto out_fail;
5264 btrfs_set_trans_block_group(trans, dir); 5555 btrfs_set_trans_block_group(trans, dir);
5265 5556
5266 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 5557 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -5341,6 +5632,7 @@ out_unlock:
5341 nr = trans->blocks_used; 5632 nr = trans->blocks_used;
5342 btrfs_end_transaction_throttle(trans, root); 5633 btrfs_end_transaction_throttle(trans, root);
5343out_fail: 5634out_fail:
5635 btrfs_unreserve_metadata_space(root, 5);
5344 if (drop_inode) { 5636 if (drop_inode) {
5345 inode_dec_link_count(inode); 5637 inode_dec_link_count(inode);
5346 iput(inode); 5638 iput(inode);
@@ -5362,6 +5654,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5362 5654
5363 while (num_bytes > 0) { 5655 while (num_bytes > 0) {
5364 alloc_size = min(num_bytes, root->fs_info->max_extent); 5656 alloc_size = min(num_bytes, root->fs_info->max_extent);
5657
5658 ret = btrfs_reserve_metadata_space(root, 1);
5659 if (ret)
5660 goto out;
5661
5365 ret = btrfs_reserve_extent(trans, root, alloc_size, 5662 ret = btrfs_reserve_extent(trans, root, alloc_size,
5366 root->sectorsize, 0, alloc_hint, 5663 root->sectorsize, 0, alloc_hint,
5367 (u64)-1, &ins, 1); 5664 (u64)-1, &ins, 1);
@@ -5381,6 +5678,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5381 num_bytes -= ins.offset; 5678 num_bytes -= ins.offset;
5382 cur_offset += ins.offset; 5679 cur_offset += ins.offset;
5383 alloc_hint = ins.objectid + ins.offset; 5680 alloc_hint = ins.objectid + ins.offset;
5681 btrfs_unreserve_metadata_space(root, 1);
5384 } 5682 }
5385out: 5683out:
5386 if (cur_offset > start) { 5684 if (cur_offset > start) {
@@ -5544,7 +5842,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
5544 .permission = btrfs_permission, 5842 .permission = btrfs_permission,
5545}; 5843};
5546 5844
5547static struct file_operations btrfs_dir_file_operations = { 5845static const struct file_operations btrfs_dir_file_operations = {
5548 .llseek = generic_file_llseek, 5846 .llseek = generic_file_llseek,
5549 .read = generic_read_dir, 5847 .read = generic_read_dir,
5550 .readdir = btrfs_real_readdir, 5848 .readdir = btrfs_real_readdir,
@@ -5566,6 +5864,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
5566 .readpage_io_failed_hook = btrfs_io_failed_hook, 5864 .readpage_io_failed_hook = btrfs_io_failed_hook,
5567 .set_bit_hook = btrfs_set_bit_hook, 5865 .set_bit_hook = btrfs_set_bit_hook,
5568 .clear_bit_hook = btrfs_clear_bit_hook, 5866 .clear_bit_hook = btrfs_clear_bit_hook,
5867 .merge_extent_hook = btrfs_merge_extent_hook,
5868 .split_extent_hook = btrfs_split_extent_hook,
5569}; 5869};
5570 5870
5571/* 5871/*
@@ -5632,6 +5932,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
5632 .removexattr = btrfs_removexattr, 5932 .removexattr = btrfs_removexattr,
5633}; 5933};
5634 5934
5635struct dentry_operations btrfs_dentry_operations = { 5935const struct dentry_operations btrfs_dentry_operations = {
5636 .d_delete = btrfs_dentry_delete, 5936 .d_delete = btrfs_dentry_delete,
5637}; 5937};
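Throughout inode.c the patch swaps btrfs_check_metadata_free_space() for an explicit reserve/unreserve pair wrapped around each transaction, sized by the worst-case number of items the operation touches. A condensed sketch of that pattern; the count of 5 matches the mknod/create/mkdir hunks above, while unlink reserves 6, link 3 and rename 11:

	static int inode_op_sketch(struct btrfs_root *root, struct inode *dir)
	{
		struct btrfs_trans_handle *trans;
		int err;

		err = btrfs_reserve_metadata_space(root, 5);
		if (err)
			return err;

		trans = btrfs_start_transaction(root, 1);
		if (!trans) {
			err = -ENOMEM;
			goto out;
		}
		btrfs_set_trans_block_group(trans, dir);

		/* ... insert/delete the inode, ref and dir items ... */

		btrfs_end_transaction_throttle(trans, root);
	out:
		btrfs_unreserve_metadata_space(root, 5);
		return err;
	}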
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a8577a7f26a..cdbb054102b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,7 +239,13 @@ static noinline int create_subvol(struct btrfs_root *root,
239 u64 index = 0; 239 u64 index = 0;
240 unsigned long nr = 1; 240 unsigned long nr = 1;
241 241
242 ret = btrfs_check_metadata_free_space(root); 242 /*
243 * 1 - inode item
244 * 2 - refs
245 * 1 - root item
246 * 2 - dir items
247 */
248 ret = btrfs_reserve_metadata_space(root, 6);
243 if (ret) 249 if (ret)
244 return ret; 250 return ret;
245 251
@@ -340,6 +346,9 @@ fail:
340 err = btrfs_commit_transaction(trans, root); 346 err = btrfs_commit_transaction(trans, root);
341 if (err && !ret) 347 if (err && !ret)
342 ret = err; 348 ret = err;
349
350 btrfs_unreserve_metadata_space(root, 6);
351 btrfs_btree_balance_dirty(root, nr);
343 return ret; 352 return ret;
344} 353}
345 354
@@ -355,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
355 if (!root->ref_cows) 364 if (!root->ref_cows)
356 return -EINVAL; 365 return -EINVAL;
357 366
358 ret = btrfs_check_metadata_free_space(root); 367 /*
368 * 1 - inode item
369 * 2 - refs
370 * 1 - root item
371 * 2 - dir items
372 */
373 ret = btrfs_reserve_metadata_space(root, 6);
359 if (ret) 374 if (ret)
360 goto fail_unlock; 375 goto fail_unlock;
361 376
362 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 377 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
363 if (!pending_snapshot) { 378 if (!pending_snapshot) {
364 ret = -ENOMEM; 379 ret = -ENOMEM;
380 btrfs_unreserve_metadata_space(root, 6);
365 goto fail_unlock; 381 goto fail_unlock;
366 } 382 }
367 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); 383 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
368 if (!pending_snapshot->name) { 384 if (!pending_snapshot->name) {
369 ret = -ENOMEM; 385 ret = -ENOMEM;
370 kfree(pending_snapshot); 386 kfree(pending_snapshot);
387 btrfs_unreserve_metadata_space(root, 6);
371 goto fail_unlock; 388 goto fail_unlock;
372 } 389 }
373 memcpy(pending_snapshot->name, name, namelen); 390 memcpy(pending_snapshot->name, name, namelen);
@@ -813,6 +830,7 @@ out_up_write:
813out_unlock: 830out_unlock:
814 mutex_unlock(&inode->i_mutex); 831 mutex_unlock(&inode->i_mutex);
815 if (!err) { 832 if (!err) {
833 shrink_dcache_sb(root->fs_info->sb);
816 btrfs_invalidate_inodes(dest); 834 btrfs_invalidate_inodes(dest);
817 d_delete(dentry); 835 d_delete(dentry);
818 } 836 }
@@ -1105,8 +1123,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1105 datao += off - key.offset; 1123 datao += off - key.offset;
1106 datal -= off - key.offset; 1124 datal -= off - key.offset;
1107 } 1125 }
1108 if (key.offset + datao + datal > off + len) 1126
1109 datal = off + len - key.offset - datao; 1127 if (key.offset + datal > off + len)
1128 datal = off + len - key.offset;
1129
1110 /* disko == 0 means it's a hole */ 1130 /* disko == 0 means it's a hole */
1111 if (!disko) 1131 if (!disko)
1112 datao = 0; 1132 datao = 0;
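The clone hunk above clamps the copied length purely by file offsets, so only the part of the source extent that falls inside [off, off + len) is cloned. A worked example with illustrative numbers: cloning len = 8192 bytes starting at off = 0, where the extent item sits at key.offset = 4096 with datal = 8192; key.offset + datal = 12288 exceeds off + len = 8192, so datal is reduced to off + len - key.offset = 4096.

	if (key.offset + datal > off + len)
		datal = off + len - key.offset;   /* 12288 > 8192, so datal = 4096 */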
@@ -1215,15 +1235,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
1215 struct inode *inode = fdentry(file)->d_inode; 1235 struct inode *inode = fdentry(file)->d_inode;
1216 struct btrfs_root *root = BTRFS_I(inode)->root; 1236 struct btrfs_root *root = BTRFS_I(inode)->root;
1217 struct btrfs_trans_handle *trans; 1237 struct btrfs_trans_handle *trans;
1218 int ret = 0; 1238 int ret;
1219 1239
1240 ret = -EPERM;
1220 if (!capable(CAP_SYS_ADMIN)) 1241 if (!capable(CAP_SYS_ADMIN))
1221 return -EPERM; 1242 goto out;
1222 1243
1223 if (file->private_data) { 1244 ret = -EINPROGRESS;
1224 ret = -EINPROGRESS; 1245 if (file->private_data)
1225 goto out; 1246 goto out;
1226 }
1227 1247
1228 ret = mnt_want_write(file->f_path.mnt); 1248 ret = mnt_want_write(file->f_path.mnt);
1229 if (ret) 1249 if (ret)
@@ -1233,12 +1253,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
1233 root->fs_info->open_ioctl_trans++; 1253 root->fs_info->open_ioctl_trans++;
1234 mutex_unlock(&root->fs_info->trans_mutex); 1254 mutex_unlock(&root->fs_info->trans_mutex);
1235 1255
1256 ret = -ENOMEM;
1236 trans = btrfs_start_ioctl_transaction(root, 0); 1257 trans = btrfs_start_ioctl_transaction(root, 0);
1237 if (trans) 1258 if (!trans)
1238 file->private_data = trans; 1259 goto out_drop;
1239 else 1260
1240 ret = -ENOMEM; 1261 file->private_data = trans;
1241 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ 1262 return 0;
1263
1264out_drop:
1265 mutex_lock(&root->fs_info->trans_mutex);
1266 root->fs_info->open_ioctl_trans--;
1267 mutex_unlock(&root->fs_info->trans_mutex);
1268 mnt_drop_write(file->f_path.mnt);
1242out: 1269out:
1243 return ret; 1270 return ret;
1244} 1271}
@@ -1254,24 +1281,20 @@ long btrfs_ioctl_trans_end(struct file *file)
1254 struct inode *inode = fdentry(file)->d_inode; 1281 struct inode *inode = fdentry(file)->d_inode;
1255 struct btrfs_root *root = BTRFS_I(inode)->root; 1282 struct btrfs_root *root = BTRFS_I(inode)->root;
1256 struct btrfs_trans_handle *trans; 1283 struct btrfs_trans_handle *trans;
1257 int ret = 0;
1258 1284
1259 trans = file->private_data; 1285 trans = file->private_data;
1260 if (!trans) { 1286 if (!trans)
1261 ret = -EINVAL; 1287 return -EINVAL;
1262 goto out;
1263 }
1264 btrfs_end_transaction(trans, root);
1265 file->private_data = NULL; 1288 file->private_data = NULL;
1266 1289
1290 btrfs_end_transaction(trans, root);
1291
1267 mutex_lock(&root->fs_info->trans_mutex); 1292 mutex_lock(&root->fs_info->trans_mutex);
1268 root->fs_info->open_ioctl_trans--; 1293 root->fs_info->open_ioctl_trans--;
1269 mutex_unlock(&root->fs_info->trans_mutex); 1294 mutex_unlock(&root->fs_info->trans_mutex);
1270 1295
1271 mnt_drop_write(file->f_path.mnt); 1296 mnt_drop_write(file->f_path.mnt);
1272 1297 return 0;
1273out:
1274 return ret;
1275} 1298}
1276 1299
1277long btrfs_ioctl(struct file *file, unsigned int 1300long btrfs_ioctl(struct file *file, unsigned int
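
btrfs_ioctl_trans_start is also reshaped into the error-code-first, goto-unwind style: ret is set to the failure value before each check, and the new out_drop label undoes the open_ioctl_trans count and the write reference when the transaction cannot be started. A loose userspace sketch of that unwind discipline follows; have_admin_cap(), take_write_ref(), drop_write_ref() and start_trans() are invented stand-ins for capable(), mnt_want_write(), mnt_drop_write() and btrfs_start_ioctl_transaction().

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static int open_trans_count;

/* Hypothetical stand-ins for the permission, write-ref and
 * transaction-start steps. */
static bool have_admin_cap(void) { return true; }
static int take_write_ref(void)  { return 0; }
static void drop_write_ref(void) { puts("dropped write ref"); }
static void *start_trans(void)   { return malloc(1); }

static int trans_start_sketch(void **private_data)
{
    void *trans;
    int ret;

    ret = -EPERM;
    if (!have_admin_cap())
        goto out;

    ret = -EINPROGRESS;
    if (*private_data)
        goto out;

    ret = take_write_ref();
    if (ret)
        goto out;

    open_trans_count++;

    ret = -ENOMEM;
    trans = start_trans();
    if (!trans)
        goto out_drop;

    *private_data = trans;
    return 0;

out_drop:
    open_trans_count--;
    drop_write_ref();
out:
    return ret;
}

int main(void)
{
    void *priv = NULL;

    printf("first start: %d (open: %d)\n",
           trans_start_sketch(&priv), open_trans_count);
    printf("second start: %d\n", trans_start_sketch(&priv));
    free(priv);
    return 0;
}
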
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b5d6d24726b..5799bc46a30 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -306,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode,
306 tree->last = NULL; 306 tree->last = NULL;
307 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 307 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
308 308
309 spin_lock(&BTRFS_I(inode)->accounting_lock);
310 BTRFS_I(inode)->outstanding_extents--;
311 spin_unlock(&BTRFS_I(inode)->accounting_lock);
312 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
313 inode, 1);
314
309 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 315 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
310 list_del_init(&entry->root_extent_list); 316 list_del_init(&entry->root_extent_list);
311 317
@@ -458,7 +464,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
458 * start IO on any dirty ones so the wait doesn't stall waiting 464 * start IO on any dirty ones so the wait doesn't stall waiting
459 * for pdflush to find them 465 * for pdflush to find them
460 */ 466 */
461 btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); 467 filemap_fdatawrite_range(inode->i_mapping, start, end);
462 if (wait) { 468 if (wait) {
463 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 469 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
464 &entry->flags)); 470 &entry->flags));
@@ -488,17 +494,15 @@ again:
488 /* start IO across the range first to instantiate any delalloc 494 /* start IO across the range first to instantiate any delalloc
489 * extents 495 * extents
490 */ 496 */
491 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 497 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
492 498
493 /* The compression code will leave pages locked but return from 499 /* The compression code will leave pages locked but return from
494 * writepage without setting the page writeback. Starting again 500 * writepage without setting the page writeback. Starting again
495 * with WB_SYNC_ALL will end up waiting for the IO to actually start. 501 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
496 */ 502 */
497 btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); 503 filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
498 504
499 btrfs_wait_on_page_writeback_range(inode->i_mapping, 505 filemap_fdatawait_range(inode->i_mapping, start, orig_end);
500 start >> PAGE_CACHE_SHIFT,
501 orig_end >> PAGE_CACHE_SHIFT);
502 506
503 end = orig_end; 507 end = orig_end;
504 found = 0; 508 found = 0;
@@ -716,89 +720,6 @@ out:
716} 720}
717 721
718 722
719/**
720 * taken from mm/filemap.c because it isn't exported
721 *
722 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
723 * @mapping: address space structure to write
724 * @start: offset in bytes where the range starts
725 * @end: offset in bytes where the range ends (inclusive)
726 * @sync_mode: enable synchronous operation
727 *
728 * Start writeback against all of a mapping's dirty pages that lie
729 * within the byte offsets <start, end> inclusive.
730 *
731 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
732 * opposed to a regular memory cleansing writeback. The difference between
733 * these two operations is that if a dirty page/buffer is encountered, it must
734 * be waited upon, and not just skipped over.
735 */
736int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
737 loff_t end, int sync_mode)
738{
739 struct writeback_control wbc = {
740 .sync_mode = sync_mode,
741 .nr_to_write = mapping->nrpages * 2,
742 .range_start = start,
743 .range_end = end,
744 };
745 return btrfs_writepages(mapping, &wbc);
746}
747
748/**
749 * taken from mm/filemap.c because it isn't exported
750 *
751 * wait_on_page_writeback_range - wait for writeback to complete
752 * @mapping: target address_space
753 * @start: beginning page index
754 * @end: ending page index
755 *
756 * Wait for writeback to complete against pages indexed by start->end
757 * inclusive
758 */
759int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
760 pgoff_t start, pgoff_t end)
761{
762 struct pagevec pvec;
763 int nr_pages;
764 int ret = 0;
765 pgoff_t index;
766
767 if (end < start)
768 return 0;
769
770 pagevec_init(&pvec, 0);
771 index = start;
772 while ((index <= end) &&
773 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
774 PAGECACHE_TAG_WRITEBACK,
775 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
776 unsigned i;
777
778 for (i = 0; i < nr_pages; i++) {
779 struct page *page = pvec.pages[i];
780
781 /* until radix tree lookup accepts end_index */
782 if (page->index > end)
783 continue;
784
785 wait_on_page_writeback(page);
786 if (PageError(page))
787 ret = -EIO;
788 }
789 pagevec_release(&pvec);
790 cond_resched();
791 }
792
793 /* Check for outstanding write errors */
794 if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
795 ret = -ENOSPC;
796 if (test_and_clear_bit(AS_EIO, &mapping->flags))
797 ret = -EIO;
798
799 return ret;
800}
801
802/* 723/*
803 * add a given inode to the list of inodes that must be fully on 724 * add a given inode to the list of inodes that must be fully on
804 * disk before a transaction commit finishes. 725 * disk before a transaction commit finishes.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 993a7ea45c7..f82e87488ca 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -153,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
153int btrfs_ordered_update_i_size(struct inode *inode, 153int btrfs_ordered_update_i_size(struct inode *inode,
154 struct btrfs_ordered_extent *ordered); 154 struct btrfs_ordered_extent *ordered);
155int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 155int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
156int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
157 pgoff_t start, pgoff_t end);
158int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
159 loff_t end, int sync_mode);
160int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); 156int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
161int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); 157int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
162int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, 158int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
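
With its private btrfs_fdatawrite_range() and btrfs_wait_on_page_writeback_range() copies deleted, ordered-data.c now calls the generic filemap_fdatawrite_range()/filemap_fdatawait_range() pair: start writeback on the byte range (issued twice in the wait path so pages the compression code left locked get picked up), then wait for it in a separate step. A rough, Linux-specific userspace analogue of that start-then-wait ordering, using sync_file_range(2) on an ordinary file; it only mirrors the shape of the sequence, not the kernel helpers themselves.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Start writeback on [start, start + len) without blocking, then wait for
 * it in a second step - the same shape as filemap_fdatawrite_range()
 * followed by filemap_fdatawait_range(). */
static int write_then_wait(int fd, off_t start, off_t len)
{
    int ret;

    /* start IO on the dirty range, do not block */
    ret = sync_file_range(fd, start, len, SYNC_FILE_RANGE_WRITE);
    if (ret)
        return ret;

    /* now wait for the writeback we just started */
    return sync_file_range(fd, start, len,
                           SYNC_FILE_RANGE_WAIT_BEFORE |
                           SYNC_FILE_RANGE_WRITE |
                           SYNC_FILE_RANGE_WAIT_AFTER);
}

int main(void)
{
    const char buf[] = "ordered data\n";
    int fd = open("/tmp/ordered-sketch", O_RDWR | O_CREAT | O_TRUNC, 0600);

    if (fd < 0 || write(fd, buf, strlen(buf)) < 0)
        return 1;
    if (write_then_wait(fd, 0, (off_t)strlen(buf)))
        perror("sync_file_range");
    close(fd);
    return 0;
}
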
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 361ad323faa..cfcc93c93a7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3518,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3518 BUG_ON(!rc->block_group); 3518 BUG_ON(!rc->block_group);
3519 3519
3520 btrfs_init_workers(&rc->workers, "relocate", 3520 btrfs_init_workers(&rc->workers, "relocate",
3521 fs_info->thread_pool_size); 3521 fs_info->thread_pool_size, NULL);
3522 3522
3523 rc->extent_root = extent_root; 3523 rc->extent_root = extent_root;
3524 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3524 btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
@@ -3701,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3701 mapping_tree_init(&rc->reloc_root_tree); 3701 mapping_tree_init(&rc->reloc_root_tree);
3702 INIT_LIST_HEAD(&rc->reloc_roots); 3702 INIT_LIST_HEAD(&rc->reloc_roots);
3703 btrfs_init_workers(&rc->workers, "relocate", 3703 btrfs_init_workers(&rc->workers, "relocate",
3704 root->fs_info->thread_pool_size); 3704 root->fs_info->thread_pool_size, NULL);
3705 rc->extent_root = root->fs_info->extent_root; 3705 rc->extent_root = root->fs_info->extent_root;
3706 3706
3707 set_reloc_control(rc); 3707 set_reloc_control(rc);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 9351428f30e..67fa2d29d66 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -159,7 +159,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
159 write_extent_buffer(l, item, ptr, sizeof(*item)); 159 write_extent_buffer(l, item, ptr, sizeof(*item));
160 btrfs_mark_buffer_dirty(path->nodes[0]); 160 btrfs_mark_buffer_dirty(path->nodes[0]);
161out: 161out:
162 btrfs_release_path(root, path);
163 btrfs_free_path(path); 162 btrfs_free_path(path);
164 return ret; 163 return ret;
165} 164}
@@ -332,7 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
332 BUG_ON(refs != 0); 331 BUG_ON(refs != 0);
333 ret = btrfs_del_item(trans, root, path); 332 ret = btrfs_del_item(trans, root, path);
334out: 333out:
335 btrfs_release_path(root, path);
336 btrfs_free_path(path); 334 btrfs_free_path(path);
337 return ret; 335 return ret;
338} 336}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67035385444..752a5463bf5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,7 +66,8 @@ enum {
66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, 68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
69 Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, 69 Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
70 Opt_discard, Opt_err,
70}; 71};
71 72
72static match_table_t tokens = { 73static match_table_t tokens = {
@@ -88,6 +89,7 @@ static match_table_t tokens = {
88 {Opt_notreelog, "notreelog"}, 89 {Opt_notreelog, "notreelog"},
89 {Opt_flushoncommit, "flushoncommit"}, 90 {Opt_flushoncommit, "flushoncommit"},
90 {Opt_ratio, "metadata_ratio=%d"}, 91 {Opt_ratio, "metadata_ratio=%d"},
92 {Opt_discard, "discard"},
91 {Opt_err, NULL}, 93 {Opt_err, NULL},
92}; 94};
93 95
@@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
257 info->metadata_ratio); 259 info->metadata_ratio);
258 } 260 }
259 break; 261 break;
262 case Opt_discard:
263 btrfs_set_opt(info->mount_opt, DISCARD);
264 break;
260 default: 265 default:
261 break; 266 break;
262 } 267 }
@@ -344,7 +349,9 @@ static int btrfs_fill_super(struct super_block *sb,
344 sb->s_export_op = &btrfs_export_ops; 349 sb->s_export_op = &btrfs_export_ops;
345 sb->s_xattr = btrfs_xattr_handlers; 350 sb->s_xattr = btrfs_xattr_handlers;
346 sb->s_time_gran = 1; 351 sb->s_time_gran = 1;
352#ifdef CONFIG_BTRFS_FS_POSIX_ACL
347 sb->s_flags |= MS_POSIXACL; 353 sb->s_flags |= MS_POSIXACL;
354#endif
348 355
349 tree_root = open_ctree(sb, fs_devices, (char *)data); 356 tree_root = open_ctree(sb, fs_devices, (char *)data);
350 357
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 88f866f85e7..c207e8c32c9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -163,8 +163,14 @@ static void wait_current_trans(struct btrfs_root *root)
163 } 163 }
164} 164}
165 165
166enum btrfs_trans_type {
167 TRANS_START,
168 TRANS_JOIN,
169 TRANS_USERSPACE,
170};
171
166static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 172static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
167 int num_blocks, int wait) 173 int num_blocks, int type)
168{ 174{
169 struct btrfs_trans_handle *h = 175 struct btrfs_trans_handle *h =
170 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 176 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
@@ -172,7 +178,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
172 178
173 mutex_lock(&root->fs_info->trans_mutex); 179 mutex_lock(&root->fs_info->trans_mutex);
174 if (!root->fs_info->log_root_recovering && 180 if (!root->fs_info->log_root_recovering &&
175 ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)) 181 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
182 type == TRANS_USERSPACE))
176 wait_current_trans(root); 183 wait_current_trans(root);
177 ret = join_transaction(root); 184 ret = join_transaction(root);
178 BUG_ON(ret); 185 BUG_ON(ret);
@@ -186,6 +193,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
186 h->alloc_exclude_start = 0; 193 h->alloc_exclude_start = 0;
187 h->delayed_ref_updates = 0; 194 h->delayed_ref_updates = 0;
188 195
196 if (!current->journal_info && type != TRANS_USERSPACE)
197 current->journal_info = h;
198
189 root->fs_info->running_transaction->use_count++; 199 root->fs_info->running_transaction->use_count++;
190 record_root_in_trans(h, root); 200 record_root_in_trans(h, root);
191 mutex_unlock(&root->fs_info->trans_mutex); 201 mutex_unlock(&root->fs_info->trans_mutex);
@@ -195,18 +205,18 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
195struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 205struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
196 int num_blocks) 206 int num_blocks)
197{ 207{
198 return start_transaction(root, num_blocks, 1); 208 return start_transaction(root, num_blocks, TRANS_START);
199} 209}
200struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 210struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
201 int num_blocks) 211 int num_blocks)
202{ 212{
203 return start_transaction(root, num_blocks, 0); 213 return start_transaction(root, num_blocks, TRANS_JOIN);
204} 214}
205 215
206struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 216struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
207 int num_blocks) 217 int num_blocks)
208{ 218{
209 return start_transaction(r, num_blocks, 2); 219 return start_transaction(r, num_blocks, TRANS_USERSPACE);
210} 220}
211 221
212/* wait for a transaction commit to be fully complete */ 222/* wait for a transaction commit to be fully complete */
@@ -317,6 +327,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
317 wake_up(&cur_trans->writer_wait); 327 wake_up(&cur_trans->writer_wait);
318 put_transaction(cur_trans); 328 put_transaction(cur_trans);
319 mutex_unlock(&info->trans_mutex); 329 mutex_unlock(&info->trans_mutex);
330
331 if (current->journal_info == trans)
332 current->journal_info = NULL;
320 memset(trans, 0, sizeof(*trans)); 333 memset(trans, 0, sizeof(*trans));
321 kmem_cache_free(btrfs_trans_handle_cachep, trans); 334 kmem_cache_free(btrfs_trans_handle_cachep, trans);
322 335
@@ -338,10 +351,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
338/* 351/*
339 * when btree blocks are allocated, they have some corresponding bits set for 352 * when btree blocks are allocated, they have some corresponding bits set for
340 * them in one of two extent_io trees. This is used to make sure all of 353 * them in one of two extent_io trees. This is used to make sure all of
341 * those extents are on disk for transaction or log commit 354 * those extents are sent to disk but does not wait on them
342 */ 355 */
343int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, 356int btrfs_write_marked_extents(struct btrfs_root *root,
344 struct extent_io_tree *dirty_pages) 357 struct extent_io_tree *dirty_pages)
345{ 358{
346 int ret; 359 int ret;
347 int err = 0; 360 int err = 0;
@@ -388,6 +401,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
388 page_cache_release(page); 401 page_cache_release(page);
389 } 402 }
390 } 403 }
404 if (err)
405 werr = err;
406 return werr;
407}
408
409/*
410 * when btree blocks are allocated, they have some corresponding bits set for
411 * them in one of two extent_io trees. This is used to make sure all of
412 * those extents are on disk for transaction or log commit. We wait
413 * on all the pages and clear them from the dirty pages state tree
414 */
415int btrfs_wait_marked_extents(struct btrfs_root *root,
416 struct extent_io_tree *dirty_pages)
417{
418 int ret;
419 int err = 0;
420 int werr = 0;
421 struct page *page;
422 struct inode *btree_inode = root->fs_info->btree_inode;
423 u64 start = 0;
424 u64 end;
425 unsigned long index;
426
391 while (1) { 427 while (1) {
392 ret = find_first_extent_bit(dirty_pages, 0, &start, &end, 428 ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
393 EXTENT_DIRTY); 429 EXTENT_DIRTY);
@@ -418,6 +454,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
418 return werr; 454 return werr;
419} 455}
420 456
457/*
458 * when btree blocks are allocated, they have some corresponding bits set for
459 * them in one of two extent_io trees. This is used to make sure all of
460 * those extents are on disk for transaction or log commit
461 */
462int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
463 struct extent_io_tree *dirty_pages)
464{
465 int ret;
466 int ret2;
467
468 ret = btrfs_write_marked_extents(root, dirty_pages);
469 ret2 = btrfs_wait_marked_extents(root, dirty_pages);
470 return ret || ret2;
471}
472
421int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 473int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
422 struct btrfs_root *root) 474 struct btrfs_root *root)
423{ 475{
@@ -743,6 +795,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
743 memcpy(&pending->root_key, &key, sizeof(key)); 795 memcpy(&pending->root_key, &key, sizeof(key));
744fail: 796fail:
745 kfree(new_root_item); 797 kfree(new_root_item);
798 btrfs_unreserve_metadata_space(root, 6);
746 return ret; 799 return ret;
747} 800}
748 801
@@ -1059,6 +1112,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1059 1112
1060 mutex_unlock(&root->fs_info->trans_mutex); 1113 mutex_unlock(&root->fs_info->trans_mutex);
1061 1114
1115 if (current->journal_info == trans)
1116 current->journal_info = NULL;
1117
1062 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1118 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1063 return ret; 1119 return ret;
1064} 1120}
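
The transaction code splits the old combined helper into btrfs_write_marked_extents(), which only starts IO on the dirty tree blocks, and btrfs_wait_marked_extents(), which waits for that IO; btrfs_write_and_wait_marked_extents() survives as a thin wrapper over both, and the log commit uses the split so it can start IO early and delay the wait. A small sketch of that split-and-compose shape; write_extents()/wait_extents() are illustrative stand-ins, not the kernel functions.

#include <stdio.h>

/* Illustrative stand-in: start IO on everything marked dirty ... */
static int write_extents(const char *who)
{
    printf("%s: starting IO on marked extents\n", who);
    return 0;
}

/* ... and, separately, wait for that IO to finish. */
static int wait_extents(const char *who)
{
    printf("%s: waiting for marked extents\n", who);
    return 0;
}

/* The combined helper remains as a wrapper over the two halves. */
static int write_and_wait_extents(const char *who)
{
    int ret = write_extents(who);
    int ret2 = wait_extents(who);

    return ret || ret2;
}

int main(void)
{
    /* log commit: kick off IO early ... */
    write_extents("log");

    /* ... do other commit work while that IO is in flight ... */
    printf("log: doing other commit work\n");

    /* ... and only wait shortly before the super block is written. */
    wait_extents("log");

    /* the transaction commit still wants the blocking behaviour */
    return write_and_wait_extents("transaction");
}
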
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 663c6740491..d4e3e7a6938 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
79 struct inode *inode) 79 struct inode *inode)
80{ 80{
81 BTRFS_I(inode)->last_trans = trans->transaction->transid; 81 BTRFS_I(inode)->last_trans = trans->transaction->transid;
82 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
82} 83}
83 84
84int btrfs_end_transaction(struct btrfs_trans_handle *trans, 85int btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
107 struct btrfs_root *root); 108 struct btrfs_root *root);
108int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, 109int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
109 struct extent_io_tree *dirty_pages); 110 struct extent_io_tree *dirty_pages);
111int btrfs_write_marked_extents(struct btrfs_root *root,
112 struct extent_io_tree *dirty_pages);
113int btrfs_wait_marked_extents(struct btrfs_root *root,
114 struct extent_io_tree *dirty_pages);
110int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 115int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
111#endif 116#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7827841b55c..741666a7676 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
137 137
138 mutex_lock(&root->log_mutex); 138 mutex_lock(&root->log_mutex);
139 if (root->log_root) { 139 if (root->log_root) {
140 if (!root->log_start_pid) {
141 root->log_start_pid = current->pid;
142 root->log_multiple_pids = false;
143 } else if (root->log_start_pid != current->pid) {
144 root->log_multiple_pids = true;
145 }
146
140 root->log_batch++; 147 root->log_batch++;
141 atomic_inc(&root->log_writers); 148 atomic_inc(&root->log_writers);
142 mutex_unlock(&root->log_mutex); 149 mutex_unlock(&root->log_mutex);
143 return 0; 150 return 0;
144 } 151 }
152 root->log_multiple_pids = false;
153 root->log_start_pid = current->pid;
145 mutex_lock(&root->fs_info->tree_log_mutex); 154 mutex_lock(&root->fs_info->tree_log_mutex);
146 if (!root->fs_info->log_root_tree) { 155 if (!root->fs_info->log_root_tree) {
147 ret = btrfs_init_log_root_tree(trans, root->fs_info); 156 ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1971 int ret; 1980 int ret;
1972 struct btrfs_root *log = root->log_root; 1981 struct btrfs_root *log = root->log_root;
1973 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; 1982 struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
1983 u64 log_transid = 0;
1974 1984
1975 mutex_lock(&root->log_mutex); 1985 mutex_lock(&root->log_mutex);
1976 index1 = root->log_transid % 2; 1986 index1 = root->log_transid % 2;
@@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
1987 1997
1988 while (1) { 1998 while (1) {
1989 unsigned long batch = root->log_batch; 1999 unsigned long batch = root->log_batch;
1990 mutex_unlock(&root->log_mutex); 2000 if (root->log_multiple_pids) {
1991 schedule_timeout_uninterruptible(1); 2001 mutex_unlock(&root->log_mutex);
1992 mutex_lock(&root->log_mutex); 2002 schedule_timeout_uninterruptible(1);
1993 2003 mutex_lock(&root->log_mutex);
2004 }
1994 wait_for_writer(trans, root); 2005 wait_for_writer(trans, root);
1995 if (batch == root->log_batch) 2006 if (batch == root->log_batch)
1996 break; 2007 break;
@@ -2003,14 +2014,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2003 goto out; 2014 goto out;
2004 } 2015 }
2005 2016
2006 ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); 2017 /* we start IO on all the marked extents here, but we don't actually
2018 * wait for them until later.
2019 */
2020 ret = btrfs_write_marked_extents(log, &log->dirty_log_pages);
2007 BUG_ON(ret); 2021 BUG_ON(ret);
2008 2022
2009 btrfs_set_root_node(&log->root_item, log->node); 2023 btrfs_set_root_node(&log->root_item, log->node);
2010 2024
2011 root->log_batch = 0; 2025 root->log_batch = 0;
2026 log_transid = root->log_transid;
2012 root->log_transid++; 2027 root->log_transid++;
2013 log->log_transid = root->log_transid; 2028 log->log_transid = root->log_transid;
2029 root->log_start_pid = 0;
2014 smp_mb(); 2030 smp_mb();
2015 /* 2031 /*
2016 * log tree has been flushed to disk, new modifications of 2032 * log tree has been flushed to disk, new modifications of
@@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2036 2052
2037 index2 = log_root_tree->log_transid % 2; 2053 index2 = log_root_tree->log_transid % 2;
2038 if (atomic_read(&log_root_tree->log_commit[index2])) { 2054 if (atomic_read(&log_root_tree->log_commit[index2])) {
2055 btrfs_wait_marked_extents(log, &log->dirty_log_pages);
2039 wait_log_commit(trans, log_root_tree, 2056 wait_log_commit(trans, log_root_tree,
2040 log_root_tree->log_transid); 2057 log_root_tree->log_transid);
2041 mutex_unlock(&log_root_tree->log_mutex); 2058 mutex_unlock(&log_root_tree->log_mutex);
@@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2055 * check the full commit flag again 2072 * check the full commit flag again
2056 */ 2073 */
2057 if (root->fs_info->last_trans_log_full_commit == trans->transid) { 2074 if (root->fs_info->last_trans_log_full_commit == trans->transid) {
2075 btrfs_wait_marked_extents(log, &log->dirty_log_pages);
2058 mutex_unlock(&log_root_tree->log_mutex); 2076 mutex_unlock(&log_root_tree->log_mutex);
2059 ret = -EAGAIN; 2077 ret = -EAGAIN;
2060 goto out_wake_log_root; 2078 goto out_wake_log_root;
@@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2063 ret = btrfs_write_and_wait_marked_extents(log_root_tree, 2081 ret = btrfs_write_and_wait_marked_extents(log_root_tree,
2064 &log_root_tree->dirty_log_pages); 2082 &log_root_tree->dirty_log_pages);
2065 BUG_ON(ret); 2083 BUG_ON(ret);
2084 btrfs_wait_marked_extents(log, &log->dirty_log_pages);
2066 2085
2067 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2086 btrfs_set_super_log_root(&root->fs_info->super_for_commit,
2068 log_root_tree->node->start); 2087 log_root_tree->node->start);
@@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2082 * the running transaction open, so a full commit can't hop 2101 * the running transaction open, so a full commit can't hop
2083 * in and cause problems either. 2102 * in and cause problems either.
2084 */ 2103 */
2085 write_ctree_super(trans, root->fs_info->tree_root, 2); 2104 write_ctree_super(trans, root->fs_info->tree_root, 1);
2086 ret = 0; 2105 ret = 0;
2087 2106
2107 mutex_lock(&root->log_mutex);
2108 if (root->last_log_commit < log_transid)
2109 root->last_log_commit = log_transid;
2110 mutex_unlock(&root->log_mutex);
2111
2088out_wake_log_root: 2112out_wake_log_root:
2089 atomic_set(&log_root_tree->log_commit[index2], 0); 2113 atomic_set(&log_root_tree->log_commit[index2], 0);
2090 smp_mb(); 2114 smp_mb();
@@ -2852,6 +2876,21 @@ out:
2852 return ret; 2876 return ret;
2853} 2877}
2854 2878
2879static int inode_in_log(struct btrfs_trans_handle *trans,
2880 struct inode *inode)
2881{
2882 struct btrfs_root *root = BTRFS_I(inode)->root;
2883 int ret = 0;
2884
2885 mutex_lock(&root->log_mutex);
2886 if (BTRFS_I(inode)->logged_trans == trans->transid &&
2887 BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
2888 ret = 1;
2889 mutex_unlock(&root->log_mutex);
2890 return ret;
2891}
2892
2893
2855/* 2894/*
2856 * helper function around btrfs_log_inode to make sure newly created 2895 * helper function around btrfs_log_inode to make sure newly created
2857 * parent directories also end up in the log. A minimal inode and backref 2896 * parent directories also end up in the log. A minimal inode and backref
@@ -2891,6 +2930,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2891 if (ret) 2930 if (ret)
2892 goto end_no_trans; 2931 goto end_no_trans;
2893 2932
2933 if (inode_in_log(trans, inode)) {
2934 ret = BTRFS_NO_LOG_SYNC;
2935 goto end_no_trans;
2936 }
2937
2894 start_log_trans(trans, root); 2938 start_log_trans(trans, root);
2895 2939
2896 ret = btrfs_log_inode(trans, root, inode, inode_only); 2940 ret = btrfs_log_inode(trans, root, inode, inode_only);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index d09c7609e16..0776eacb508 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,6 +19,9 @@
19#ifndef __TREE_LOG_ 19#ifndef __TREE_LOG_
20#define __TREE_LOG_ 20#define __TREE_LOG_
21 21
22/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
23#define BTRFS_NO_LOG_SYNC 256
24
22int btrfs_sync_log(struct btrfs_trans_handle *trans, 25int btrfs_sync_log(struct btrfs_trans_handle *trans,
23 struct btrfs_root *root); 26 struct btrfs_root *root);
24int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
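
The new inode_in_log() check and the BTRFS_NO_LOG_SYNC return value give fsync a fast path: if the inode was last logged in the current transaction and nothing has changed since the last log commit (last_sub_trans <= last_log_commit), there is nothing left to sync. A toy sketch of that bookkeeping with plain counters; the field and helper names mirror the patch, but the structs and values around them are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

/* Per-root and per-inode counters, loosely modelled on the patch. */
struct root_sketch {
    unsigned long log_transid;      /* bumped on every log commit */
    unsigned long last_log_commit;  /* last log transid fully on disk */
};

struct inode_sketch {
    unsigned long logged_trans;     /* transaction the inode was last logged in */
    unsigned long last_sub_trans;   /* log transid of the last modification */
};

static bool inode_in_log(const struct inode_sketch *inode,
                         const struct root_sketch *root,
                         unsigned long cur_transid)
{
    return inode->logged_trans == cur_transid &&
           inode->last_sub_trans <= root->last_log_commit;
}

int main(void)
{
    struct root_sketch root = { .log_transid = 3, .last_log_commit = 3 };
    struct inode_sketch inode = { .logged_trans = 7, .last_sub_trans = 3 };
    unsigned long cur_transid = 7;

    /* already fully in the committed log: fsync can skip the log sync */
    printf("in log: %d\n", inode_in_log(&inode, &root, cur_transid));

    /* a new write bumps last_sub_trans past the last log commit */
    inode.last_sub_trans = root.log_transid + 1;
    printf("in log after write: %d\n",
           inode_in_log(&inode, &root, cur_transid));
    return 0;
}
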
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 23e7d36ff32..7eda483d7b5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
446 goto error; 446 goto error;
447 447
448 device->name = kstrdup(orig_dev->name, GFP_NOFS); 448 device->name = kstrdup(orig_dev->name, GFP_NOFS);
449 if (!device->name) 449 if (!device->name) {
450 kfree(device);
450 goto error; 451 goto error;
452 }
451 453
452 device->devid = orig_dev->devid; 454 device->devid = orig_dev->devid;
453 device->work.func = pending_bios_fn; 455 device->work.func = pending_bios_fn;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a9d3bf4d268..b6dd5967c48 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -260,7 +260,7 @@ err:
260 * attributes are handled directly. 260 * attributes are handled directly.
261 */ 261 */
262struct xattr_handler *btrfs_xattr_handlers[] = { 262struct xattr_handler *btrfs_xattr_handlers[] = {
263#ifdef CONFIG_FS_POSIX_ACL 263#ifdef CONFIG_BTRFS_FS_POSIX_ACL
264 &btrfs_xattr_acl_access_handler, 264 &btrfs_xattr_acl_access_handler,
265 &btrfs_xattr_acl_default_handler, 265 &btrfs_xattr_acl_default_handler,
266#endif 266#endif
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 431accd475a..27089311fbe 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -114,8 +114,9 @@ nomem_lookup_data:
114 114
115/* 115/*
116 * attempt to look up the nominated node in this cache 116 * attempt to look up the nominated node in this cache
117 * - return -ETIMEDOUT to be scheduled again
117 */ 118 */
118static void cachefiles_lookup_object(struct fscache_object *_object) 119static int cachefiles_lookup_object(struct fscache_object *_object)
119{ 120{
120 struct cachefiles_lookup_data *lookup_data; 121 struct cachefiles_lookup_data *lookup_data;
121 struct cachefiles_object *parent, *object; 122 struct cachefiles_object *parent, *object;
@@ -145,13 +146,15 @@ static void cachefiles_lookup_object(struct fscache_object *_object)
145 object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) 146 object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
146 cachefiles_attr_changed(&object->fscache); 147 cachefiles_attr_changed(&object->fscache);
147 148
148 if (ret < 0) { 149 if (ret < 0 && ret != -ETIMEDOUT) {
149 printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n", 150 if (ret != -ENOBUFS)
150 ret); 151 printk(KERN_WARNING
152 "CacheFiles: Lookup failed error %d\n", ret);
151 fscache_object_lookup_error(&object->fscache); 153 fscache_object_lookup_error(&object->fscache);
152 } 154 }
153 155
154 _leave(" [%d]", ret); 156 _leave(" [%d]", ret);
157 return ret;
155} 158}
156 159
157/* 160/*
@@ -331,6 +334,7 @@ static void cachefiles_put_object(struct fscache_object *_object)
331 } 334 }
332 335
333 cache = object->fscache.cache; 336 cache = object->fscache.cache;
337 fscache_object_destroy(&object->fscache);
334 kmem_cache_free(cachefiles_object_jar, object); 338 kmem_cache_free(cachefiles_object_jar, object);
335 fscache_object_destroyed(cache); 339 fscache_object_destroyed(cache);
336 } 340 }
@@ -403,12 +407,26 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
403 if (oi_size == ni_size) 407 if (oi_size == ni_size)
404 return 0; 408 return 0;
405 409
406 newattrs.ia_size = ni_size;
407 newattrs.ia_valid = ATTR_SIZE;
408
409 cachefiles_begin_secure(cache, &saved_cred); 410 cachefiles_begin_secure(cache, &saved_cred);
410 mutex_lock(&object->backer->d_inode->i_mutex); 411 mutex_lock(&object->backer->d_inode->i_mutex);
412
413 /* if there's an extension to a partial page at the end of the backing
414 * file, we need to discard the partial page so that we pick up new
415 * data after it */
416 if (oi_size & ~PAGE_MASK && ni_size > oi_size) {
417 _debug("discard tail %llx", oi_size);
418 newattrs.ia_valid = ATTR_SIZE;
419 newattrs.ia_size = oi_size & PAGE_MASK;
420 ret = notify_change(object->backer, &newattrs);
421 if (ret < 0)
422 goto truncate_failed;
423 }
424
425 newattrs.ia_valid = ATTR_SIZE;
426 newattrs.ia_size = ni_size;
411 ret = notify_change(object->backer, &newattrs); 427 ret = notify_change(object->backer, &newattrs);
428
429truncate_failed:
412 mutex_unlock(&object->backer->d_inode->i_mutex); 430 mutex_unlock(&object->backer->d_inode->i_mutex);
413 cachefiles_end_secure(cache, saved_cred); 431 cachefiles_end_secure(cache, saved_cred);
414 432
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 4ce818ae39e..14ac4806e29 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -21,17 +21,81 @@
21#include <linux/security.h> 21#include <linux/security.h>
22#include "internal.h" 22#include "internal.h"
23 23
24static int cachefiles_wait_bit(void *flags) 24#define CACHEFILES_KEYBUF_SIZE 512
25
26/*
27 * dump debugging info about an object
28 */
29static noinline
30void __cachefiles_printk_object(struct cachefiles_object *object,
31 const char *prefix,
32 u8 *keybuf)
25{ 33{
26 schedule(); 34 struct fscache_cookie *cookie;
27 return 0; 35 unsigned keylen, loop;
36
37 printk(KERN_ERR "%sobject: OBJ%x\n",
38 prefix, object->fscache.debug_id);
39 printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n",
40 prefix, fscache_object_states[object->fscache.state],
41 object->fscache.flags, object->fscache.work.flags,
42 object->fscache.events,
43 object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
44 printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
45 prefix, object->fscache.n_ops, object->fscache.n_in_progress,
46 object->fscache.n_exclusive);
47 printk(KERN_ERR "%sparent=%p\n",
48 prefix, object->fscache.parent);
49
50 spin_lock(&object->fscache.lock);
51 cookie = object->fscache.cookie;
52 if (cookie) {
53 printk(KERN_ERR "%scookie=%p [pr=%p nd=%p fl=%lx]\n",
54 prefix,
55 object->fscache.cookie,
56 object->fscache.cookie->parent,
57 object->fscache.cookie->netfs_data,
58 object->fscache.cookie->flags);
59 if (keybuf)
60 keylen = cookie->def->get_key(cookie->netfs_data, keybuf,
61 CACHEFILES_KEYBUF_SIZE);
62 else
63 keylen = 0;
64 } else {
65 printk(KERN_ERR "%scookie=NULL\n", prefix);
66 keylen = 0;
67 }
68 spin_unlock(&object->fscache.lock);
69
70 if (keylen) {
71 printk(KERN_ERR "%skey=[%u] '", prefix, keylen);
72 for (loop = 0; loop < keylen; loop++)
73 printk("%02x", keybuf[loop]);
74 printk("'\n");
75 }
76}
77
78/*
79 * dump debugging info about a pair of objects
80 */
81static noinline void cachefiles_printk_object(struct cachefiles_object *object,
82 struct cachefiles_object *xobject)
83{
84 u8 *keybuf;
85
86 keybuf = kmalloc(CACHEFILES_KEYBUF_SIZE, GFP_NOIO);
87 if (object)
88 __cachefiles_printk_object(object, "", keybuf);
89 if (xobject)
90 __cachefiles_printk_object(xobject, "x", keybuf);
91 kfree(keybuf);
28} 92}
29 93
30/* 94/*
31 * record the fact that an object is now active 95 * record the fact that an object is now active
32 */ 96 */
33static void cachefiles_mark_object_active(struct cachefiles_cache *cache, 97static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
34 struct cachefiles_object *object) 98 struct cachefiles_object *object)
35{ 99{
36 struct cachefiles_object *xobject; 100 struct cachefiles_object *xobject;
37 struct rb_node **_p, *_parent = NULL; 101 struct rb_node **_p, *_parent = NULL;
@@ -42,8 +106,11 @@ static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
42try_again: 106try_again:
43 write_lock(&cache->active_lock); 107 write_lock(&cache->active_lock);
44 108
45 if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) 109 if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
110 printk(KERN_ERR "CacheFiles: Error: Object already active\n");
111 cachefiles_printk_object(object, NULL);
46 BUG(); 112 BUG();
113 }
47 114
48 dentry = object->dentry; 115 dentry = object->dentry;
49 _p = &cache->active_nodes.rb_node; 116 _p = &cache->active_nodes.rb_node;
@@ -66,8 +133,8 @@ try_again:
66 rb_insert_color(&object->active_node, &cache->active_nodes); 133 rb_insert_color(&object->active_node, &cache->active_nodes);
67 134
68 write_unlock(&cache->active_lock); 135 write_unlock(&cache->active_lock);
69 _leave(""); 136 _leave(" = 0");
70 return; 137 return 0;
71 138
72 /* an old object from a previous incarnation is hogging the slot - we 139 /* an old object from a previous incarnation is hogging the slot - we
73 * need to wait for it to be destroyed */ 140 * need to wait for it to be destroyed */
@@ -76,44 +143,70 @@ wait_for_old_object:
76 printk(KERN_ERR "\n"); 143 printk(KERN_ERR "\n");
77 printk(KERN_ERR "CacheFiles: Error:" 144 printk(KERN_ERR "CacheFiles: Error:"
78 " Unexpected object collision\n"); 145 " Unexpected object collision\n");
79 printk(KERN_ERR "xobject: OBJ%x\n", 146 cachefiles_printk_object(object, xobject);
80 xobject->fscache.debug_id);
81 printk(KERN_ERR "xobjstate=%s\n",
82 fscache_object_states[xobject->fscache.state]);
83 printk(KERN_ERR "xobjflags=%lx\n", xobject->fscache.flags);
84 printk(KERN_ERR "xobjevent=%lx [%lx]\n",
85 xobject->fscache.events, xobject->fscache.event_mask);
86 printk(KERN_ERR "xops=%u inp=%u exc=%u\n",
87 xobject->fscache.n_ops, xobject->fscache.n_in_progress,
88 xobject->fscache.n_exclusive);
89 printk(KERN_ERR "xcookie=%p [pr=%p nd=%p fl=%lx]\n",
90 xobject->fscache.cookie,
91 xobject->fscache.cookie->parent,
92 xobject->fscache.cookie->netfs_data,
93 xobject->fscache.cookie->flags);
94 printk(KERN_ERR "xparent=%p\n",
95 xobject->fscache.parent);
96 printk(KERN_ERR "object: OBJ%x\n",
97 object->fscache.debug_id);
98 printk(KERN_ERR "cookie=%p [pr=%p nd=%p fl=%lx]\n",
99 object->fscache.cookie,
100 object->fscache.cookie->parent,
101 object->fscache.cookie->netfs_data,
102 object->fscache.cookie->flags);
103 printk(KERN_ERR "parent=%p\n",
104 object->fscache.parent);
105 BUG(); 147 BUG();
106 } 148 }
107 atomic_inc(&xobject->usage); 149 atomic_inc(&xobject->usage);
108 write_unlock(&cache->active_lock); 150 write_unlock(&cache->active_lock);
109 151
110 _debug(">>> wait"); 152 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
111 wait_on_bit(&xobject->flags, CACHEFILES_OBJECT_ACTIVE, 153 wait_queue_head_t *wq;
112 cachefiles_wait_bit, TASK_UNINTERRUPTIBLE); 154
113 _debug("<<< waited"); 155 signed long timeout = 60 * HZ;
156 wait_queue_t wait;
157 bool requeue;
158
159 /* if the object we're waiting for is queued for processing,
160 * then just put ourselves on the queue behind it */
161 if (slow_work_is_queued(&xobject->fscache.work)) {
162 _debug("queue OBJ%x behind OBJ%x immediately",
163 object->fscache.debug_id,
164 xobject->fscache.debug_id);
165 goto requeue;
166 }
167
168 /* otherwise we sleep until either the object we're waiting for
169 * is done, or the slow-work facility wants the thread back to
170 * do other work */
171 wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE);
172 init_wait(&wait);
173 requeue = false;
174 do {
175 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
176 if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags))
177 break;
178 requeue = slow_work_sleep_till_thread_needed(
179 &object->fscache.work, &timeout);
180 } while (timeout > 0 && !requeue);
181 finish_wait(wq, &wait);
182
183 if (requeue &&
184 test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) {
185 _debug("queue OBJ%x behind OBJ%x after wait",
186 object->fscache.debug_id,
187 xobject->fscache.debug_id);
188 goto requeue;
189 }
190
191 if (timeout <= 0) {
192 printk(KERN_ERR "\n");
193 printk(KERN_ERR "CacheFiles: Error: Overlong"
194 " wait for old active object to go away\n");
195 cachefiles_printk_object(object, xobject);
196 goto requeue;
197 }
198 }
199
200 ASSERT(!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags));
114 201
115 cache->cache.ops->put_object(&xobject->fscache); 202 cache->cache.ops->put_object(&xobject->fscache);
116 goto try_again; 203 goto try_again;
204
205requeue:
206 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
207 cache->cache.ops->put_object(&xobject->fscache);
208 _leave(" = -ETIMEDOUT");
209 return -ETIMEDOUT;
117} 210}
118 211
119/* 212/*
@@ -254,7 +347,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
254 347
255 dir = dget_parent(object->dentry); 348 dir = dget_parent(object->dentry);
256 349
257 mutex_lock(&dir->d_inode->i_mutex); 350 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
258 ret = cachefiles_bury_object(cache, dir, object->dentry); 351 ret = cachefiles_bury_object(cache, dir, object->dentry);
259 352
260 dput(dir); 353 dput(dir);
@@ -307,7 +400,7 @@ lookup_again:
307 /* search the current directory for the element name */ 400 /* search the current directory for the element name */
308 _debug("lookup '%s'", name); 401 _debug("lookup '%s'", name);
309 402
310 mutex_lock(&dir->d_inode->i_mutex); 403 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
311 404
312 start = jiffies; 405 start = jiffies;
313 next = lookup_one_len(name, dir, nlen); 406 next = lookup_one_len(name, dir, nlen);
@@ -418,12 +511,15 @@ lookup_again:
418 } 511 }
419 512
420 /* note that we're now using this object */ 513 /* note that we're now using this object */
421 cachefiles_mark_object_active(cache, object); 514 ret = cachefiles_mark_object_active(cache, object);
422 515
423 mutex_unlock(&dir->d_inode->i_mutex); 516 mutex_unlock(&dir->d_inode->i_mutex);
424 dput(dir); 517 dput(dir);
425 dir = NULL; 518 dir = NULL;
426 519
520 if (ret == -ETIMEDOUT)
521 goto mark_active_timed_out;
522
427 _debug("=== OBTAINED_OBJECT ==="); 523 _debug("=== OBTAINED_OBJECT ===");
428 524
429 if (object->new) { 525 if (object->new) {
@@ -467,6 +563,10 @@ create_error:
467 cachefiles_io_error(cache, "Create/mkdir failed"); 563 cachefiles_io_error(cache, "Create/mkdir failed");
468 goto error; 564 goto error;
469 565
566mark_active_timed_out:
567 _debug("mark active timed out");
568 goto release_dentry;
569
470check_error: 570check_error:
471 _debug("check error %d", ret); 571 _debug("check error %d", ret);
472 write_lock(&cache->active_lock); 572 write_lock(&cache->active_lock);
@@ -474,7 +574,7 @@ check_error:
474 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags); 574 clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
475 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE); 575 wake_up_bit(&object->flags, CACHEFILES_OBJECT_ACTIVE);
476 write_unlock(&cache->active_lock); 576 write_unlock(&cache->active_lock);
477 577release_dentry:
478 dput(object->dentry); 578 dput(object->dentry);
479 object->dentry = NULL; 579 object->dentry = NULL;
480 goto error_out; 580 goto error_out;
@@ -495,9 +595,6 @@ error:
495error_out2: 595error_out2:
496 dput(dir); 596 dput(dir);
497error_out: 597error_out:
498 if (ret == -ENOSPC)
499 ret = -ENOBUFS;
500
501 _leave(" = error %d", -ret); 598 _leave(" = error %d", -ret);
502 return ret; 599 return ret;
503} 600}
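
cachefiles_mark_object_active() no longer waits forever for a colliding object from a previous incarnation: it either queues itself behind the old object via the slow-work facility or sleeps on the flag bit with a 60 second ceiling, returning -ETIMEDOUT so the lookup can be rescheduled. A pthread-based userspace analogue of just the bounded wait (compile with -pthread); the slow-work requeue part is left out, and the lock, condition variable and old_object_active flag are invented for the sketch.

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool old_object_active = true;

/* Wait for the old object to go away, but give up after timeout_secs and
 * let the caller retry, mirroring the -ETIMEDOUT path in the patch. */
static int wait_for_old_object(long timeout_secs)
{
    struct timespec deadline;
    int ret = 0;

    clock_gettime(CLOCK_REALTIME, &deadline);
    deadline.tv_sec += timeout_secs;

    pthread_mutex_lock(&lock);
    while (old_object_active) {
        if (pthread_cond_timedwait(&cond, &lock, &deadline) == ETIMEDOUT) {
            ret = -ETIMEDOUT;
            break;
        }
    }
    pthread_mutex_unlock(&lock);
    return ret;
}

static void *retire_old_object(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    old_object_active = false;
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, retire_old_object, NULL);
    printf("wait returned %d\n", wait_for_old_object(60));
    pthread_join(t, NULL);
    return 0;
}
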
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index a69787e7dd9..a6c8c6fe8df 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/mount.h> 12#include <linux/mount.h>
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/ima.h>
14#include "internal.h" 15#include "internal.h"
15 16
16/* 17/*
@@ -40,8 +41,10 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
40 41
41 _debug("--- monitor %p %lx ---", page, page->flags); 42 _debug("--- monitor %p %lx ---", page, page->flags);
42 43
43 if (!PageUptodate(page) && !PageError(page)) 44 if (!PageUptodate(page) && !PageError(page)) {
 44 dump_stack(); 45 /* unlocked, not uptodate and not erroneous? */
46 _debug("page probably truncated");
47 }
45 48
46 /* remove from the waitqueue */ 49 /* remove from the waitqueue */
47 list_del(&wait->task_list); 50 list_del(&wait->task_list);
@@ -61,6 +64,84 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode,
61} 64}
62 65
63/* 66/*
67 * handle a probably truncated page
68 * - check to see if the page is still relevant and reissue the read if
69 * possible
70 * - return -EIO on error, -ENODATA if the page is gone, -EINPROGRESS if we
71 * must wait again and 0 if successful
72 */
73static int cachefiles_read_reissue(struct cachefiles_object *object,
74 struct cachefiles_one_read *monitor)
75{
76 struct address_space *bmapping = object->backer->d_inode->i_mapping;
77 struct page *backpage = monitor->back_page, *backpage2;
78 int ret;
79
80 kenter("{ino=%lx},{%lx,%lx}",
81 object->backer->d_inode->i_ino,
82 backpage->index, backpage->flags);
83
84 /* skip if the page was truncated away completely */
85 if (backpage->mapping != bmapping) {
86 kleave(" = -ENODATA [mapping]");
87 return -ENODATA;
88 }
89
90 backpage2 = find_get_page(bmapping, backpage->index);
91 if (!backpage2) {
92 kleave(" = -ENODATA [gone]");
93 return -ENODATA;
94 }
95
96 if (backpage != backpage2) {
97 put_page(backpage2);
98 kleave(" = -ENODATA [different]");
99 return -ENODATA;
100 }
101
102 /* the page is still there and we already have a ref on it, so we don't
103 * need a second */
104 put_page(backpage2);
105
106 INIT_LIST_HEAD(&monitor->op_link);
107 add_page_wait_queue(backpage, &monitor->monitor);
108
109 if (trylock_page(backpage)) {
110 ret = -EIO;
111 if (PageError(backpage))
112 goto unlock_discard;
113 ret = 0;
114 if (PageUptodate(backpage))
115 goto unlock_discard;
116
117 kdebug("reissue read");
118 ret = bmapping->a_ops->readpage(NULL, backpage);
119 if (ret < 0)
120 goto unlock_discard;
121 }
122
123 /* but the page may have been read before the monitor was installed, so
124 * the monitor may miss the event - so we have to ensure that we do get
125 * one in such a case */
126 if (trylock_page(backpage)) {
127 _debug("jumpstart %p {%lx}", backpage, backpage->flags);
128 unlock_page(backpage);
129 }
130
131 /* it'll reappear on the todo list */
132 kleave(" = -EINPROGRESS");
133 return -EINPROGRESS;
134
135unlock_discard:
136 unlock_page(backpage);
137 spin_lock_irq(&object->work_lock);
138 list_del(&monitor->op_link);
139 spin_unlock_irq(&object->work_lock);
140 kleave(" = %d", ret);
141 return ret;
142}
143
144/*
64 * copy data from backing pages to netfs pages to complete a read operation 145 * copy data from backing pages to netfs pages to complete a read operation
65 * - driven by FS-Cache's thread pool 146 * - driven by FS-Cache's thread pool
66 */ 147 */
@@ -92,20 +173,26 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
92 173
93 _debug("- copy {%lu}", monitor->back_page->index); 174 _debug("- copy {%lu}", monitor->back_page->index);
94 175
95 error = -EIO; 176 recheck:
96 if (PageUptodate(monitor->back_page)) { 177 if (PageUptodate(monitor->back_page)) {
97 copy_highpage(monitor->netfs_page, monitor->back_page); 178 copy_highpage(monitor->netfs_page, monitor->back_page);
98 179
99 pagevec_add(&pagevec, monitor->netfs_page); 180 pagevec_add(&pagevec, monitor->netfs_page);
100 fscache_mark_pages_cached(monitor->op, &pagevec); 181 fscache_mark_pages_cached(monitor->op, &pagevec);
101 error = 0; 182 error = 0;
102 } 183 } else if (!PageError(monitor->back_page)) {
103 184 /* the page has probably been truncated */
104 if (error) 185 error = cachefiles_read_reissue(object, monitor);
186 if (error == -EINPROGRESS)
187 goto next;
188 goto recheck;
189 } else {
105 cachefiles_io_error_obj( 190 cachefiles_io_error_obj(
106 object, 191 object,
107 "Readpage failed on backing file %lx", 192 "Readpage failed on backing file %lx",
108 (unsigned long) monitor->back_page->flags); 193 (unsigned long) monitor->back_page->flags);
194 error = -EIO;
195 }
109 196
110 page_cache_release(monitor->back_page); 197 page_cache_release(monitor->back_page);
111 198
@@ -114,6 +201,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
114 fscache_put_retrieval(op); 201 fscache_put_retrieval(op);
115 kfree(monitor); 202 kfree(monitor);
116 203
204 next:
117 /* let the thread pool have some air occasionally */ 205 /* let the thread pool have some air occasionally */
118 max--; 206 max--;
119 if (max < 0 || need_resched()) { 207 if (max < 0 || need_resched()) {
@@ -333,7 +421,8 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
333 421
334 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; 422 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
335 423
336 op->op.flags = FSCACHE_OP_FAST; 424 op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
425 op->op.flags |= FSCACHE_OP_FAST;
337 op->op.processor = cachefiles_read_copier; 426 op->op.processor = cachefiles_read_copier;
338 427
339 pagevec_init(&pagevec, 0); 428 pagevec_init(&pagevec, 0);
@@ -639,7 +728,8 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
639 728
640 pagevec_init(&pagevec, 0); 729 pagevec_init(&pagevec, 0);
641 730
642 op->op.flags = FSCACHE_OP_FAST; 731 op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
732 op->op.flags |= FSCACHE_OP_FAST;
643 op->op.processor = cachefiles_read_copier; 733 op->op.processor = cachefiles_read_copier;
644 734
645 INIT_LIST_HEAD(&backpages); 735 INIT_LIST_HEAD(&backpages);
@@ -801,7 +891,8 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
801 struct cachefiles_cache *cache; 891 struct cachefiles_cache *cache;
802 mm_segment_t old_fs; 892 mm_segment_t old_fs;
803 struct file *file; 893 struct file *file;
804 loff_t pos; 894 loff_t pos, eof;
895 size_t len;
805 void *data; 896 void *data;
806 int ret; 897 int ret;
807 898
@@ -832,18 +923,33 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
832 if (IS_ERR(file)) { 923 if (IS_ERR(file)) {
833 ret = PTR_ERR(file); 924 ret = PTR_ERR(file);
834 } else { 925 } else {
926 ima_counts_get(file);
835 ret = -EIO; 927 ret = -EIO;
836 if (file->f_op->write) { 928 if (file->f_op->write) {
837 pos = (loff_t) page->index << PAGE_SHIFT; 929 pos = (loff_t) page->index << PAGE_SHIFT;
930
931 /* we mustn't write more data than we have, so we have
932 * to beware of a partial page at EOF */
933 eof = object->fscache.store_limit_l;
934 len = PAGE_SIZE;
935 if (eof & ~PAGE_MASK) {
936 ASSERTCMP(pos, <, eof);
937 if (eof - pos < PAGE_SIZE) {
938 _debug("cut short %llx to %llx",
939 pos, eof);
940 len = eof - pos;
941 ASSERTCMP(pos + len, ==, eof);
942 }
943 }
944
838 data = kmap(page); 945 data = kmap(page);
839 old_fs = get_fs(); 946 old_fs = get_fs();
840 set_fs(KERNEL_DS); 947 set_fs(KERNEL_DS);
841 ret = file->f_op->write( 948 ret = file->f_op->write(
842 file, (const void __user *) data, PAGE_SIZE, 949 file, (const void __user *) data, len, &pos);
843 &pos);
844 set_fs(old_fs); 950 set_fs(old_fs);
845 kunmap(page); 951 kunmap(page);
846 if (ret != PAGE_SIZE) 952 if (ret != len)
847 ret = -EIO; 953 ret = -EIO;
848 } 954 }
849 fput(file); 955 fput(file);
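
cachefiles_write_page() now clamps the write length so it never writes past the object's store limit: when the object ends in a partial page and this is that final page, only eof - pos bytes are written to the backing file instead of a full PAGE_SIZE. A small sketch of just the length calculation, assuming a hypothetical 4096-byte page size.

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE_SKETCH 4096UL
#define PAGE_MASK_SKETCH (~(PAGE_SIZE_SKETCH - 1))

/* How many bytes of the page starting at 'pos' should be written, given
 * that the cached object ends at 'eof'? */
static unsigned long write_len(unsigned long pos, unsigned long eof)
{
    unsigned long len = PAGE_SIZE_SKETCH;

    /* only the object's final, partial page is ever cut short */
    if (eof & ~PAGE_MASK_SKETCH) {
        assert(pos < eof);
        if (eof - pos < PAGE_SIZE_SKETCH)
            len = eof - pos;
    }
    return len;
}

int main(void)
{
    /* a 10000-byte object: the pages at 0 and 4096 are written in full,
     * the page at 8192 only holds 1808 bytes */
    printf("%lu %lu %lu\n",
           write_len(0, 10000),
           write_len(4096, 10000),
           write_len(8192, 10000));
    return 0;
}
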
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 145540a316a..094ea65afc8 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,12 @@
1Version 1.61
2------------
3Fix append problem to Samba servers (files opened with O_APPEND could
4have duplicated data). Fix oops in cifs_lookup. Workaround problem
5mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
6Disable use of server inode numbers when server only
7partially supports them (e.g. for one server querying inode numbers on
 8FindFirst fails but QPathInfo queries work).
9
1Version 1.60 10Version 1.60
2------------- 11-------------
3Fix memory leak in reconnect. Fix oops in DFS mount error path. 12Fix memory leak in reconnect. Fix oops in DFS mount error path.
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 9a5e4f5f312..29f1da761bb 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1037,7 +1037,7 @@ init_cifs(void)
1037 if (rc) 1037 if (rc)
1038 goto out_unregister_key_type; 1038 goto out_unregister_key_type;
1039#endif 1039#endif
1040 rc = slow_work_register_user(); 1040 rc = slow_work_register_user(THIS_MODULE);
1041 if (rc) 1041 if (rc)
1042 goto out_unregister_resolver_key; 1042 goto out_unregister_resolver_key;
1043 1043
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 6928c24d1d4..5646727e33f 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -388,4 +388,5 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
388 const struct nls_table *nls_codepage, int remap_special_chars); 388 const struct nls_table *nls_codepage, int remap_special_chars);
389extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon, 389extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
390 const int netfid, __u64 *pExtAttrBits, __u64 *pMask); 390 const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
391extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
391#endif /* _CIFSPROTO_H */ 392#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 43003e0bef1..63ea83ff687 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1577,7 +1577,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1577 1577
1578out_err: 1578out_err:
1579 if (tcp_ses) { 1579 if (tcp_ses) {
1580 kfree(tcp_ses->hostname); 1580 if (!IS_ERR(tcp_ses->hostname))
1581 kfree(tcp_ses->hostname);
1581 if (tcp_ses->ssocket) 1582 if (tcp_ses->ssocket)
1582 sock_release(tcp_ses->ssocket); 1583 sock_release(tcp_ses->ssocket);
1583 kfree(tcp_ses); 1584 kfree(tcp_ses);
@@ -2219,16 +2220,8 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon,
2219 struct cifs_sb_info *cifs_sb, const char *full_path) 2220 struct cifs_sb_info *cifs_sb, const char *full_path)
2220{ 2221{
2221 int rc; 2222 int rc;
2222 __u64 inode_num;
2223 FILE_ALL_INFO *pfile_info; 2223 FILE_ALL_INFO *pfile_info;
2224 2224
2225 rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num,
2226 cifs_sb->local_nls,
2227 cifs_sb->mnt_cifs_flags &
2228 CIFS_MOUNT_MAP_SPECIAL_CHR);
2229 if (rc != -EOPNOTSUPP)
2230 return rc;
2231
2232 pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); 2225 pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
2233 if (pfile_info == NULL) 2226 if (pfile_info == NULL)
2234 return -ENOMEM; 2227 return -ENOMEM;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 627a60a6c1b..1f42f772865 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -214,8 +214,6 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 		posix_flags |= SMB_O_EXCL;
 	if (oflags & O_TRUNC)
 		posix_flags |= SMB_O_TRUNC;
-	if (oflags & O_APPEND)
-		posix_flags |= SMB_O_APPEND;
 	if (oflags & O_SYNC)
 		posix_flags |= SMB_O_SYNC;
 	if (oflags & O_DIRECTORY)
@@ -643,9 +641,9 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	 * O_EXCL: optimize away the lookup, but don't hash the dentry. Let
 	 * the VFS handle the create.
 	 */
-	if (nd->flags & LOOKUP_EXCL) {
+	if (nd && (nd->flags & LOOKUP_EXCL)) {
 		d_instantiate(direntry, NULL);
-		return 0;
+		return NULL;
 	}
 
 	/* can not grab the rename sem here since it would
@@ -675,7 +673,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	 * reduction in network traffic in the other paths.
 	 */
 	if (pTcon->unix_ext) {
-		if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
+		if (nd && !(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
 		    (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
 		    (nd->intent.open.flags & O_CREAT)) {
 			rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 5e2492535da..cababd8a52d 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -512,13 +512,10 @@ int cifs_get_inode_info(struct inode **pinode,
 					cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc1) {
+		if (rc1 || !fattr.cf_uniqueid) {
 			cFYI(1, ("GetSrvInodeNum rc %d", rc1));
 			fattr.cf_uniqueid = iunique(sb, ROOT_I);
-			/* disable serverino if call not supported */
-			if (rc1 == -EINVAL)
-				cifs_sb->mnt_cifs_flags &=
-					~CIFS_MOUNT_SERVER_INUM;
+			cifs_autodisable_serverino(cifs_sb);
 		}
 	} else {
 		fattr.cf_uniqueid = iunique(sb, ROOT_I);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 0241b25ac33..d27d4ec6579 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -715,3 +715,17 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
 ctoUCS_out:
 	return i;
 }
+
+void
+cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
+{
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
+		cERROR(1, ("Autodisabling the use of server inode numbers on "
+			   "%s. This server doesn't seem to support them "
+			   "properly. Hardlinks will not be recognized on this "
+			   "mount. Consider mounting with the \"noserverino\" "
+			   "option to silence this message.",
+			   cifs_sb->tcon->treeName));
+	}
+}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 1f098ca7163..f84062f9a98 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -727,11 +727,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
 		cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *)
 				       pfindEntry, cifs_sb);
 
-	/* FIXME: make _to_fattr functions fill this out */
-	if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
+	if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
 		fattr.cf_uniqueid = inum;
-	else
+	} else {
 		fattr.cf_uniqueid = iunique(sb, ROOT_I);
+		cifs_autodisable_serverino(cifs_sb);
+	}
 
 	ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
 	tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
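
The cifs hunks above route every "server gave us no usable inode number" case through the new cifs_autodisable_serverino() helper. The following standalone C sketch is an editor-added illustration, not part of the commit; names such as pick_inode_number() and MOUNT_SERVER_INUM are invented. It models the pattern: trust the server-supplied number while the flag is set, otherwise clear the flag once, warn, and keep handing out locally generated numbers (the role iunique() plays in the kernel).

#include <stdio.h>

#define MOUNT_SERVER_INUM 0x1

struct mount_info {
	unsigned int flags;
	const char *tree_name;
};

/* Clear the flag a single time and warn; later calls are silent no-ops. */
static void autodisable_serverino(struct mount_info *m)
{
	if (m->flags & MOUNT_SERVER_INUM) {
		m->flags &= ~MOUNT_SERVER_INUM;
		fprintf(stderr,
			"disabling server inode numbers on %s; hardlinks will not be detected\n",
			m->tree_name);
	}
}

static unsigned long long pick_inode_number(struct mount_info *m,
					    unsigned long long server_inum,
					    unsigned long long *next_local)
{
	if (server_inum && (m->flags & MOUNT_SERVER_INUM))
		return server_inum;	/* trust the server-supplied number */
	autodisable_serverino(m);	/* warn once, then stay local */
	return (*next_local)++;		/* stand-in for iunique() */
}

int main(void)
{
	struct mount_info m = { MOUNT_SERVER_INUM, "\\\\server\\share" };
	unsigned long long next_local = 1000;

	printf("%llu\n", pick_inode_number(&m, 42, &next_local));	/* 42 */
	printf("%llu\n", pick_inode_number(&m, 0, &next_local));	/* falls back, warns once */
	printf("%llu\n", pick_inode_number(&m, 43, &next_local));	/* stays local from now on */
	return 0;
}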
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 0376ac66c44..be4392ca209 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -22,6 +22,7 @@
 #include <linux/kernel.h>
 #include <linux/major.h>
 #include <linux/time.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/ioport.h>
 #include <linux/fcntl.h>
diff --git a/fs/compat.c b/fs/compat.c
index d576b552e8e..6c19040ffee 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1532,6 +1532,8 @@ int compat_do_execve(char * filename,
 	if (retval < 0)
 		goto out;
 
+	current->stack_start = current->mm->start_stack;
+
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index f91fd51b32e..d84e7058c29 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1800,7 +1800,7 @@ struct space_resv_32 {
 /* just account for different alignment */
 static int compat_ioctl_preallocate(struct file *file, unsigned long arg)
 {
-	struct space_resv_32 __user *p32 = (void __user *)arg;
+	struct space_resv_32 __user *p32 = compat_ptr(arg);
 	struct space_resv __user *p = compat_alloc_user_space(sizeof(*p));
 
 	if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) ||
@@ -2802,7 +2802,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 #else
 	case FS_IOC_RESVSP:
 	case FS_IOC_RESVSP64:
-		error = ioctl_preallocate(filp, (void __user *)arg);
+		error = ioctl_preallocate(filp, compat_ptr(arg));
 		goto out_fput;
 #endif
 
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 240cef14fe5..70736eb4b51 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -316,6 +316,10 @@ int dlm_lowcomms_connect_node(int nodeid)
 {
 	struct connection *con;
 
+	/* with sctp there's no connecting without sending */
+	if (dlm_config.ci_protocol != 0)
+		return 0;
+
 	if (nodeid == dlm_our_nodeid())
 		return 0;
 
@@ -455,9 +459,9 @@ static void process_sctp_notification(struct connection *con,
 	int prim_len, ret;
 	int addr_len;
 	struct connection *new_con;
-	struct file *file;
 	sctp_peeloff_arg_t parg;
 	int parglen = sizeof(parg);
+	int err;
 
 	/*
 	 * We get this before any data for an association.
@@ -512,19 +516,22 @@ static void process_sctp_notification(struct connection *con,
 		ret = kernel_getsockopt(con->sock, IPPROTO_SCTP,
 					SCTP_SOCKOPT_PEELOFF,
 					(void *)&parg, &parglen);
-		if (ret) {
+		if (ret < 0) {
 			log_print("Can't peel off a socket for "
-				  "connection %d to node %d: err=%d\n",
+				  "connection %d to node %d: err=%d",
 				  parg.associd, nodeid, ret);
+			return;
+		}
+		new_con->sock = sockfd_lookup(parg.sd, &err);
+		if (!new_con->sock) {
+			log_print("sockfd_lookup error %d", err);
+			return;
 		}
-		file = fget(parg.sd);
-		new_con->sock = SOCKET_I(file->f_dentry->d_inode);
 		add_sock(new_con->sock, new_con);
-		fput(file);
-		put_unused_fd(parg.sd);
+		sockfd_put(new_con->sock);
 
-		log_print("got new/restarted association %d nodeid %d",
-			  (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+		log_print("connecting to %d sctp association %d",
+			  nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
 
 		/* Send any pending writes */
 		clear_bit(CF_CONNECT_PENDING, &new_con->flags);
@@ -837,8 +844,6 @@ static void sctp_init_assoc(struct connection *con)
 	if (con->retries++ > MAX_CONNECT_RETRIES)
 		return;
 
-	log_print("Initiating association with node %d", con->nodeid);
-
 	if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) {
 		log_print("no address for nodeid %d", con->nodeid);
 		return;
@@ -855,11 +860,14 @@ static void sctp_init_assoc(struct connection *con)
 	outmessage.msg_flags = MSG_EOR;
 
 	spin_lock(&con->writequeue_lock);
-	e = list_entry(con->writequeue.next, struct writequeue_entry,
-		       list);
 
-	BUG_ON((struct list_head *) e == &con->writequeue);
+	if (list_empty(&con->writequeue)) {
+		spin_unlock(&con->writequeue_lock);
+		log_print("writequeue empty for nodeid %d", con->nodeid);
+		return;
+	}
 
+	e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
 	len = e->len;
 	offset = e->offset;
 	spin_unlock(&con->writequeue_lock);
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 8aadb99b763..1cd6d9d3e29 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,8 +1,9 @@
 config ECRYPT_FS
 	tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && KEYS && NET
+	depends on EXPERIMENTAL && KEYS && CRYPTO
 	select CRYPTO_ECB
 	select CRYPTO_CBC
+	select CRYPTO_MD5
 	help
 	  Encrypted filesystem that operates on the VFS layer. See
 	  <file:Documentation/filesystems/ecryptfs.txt> to learn more about
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 101fe4c7b1e..c6ac85d6c70 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
 #include <linux/key.h>
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
+#include <linux/ima.h>
 #include "ecryptfs_kernel.h"
 
 /**
@@ -118,6 +119,7 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
 	const struct cred *cred = current_cred();
 	struct ecryptfs_inode_info *inode_info =
 		ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
+	int opened_lower_file = 0;
 	int rc = 0;
 
 	mutex_lock(&inode_info->lower_file_mutex);
@@ -134,9 +136,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
 				"for lower_dentry [0x%p] and lower_mnt [0x%p]; "
 				"rc = [%d]\n", lower_dentry, lower_mnt, rc);
 			inode_info->lower_file = NULL;
-		}
+		} else
+			opened_lower_file = 1;
 	}
 	mutex_unlock(&inode_info->lower_file_mutex);
+	if (opened_lower_file)
+		ima_counts_get(inode_info->lower_file);
 	return rc;
 }
 
diff --git a/fs/exec.c b/fs/exec.c
index d164342c2b6..c0c636e34f6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -623,10 +623,8 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	/* Move stack pages down in memory. */
 	if (stack_shift) {
 		ret = shift_arg_pages(vma, stack_shift);
-		if (ret) {
-			up_write(&mm->mmap_sem);
-			return ret;
-		}
+		if (ret)
+			goto out_unlock;
 	}
 
 #ifdef CONFIG_STACK_GROWSUP
@@ -640,7 +638,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
 
 out_unlock:
 	up_write(&mm->mmap_sem);
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(setup_arg_pages);
 
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 451d166bbe9..8209f266e9a 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -46,19 +46,21 @@
 int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
 	int ret = 0;
+	tid_t commit_tid;
+
+	if (inode->i_sb->s_flags & MS_RDONLY)
+		return 0;
 
 	J_ASSERT(ext3_journal_current_handle() == NULL);
 
 	/*
-	 * data=writeback:
+	 * data=writeback,ordered:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
-	 *  sync_inode() will sync the metadata
-	 *
-	 * data=ordered:
-	 *  The caller's filemap_fdatawrite() will write the data and
-	 *  sync_inode() will write the inode if it is dirty.  Then the caller's
-	 *  filemap_fdatawait() will wait on the pages.
+	 *  Metadata is in the journal, we wait for a proper transaction
+	 *  to commit here.
 	 *
 	 * data=journal:
 	 *  filemap_fdatawrite won't do anything (the buffers are clean).
@@ -73,22 +75,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		goto flush;
+	if (datasync)
+		commit_tid = atomic_read(&ei->i_datasync_tid);
+	else
+		commit_tid = atomic_read(&ei->i_sync_tid);
 
-	/*
-	 * The VFS has written the file data.  If the inode is unaltered
-	 * then we need not start a commit.
-	 */
-	if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
-		struct writeback_control wbc = {
-			.sync_mode = WB_SYNC_ALL,
-			.nr_to_write = 0, /* sys_fsync did this */
-		};
-		ret = sync_inode(inode, &wbc);
+	if (log_start_commit(journal, commit_tid)) {
+		log_wait_commit(journal, commit_tid);
 		goto out;
 	}
-flush:
+
 	/*
 	 * In case we didn't commit a transaction, we have to flush
 	 * disk caches manually so that data really is on persistent
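
To make the reworked ext3 fsync path above easier to follow, here is a small userspace model. This is an editor-added sketch, not kernel code; the struct names and tid_needs_commit() are invented. The idea it demonstrates: each inode remembers which journal transaction id (tid) must be committed for fsync (i_sync_tid) and for fdatasync (i_datasync_tid), and ext3_sync_file() only starts and waits for a commit when that tid is not yet on disk.

#include <stdio.h>

struct journal { unsigned int committed_tid; };
struct inode_tids { unsigned int sync_tid, datasync_tid; };

/* Wraparound-safe "is this tid newer than what the journal has committed?" */
static int tid_needs_commit(const struct journal *j, unsigned int tid)
{
	return (int)(tid - j->committed_tid) > 0;
}

static void sync_file(const struct journal *j, const struct inode_tids *i, int datasync)
{
	unsigned int tid = datasync ? i->datasync_tid : i->sync_tid;

	if (tid_needs_commit(j, tid))
		printf("commit and wait for tid %u\n", tid);	/* log_start_commit + log_wait_commit */
	else
		printf("tid %u already committed, just flush the disk cache\n", tid);
}

int main(void)
{
	struct journal j = { .committed_tid = 10 };
	struct inode_tids i = { .sync_tid = 12, .datasync_tid = 10 };

	sync_file(&j, &i, 1);	/* fdatasync: the data tid is already on disk */
	sync_file(&j, &i, 0);	/* fsync: inode metadata still needs tid 12 */
	return 0;
}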
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index acf1b142332..354ed3b47b3 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -699,8 +699,9 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 	int err = 0;
 	struct ext3_block_alloc_info *block_i;
 	ext3_fsblk_t current_block;
+	struct ext3_inode_info *ei = EXT3_I(inode);
 
-	block_i = EXT3_I(inode)->i_block_alloc_info;
+	block_i = ei->i_block_alloc_info;
 	/*
 	 * If we're splicing into a [td]indirect block (as opposed to the
 	 * inode) then we need to get write access to the [td]indirect block
@@ -741,6 +742,8 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	ext3_mark_inode_dirty(handle, inode);
+	/* ext3_mark_inode_dirty already updated i_sync_tid */
+	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
 
 	/* had we spliced it onto indirect block? */
 	if (where->bh) {
@@ -1735,6 +1738,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
 	ssize_t ret;
 	int orphan = 0;
 	size_t count = iov_length(iov, nr_segs);
+	int retries = 0;
 
 	if (rw == WRITE) {
 		loff_t final_size = offset + count;
@@ -1757,9 +1761,12 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
 		}
 	}
 
+retry:
 	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext3_get_block, NULL);
+	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
 
 	if (orphan) {
 		int err;
@@ -2750,6 +2757,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
 	struct ext3_inode_info *ei;
 	struct buffer_head *bh;
 	struct inode *inode;
+	journal_t *journal = EXT3_SB(sb)->s_journal;
+	transaction_t *transaction;
 	long ret;
 	int block;
 
@@ -2827,6 +2836,30 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
 		ei->i_data[block] = raw_inode->i_block[block];
 	INIT_LIST_HEAD(&ei->i_orphan);
 
+	/*
+	 * Set transaction id's of transactions that have to be committed
+	 * to finish f[data]sync. We set them to currently running transaction
+	 * as we cannot be sure that the inode or some of its metadata isn't
+	 * part of the transaction - the inode could have been reclaimed and
+	 * now it is reread from disk.
+	 */
+	if (journal) {
+		tid_t tid;
+
+		spin_lock(&journal->j_state_lock);
+		if (journal->j_running_transaction)
+			transaction = journal->j_running_transaction;
+		else
+			transaction = journal->j_committing_transaction;
+		if (transaction)
+			tid = transaction->t_tid;
+		else
+			tid = journal->j_commit_sequence;
+		spin_unlock(&journal->j_state_lock);
+		atomic_set(&ei->i_sync_tid, tid);
+		atomic_set(&ei->i_datasync_tid, tid);
+	}
+
 	if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
 	    EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
 		/*
@@ -3011,6 +3044,7 @@ again:
 		err = rc;
 	ei->i_state &= ~EXT3_STATE_NEW;
 
+	atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
out_brelse:
 	brelse (bh);
 	ext3_std_error(inode->i_sb, err);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 72743d36050..427496c4767 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -466,6 +466,8 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
 		return NULL;
 	ei->i_block_alloc_info = NULL;
 	ei->vfs_inode.i_version = 1;
+	atomic_set(&ei->i_datasync_tid, 0);
+	atomic_set(&ei->i_sync_tid, 0);
 	return &ei->vfs_inode;
 }
 
@@ -2321,7 +2323,18 @@ static int ext3_commit_super(struct super_block *sb,
 
 	if (!sbh)
 		return error;
-	es->s_wtime = cpu_to_le32(get_seconds());
+	/*
+	 * If the file system is mounted read-only, don't update the
+	 * superblock write time.  This avoids updating the superblock
+	 * write time when we are mounting the root file system
+	 * read/only but we need to replay the journal; at that point,
+	 * for people who are east of GMT and who make their clock
+	 * tick in localtime for Windows bug-for-bug compatibility,
+	 * the clock is set in the future, and this will cause e2fsck
+	 * to complain and force a full file system check.
+	 */
+	if (!(sb->s_flags & MS_RDONLY))
+		es->s_wtime = cpu_to_le32(get_seconds());
 	es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
 	es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
 	BUFFER_TRACE(sbh, "marking dirty");
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index d5c0ea2e8f2..9f2d45d75b1 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,20 +26,6 @@ config EXT4_FS
 
 	  If unsure, say N.
 
-config EXT4DEV_COMPAT
-	bool "Enable ext4dev compatibility"
-	depends on EXT4_FS
-	help
-	  Starting with 2.6.28, the name of the ext4 filesystem was
-	  renamed from ext4dev to ext4.  Unfortunately there are some
-	  legacy userspace programs (such as klibc's fstype) have
-	  "ext4dev" hardcoded.
-
-	  To enable backwards compatibility so that systems that are
-	  still expecting to mount ext4 filesystems using ext4dev,
-	  choose Y here.  This feature will go away by 2.6.31, so
-	  please arrange to get your userspace programs fixed!
-
 config EXT4_FS_XATTR
 	bool "Ext4 extended attributes"
 	depends on EXT4_FS
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e227eea23f0..8825515eedd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t;
 /* data type for block group number */
 typedef unsigned int ext4_group_t;
 
+/*
+ * Flags used in mballoc's allocation_context flags field.
+ *
+ * Also used to show what's going on for debugging purposes when the
+ * flag field is exported via the traceport interface
+ */
 
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE		0x0001
@@ -127,6 +133,16 @@ struct mpage_da_data {
 	int pages_written;
 	int retval;
 };
+#define	DIO_AIO_UNWRITTEN	0x1
+typedef struct ext4_io_end {
+	struct list_head	list;		/* per-file finished AIO list */
+	struct inode		*inode;		/* file being written to */
+	unsigned int		flag;		/* unwritten or not */
+	int			error;		/* I/O error code */
+	ext4_lblk_t		offset;		/* offset in the file */
+	size_t			size;		/* size of the extent */
+	struct work_struct	work;		/* data work queue */
+} ext4_io_end_t;
 
 /*
  * Special inodes numbers
@@ -306,6 +322,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 #define EXT4_STATE_NO_EXPAND		0x00000008 /* No space for expansion */
 #define EXT4_STATE_DA_ALLOC_CLOSE	0x00000010 /* Alloc DA blks on close */
 #define EXT4_STATE_EXT_MIGRATE		0x00000020 /* Inode is migrating */
+#define EXT4_STATE_DIO_UNWRITTEN	0x00000040 /* need convert on dio done*/
 
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
@@ -347,7 +364,16 @@ struct ext4_new_group_data {
 	/* Call ext4_da_update_reserve_space() after successfully
	   allocating the blocks */
 #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE	0x0008
-
+	/* caller is from the direct IO path; request creation of an
+	   uninitialized extent if not yet allocated, and split an
+	   uninitialized extent if blocks have been preallocated already */
+#define EXT4_GET_BLOCKS_DIO			0x0010
+#define EXT4_GET_BLOCKS_CONVERT			0x0020
+#define EXT4_GET_BLOCKS_DIO_CREATE_EXT		(EXT4_GET_BLOCKS_DIO|\
					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Convert extent to initialized after direct IO complete */
+#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT	(EXT4_GET_BLOCKS_CONVERT|\
					 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
 
 /*
  * ioctl commands
@@ -500,8 +526,8 @@ struct move_extent {
 static inline __le32 ext4_encode_extra_time(struct timespec *time)
 {
        return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
-			   time->tv_sec >> 32 : 0) |
-			   ((time->tv_nsec << 2) & EXT4_NSEC_MASK));
+			   (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
+			   ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
 }
 
 static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
@@ -509,7 +535,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
 	if (sizeof(time->tv_sec) > 4)
 		time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
 				<< 32;
-	time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2;
+	time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
 }
 
 #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode)			       \
@@ -672,6 +698,11 @@ struct ext4_inode_info {
 	__u16 i_extra_isize;
 
 	spinlock_t i_block_reservation_lock;
+
+	/* completed async DIOs that might need unwritten extents handling */
+	struct list_head i_aio_dio_complete_list;
+	/* current io_end structure for async DIO write*/
+	ext4_io_end_t *cur_aio_dio;
 };
 
 /*
@@ -713,6 +744,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_QUOTA		0x80000	/* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
@@ -942,18 +974,11 @@ struct ext4_sb_info {
 	unsigned int s_mb_stats;
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
+	unsigned int s_max_writeback_mb_bump;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
 
-	/* history to debug policy */
-	struct ext4_mb_history *s_mb_history;
-	int s_mb_history_cur;
-	int s_mb_history_max;
-	int s_mb_history_num;
-	spinlock_t s_mb_history_lock;
-	int s_mb_history_filter;
-
 	/* stats for buddy allocator */
 	spinlock_t s_mb_pa_lock;
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
@@ -980,6 +1005,9 @@ struct ext4_sb_info {
 
 	unsigned int s_log_groups_per_flex;
 	struct flex_groups *s_flex_groups;
+
+	/* workqueue for dio unwritten */
+	struct workqueue_struct *dio_unwritten_wq;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1397,7 +1425,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t ext4_get_reserved_space(struct inode *inode);
-
+extern int flush_aio_dio_completed_IO(struct inode *inode);
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
 extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1699,6 +1727,8 @@ extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
			  loff_t len);
+extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+			  loff_t len);
 extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
			   sector_t block, unsigned int max_blocks,
			   struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 61652f1d15e..2ca686454e8 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
 		(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
 }
 
+static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
+{
+	ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
+}
+
 extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
 extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
 				struct ext4_ext_path *path,
 				struct ext4_extent *);
 extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
 extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
			       ext_prepare_callback, void *);
 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 139fb8cb87e..a2865980342 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
 int __ext4_journal_stop(const char *where, handle_t *handle);
 
-#define EXT4_NOJOURNAL_HANDLE	((handle_t *) 0x1)
+#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
 
+/* Note: Do not use this for NULL handles. This is only to determine if
+ * a properly allocated handle is using a journal or not. */
 static inline int ext4_handle_valid(handle_t *handle)
 {
-	if (handle == EXT4_NOJOURNAL_HANDLE)
+	if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
 		return 0;
 	return 1;
 }
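
The ext4_jbd2.h change above replaces the single magic EXT4_NOJOURNAL_HANDLE value with a range check, so a no-journal "handle" can carry a small reference count in the pointer bits. The following standalone demo is an editor-added illustration of that convention only; handle_valid() here is a model of ext4_handle_valid(), not the kernel function, and it relies on the usual assumption that a real pointer is never a tiny integer.

#include <stdio.h>

typedef struct handle { int dummy; } handle_t;
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long)4096)

/* Returns 1 for a genuine journal handle, 0 for a fake ref-count handle. */
static int handle_valid(handle_t *h)
{
	return (unsigned long)h >= EXT4_NOJOURNAL_MAX_REF_COUNT;
}

int main(void)
{
	handle_t real;				/* a genuine pointer, well above 4096 */
	handle_t *fake = (handle_t *)3UL;	/* third nested "start" in no-journal mode */

	printf("real handle journaled? %d\n", handle_valid(&real));	/* 1 */
	printf("fake handle journaled? %d\n", handle_valid(fake));	/* 0 */
	return 0;
}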
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7a383257792..715264b4bae 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -723,7 +723,7 @@ err:
 * insert new index [@logical;@ptr] into the block at @curp;
 * check where to insert: before @curp or after @curp
 */
-static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *curp,
				int logical, ext4_fsblk_t ptr)
 {
@@ -1586,7 +1586,7 @@ out:
 */
 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path,
-				struct ext4_extent *newext)
+				struct ext4_extent *newext, int flag)
 {
 	struct ext4_extent_header *eh;
 	struct ext4_extent *ex, *fex;
@@ -1602,7 +1602,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	BUG_ON(path[depth].p_hdr == NULL);
 
 	/* try to insert block into found extent and return */
-	if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
+	if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+		&& ext4_can_extents_be_merged(inode, ex, newext)) {
 		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
			  ext4_ext_is_uninitialized(newext),
			  ext4_ext_get_actual_len(newext),
@@ -1722,7 +1723,8 @@ has_space:
 
 merge:
 	/* try to merge extents to the right */
-	ext4_ext_try_to_merge(inode, path, nearex);
+	if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+		ext4_ext_try_to_merge(inode, path, nearex);
 
 	/* try to merge extents to the left */
 
@@ -2378,6 +2380,7 @@ void ext4_ext_init(struct super_block *sb)
 	 */
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
 		printk(KERN_INFO "EXT4-fs: file extents enabled");
 #ifdef AGGRESSIVE_TEST
 		printk(", aggressive tests");
@@ -2389,6 +2392,7 @@ void ext4_ext_init(struct super_block *sb)
 		printk(", stats");
 #endif
 		printk("\n");
+#endif
 #ifdef EXTENTS_STATS
 		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
 		EXT4_SB(sb)->s_ext_min = 1 << 30;
@@ -2490,7 +2494,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 }
 
 #define EXT4_EXT_ZERO_LEN 7
-
 /*
 * This function is called by ext4_ext_get_blocks() if someone tries to write
 * to an uninitialized extent. It may result in splitting the uninitialized
@@ -2583,7 +2586,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
			ex3->ee_block = cpu_to_le32(iblock);
			ext4_ext_store_pblock(ex3, newblock);
			ex3->ee_len = cpu_to_le16(allocated);
-			err = ext4_ext_insert_extent(handle, inode, path, ex3);
+			err = ext4_ext_insert_extent(handle, inode, path,
							ex3, 0);
			if (err == -ENOSPC) {
				err = ext4_ext_zeroout(inode, &orig_ex);
				if (err)
@@ -2639,7 +2643,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
		ext4_ext_store_pblock(ex3, newblock + max_blocks);
		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
		ext4_ext_mark_uninitialized(ex3);
-		err = ext4_ext_insert_extent(handle, inode, path, ex3);
+		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
		if (err == -ENOSPC) {
			err = ext4_ext_zeroout(inode, &orig_ex);
			if (err)
@@ -2757,7 +2761,192 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	err = ext4_ext_dirty(handle, inode, path + depth);
 	goto out;
insert:
-	err = ext4_ext_insert_extent(handle, inode, path, &newex);
+	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
+	if (err == -ENOSPC) {
+		err = ext4_ext_zeroout(inode, &orig_ex);
+		if (err)
+			goto fix_extent_len;
+		/* update the extent length and mark as initialized */
+		ex->ee_block = orig_ex.ee_block;
+		ex->ee_len = orig_ex.ee_len;
+		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+		ext4_ext_dirty(handle, inode, path + depth);
+		/* zero out the first half */
+		return allocated;
+	} else if (err)
+		goto fix_extent_len;
+out:
+	ext4_ext_show_leaf(inode, path);
+	return err ? err : allocated;
+
+fix_extent_len:
+	ex->ee_block = orig_ex.ee_block;
+	ex->ee_len = orig_ex.ee_len;
+	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+	ext4_ext_mark_uninitialized(ex);
+	ext4_ext_dirty(handle, inode, path + depth);
+	return err;
+}
+
+/*
+ * This function is called by ext4_ext_get_blocks() from
+ * ext4_get_blocks_dio_write() when DIO needs to write
+ * to an uninitialized extent.
+ *
+ * Writing to an uninitialized extent may result in splitting the
+ * uninitialized extent into multiple initialized/uninitialized extents
+ * (up to three).  There are three possibilities:
+ *   a> There is no split required: the entire extent stays uninitialized
+ *   b> Split into two extents: the write happens at either end of the extent
+ *   c> Split into three extents: someone writes in the middle of the extent
+ *
+ * One or more index blocks may be needed if the extent tree grows after
+ * the uninitialized extent is split.  To prevent ENOSPC at IO completion,
+ * we split the uninitialized extent before submitting the IO.  The
+ * uninitialized extent handled here is split into at most three
+ * uninitialized extents.  After IO completes, the part that was filled is
+ * converted to initialized by the end_io callback function via
+ * ext4_convert_unwritten_extents().
+ *
+ * Returns the size of the uninitialized extent to be written on success.
+ */
+static int ext4_split_unwritten_extents(handle_t *handle,
+					struct inode *inode,
+					struct ext4_ext_path *path,
+					ext4_lblk_t iblock,
+					unsigned int max_blocks,
+					int flags)
+{
+	struct ext4_extent *ex, newex, orig_ex;
+	struct ext4_extent *ex1 = NULL;
+	struct ext4_extent *ex2 = NULL;
+	struct ext4_extent *ex3 = NULL;
+	struct ext4_extent_header *eh;
+	ext4_lblk_t ee_block;
+	unsigned int allocated, ee_len, depth;
+	ext4_fsblk_t newblock;
+	int err = 0;
+
+	ext_debug("ext4_split_unwritten_extents: inode %lu,"
+		  "iblock %llu, max_blocks %u\n", inode->i_ino,
+		  (unsigned long long)iblock, max_blocks);
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	allocated = ee_len - (iblock - ee_block);
+	newblock = iblock - ee_block + ext_pblock(ex);
+	ex2 = ex;
+	orig_ex.ee_block = ex->ee_block;
+	orig_ex.ee_len = cpu_to_le16(ee_len);
+	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+
+	/*
+	 * If the uninitialized extent begins at the same logical
+	 * block where the write begins, and the write completely
+	 * covers the extent, then we don't need to split it.
+	 */
+	if ((iblock == ee_block) && (allocated <= max_blocks))
+		return allocated;
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		goto out;
+	/* ex1: ee_block to iblock - 1 : uninitialized */
+	if (iblock > ee_block) {
+		ex1 = ex;
+		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ext4_ext_mark_uninitialized(ex1);
+		ex2 = &newex;
+	}
+	/*
+	 * for sanity, update the length of the ex2 extent before
+	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
+	 * overlap of blocks.
+	 */
+	if (!ex1 && allocated > max_blocks)
+		ex2->ee_len = cpu_to_le16(max_blocks);
+	/* ex3: to ee_block + ee_len : uninitialised */
+	if (allocated > max_blocks) {
+		unsigned int newdepth;
+		ex3 = &newex;
+		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
+		ext4_ext_store_pblock(ex3, newblock + max_blocks);
+		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+		ext4_ext_mark_uninitialized(ex3);
+		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
+		if (err == -ENOSPC) {
+			err = ext4_ext_zeroout(inode, &orig_ex);
+			if (err)
+				goto fix_extent_len;
+			/* update the extent length and mark as initialized */
+			ex->ee_block = orig_ex.ee_block;
+			ex->ee_len = orig_ex.ee_len;
+			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+			ext4_ext_dirty(handle, inode, path + depth);
+			/* zeroed the full extent */
+			/* blocks available from iblock */
+			return allocated;
+
+		} else if (err)
+			goto fix_extent_len;
+		/*
+		 * The depth, and hence eh & ex might change
+		 * as part of the insert above.
+		 */
+		newdepth = ext_depth(inode);
+		/*
+		 * update the extent length after successful insert of the
+		 * split extent
+		 */
+		orig_ex.ee_len = cpu_to_le16(ee_len -
						ext4_ext_get_actual_len(ex3));
+		depth = newdepth;
+		ext4_ext_drop_refs(path);
+		path = ext4_ext_find_extent(inode, iblock, path);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			goto out;
+		}
+		eh = path[depth].p_hdr;
+		ex = path[depth].p_ext;
+		if (ex2 != &newex)
+			ex2 = ex;
+
+		err = ext4_ext_get_access(handle, inode, path + depth);
+		if (err)
+			goto out;
+
+		allocated = max_blocks;
+	}
+	/*
+	 * If there was a change of depth as part of the
+	 * insertion of ex3 above, we need to update the length
+	 * of the ex1 extent again here
+	 */
+	if (ex1 && ex1 != ex) {
+		ex1 = ex;
+		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ext4_ext_mark_uninitialized(ex1);
+		ex2 = &newex;
+	}
+	/*
+	 * ex2: iblock to iblock + maxblocks-1 : to be written via direct IO,
+	 * still uninitialised.
+	 */
+	ex2->ee_block = cpu_to_le32(iblock);
+	ext4_ext_store_pblock(ex2, newblock);
+	ex2->ee_len = cpu_to_le16(allocated);
+	ext4_ext_mark_uninitialized(ex2);
+	if (ex2 != ex)
+		goto insert;
+	/* Mark modified extent as dirty */
+	err = ext4_ext_dirty(handle, inode, path + depth);
+	ext_debug("out here\n");
+	goto out;
+insert:
+	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err == -ENOSPC) {
 		err = ext4_ext_zeroout(inode, &orig_ex);
 		if (err)
@@ -2783,7 +2972,147 @@ fix_extent_len:
 	ext4_ext_dirty(handle, inode, path + depth);
 	return err;
 }
+static int ext4_convert_unwritten_extents_dio(handle_t *handle,
+					struct inode *inode,
+					struct ext4_ext_path *path)
+{
+	struct ext4_extent *ex;
+	struct ext4_extent_header *eh;
+	int depth;
+	int err = 0;
+	int ret = 0;
+
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		goto out;
+	/* first mark the extent as initialized */
+	ext4_ext_mark_initialized(ex);
+
+	/*
+	 * We have to see if it can be merged with the extent
+	 * on the left.
+	 */
+	if (ex > EXT_FIRST_EXTENT(eh)) {
+		/*
+		 * To merge left, pass "ex - 1" to try_to_merge(),
+		 * since it merges towards right _only_.
+		 */
+		ret = ext4_ext_try_to_merge(inode, path, ex - 1);
+		if (ret) {
+			err = ext4_ext_correct_indexes(handle, inode, path);
+			if (err)
+				goto out;
+			depth = ext_depth(inode);
+			ex--;
+		}
+	}
+	/*
+	 * Try to merge towards right.
+	 */
+	ret = ext4_ext_try_to_merge(inode, path, ex);
+	if (ret) {
+		err = ext4_ext_correct_indexes(handle, inode, path);
+		if (err)
+			goto out;
+		depth = ext_depth(inode);
+	}
+	/* Mark modified extent as dirty */
+	err = ext4_ext_dirty(handle, inode, path + depth);
+out:
+	ext4_ext_show_leaf(inode, path);
+	return err;
+}
+
+static int
+ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
+			ext4_lblk_t iblock, unsigned int max_blocks,
+			struct ext4_ext_path *path, int flags,
+			unsigned int allocated, struct buffer_head *bh_result,
+			ext4_fsblk_t newblock)
+{
+	int ret = 0;
+	int err = 0;
+	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+
+	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
+		  "block %llu, max_blocks %u, flags %d, allocated %u",
+		  inode->i_ino, (unsigned long long)iblock, max_blocks,
+		  flags, allocated);
+	ext4_ext_show_leaf(inode, path);
 
+	/* DIO get_block() before submitting the IO, split the extent */
+	if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+		ret = ext4_split_unwritten_extents(handle,
+						inode, path, iblock,
+						max_blocks, flags);
+		/*
+		 * Flag the inode (non-aio case) or the end_io struct (aio
+		 * case) so that this IO is converted to written when it
+		 * completes.
+		 */
+		if (io)
+			io->flag = DIO_AIO_UNWRITTEN;
+		else
+			EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
+		goto out;
+	}
+	/* async DIO end_io complete, convert the filled extent to written */
+	if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
+		ret = ext4_convert_unwritten_extents_dio(handle, inode,
+							path);
+		goto out2;
+	}
+	/* buffered IO case */
+	/*
+	 * repeat fallocate creation request
+	 * we already have an unwritten extent
+	 */
+	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+		goto map_out;
+
+	/* buffered READ or buffered write_begin() lookup */
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+		/*
+		 * We have blocks reserved already. We
+		 * return allocated blocks so that delalloc
+		 * won't do block reservation for us. But
+		 * the buffer head will be unmapped so that
+		 * a read from the block returns 0s.
+		 */
+		set_buffer_unwritten(bh_result);
+		goto out1;
+	}
+
+	/* buffered write, writepage time, convert */
+	ret = ext4_ext_convert_to_initialized(handle, inode,
+						path, iblock,
+						max_blocks);
+out:
+	if (ret <= 0) {
+		err = ret;
+		goto out2;
+	} else
+		allocated = ret;
+	set_buffer_new(bh_result);
+map_out:
+	set_buffer_mapped(bh_result);
+out1:
+	if (allocated > max_blocks)
+		allocated = max_blocks;
+	ext4_ext_show_leaf(inode, path);
+	bh_result->b_bdev = inode->i_sb->s_bdev;
+	bh_result->b_blocknr = newblock;
+out2:
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+	return err ? err : allocated;
+}
 /*
 * Block allocation/map/preallocation routine for extents based files
 *
@@ -2814,6 +3143,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	int err = 0, depth, ret, cache_type;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
+	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
 
 	__clear_bit(BH_New, &bh_result->b_state);
 	ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -2889,33 +3219,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
					EXT4_EXT_CACHE_EXTENT);
			goto out;
		}
-		if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
-			goto out;
-		if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
-			if (allocated > max_blocks)
-				allocated = max_blocks;
-			/*
-			 * We have blocks reserved already. We
-			 * return allocated blocks so that delalloc
-			 * won't do block reservation for us. But
-			 * the buffer head will be unmapped so that
-			 * a read from the block returns 0s.
-			 */
-			set_buffer_unwritten(bh_result);
-			bh_result->b_bdev = inode->i_sb->s_bdev;
-			bh_result->b_blocknr = newblock;
-			goto out2;
-		}
-
-		ret = ext4_ext_convert_to_initialized(handle, inode,
-							path, iblock,
-							max_blocks);
-		if (ret <= 0) {
-			err = ret;
-			goto out2;
-		} else
-			allocated = ret;
-		goto outnew;
+		ret = ext4_ext_handle_uninitialized_extents(handle,
+				inode, iblock, max_blocks, path,
+				flags, allocated, bh_result, newblock);
+		return ret;
		}
	}
 
@@ -2986,9 +3293,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	/* try to insert new extent into found leaf and return */
 	ext4_ext_store_pblock(&newex, newblock);
 	newex.ee_len = cpu_to_le16(ar.len);
-	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)  /* Mark uninitialized */
+	/* Mark uninitialized */
+	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
 		ext4_ext_mark_uninitialized(&newex);
-	err = ext4_ext_insert_extent(handle, inode, path, &newex);
+		/*
+		 * An io_end structure was created for every async
+		 * direct IO write to the middle of the file.
+		 * To avoid an unnecessary conversion for every aio dio
+		 * rewrite to the middle of the file, flag only the IO
+		 * that really needs the conversion.
+		 * For the non-async direct IO case, flag the inode state
+		 * so that the conversion is performed when IO is done.
+		 */
+		if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+			if (io)
+				io->flag = DIO_AIO_UNWRITTEN;
+			else
+				EXT4_I(inode)->i_state |=
+					EXT4_STATE_DIO_UNWRITTEN;
+		}
+	}
+	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err) {
 		/* free data blocks we just allocated */
 		/* not a good idea to call discard here directly,
@@ -3002,7 +3327,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	/* previous routine could use block we allocated */
 	newblock = ext_pblock(&newex);
 	allocated = ext4_ext_get_actual_len(&newex);
-outnew:
 	set_buffer_new(bh_result);
 
 	/* Cache only when it is _not_ an uninitialized extent */
@@ -3201,6 +3525,64 @@ retry:
 }
 
 /*
+ * This function converts a range of blocks to written extents.
+ * The caller passes the start offset and the size; all unwritten
+ * extents within this range will be converted to written extents.
+ *
+ * This function is called from the direct IO end io callback
+ * function, to convert the fallocated extents after IO is completed.
+ * Returns 0 on success.
+ */
+int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+				    loff_t len)
+{
+	handle_t *handle;
+	ext4_lblk_t block;
+	unsigned int max_blocks;
+	int ret = 0;
+	int ret2 = 0;
+	struct buffer_head map_bh;
+	unsigned int credits, blkbits = inode->i_blkbits;
+
+	block = offset >> blkbits;
+	/*
+	 * We can't just convert len to max_blocks because
+	 * if blocksize = 4096, offset = 3072 and len = 2048
+	 */
+	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
+							- block;
+	/*
+	 * credits to insert 1 extent into extent tree
+	 */
+	credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	while (ret >= 0 && ret < max_blocks) {
+		block = block + ret;
+		max_blocks = max_blocks - ret;
+		handle = ext4_journal_start(inode, credits);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			break;
+		}
+		map_bh.b_state = 0;
+		ret = ext4_get_blocks(handle, inode, block,
+				      max_blocks, &map_bh,
+				      EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
+		if (ret <= 0) {
+			WARN_ON(ret <= 0);
+			printk(KERN_ERR "%s: ext4_ext_get_blocks "
+				    "returned error inode#%lu, block=%u, "
+				    "max_blocks=%u", __func__,
+				    inode->i_ino, block, max_blocks);
+		}
+		ext4_mark_inode_dirty(handle, inode);
+		ret2 = ext4_journal_stop(handle);
+		if (ret <= 0 || ret2)
+			break;
+	}
+	return ret > 0 ? ret2 : ret;
+}
+/*
 * Callback function called for each extent to gather FIEMAP information.
 */
 static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
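
One detail of ext4_convert_unwritten_extents() above that is easy to miss is why len cannot simply be shifted into a block count. The short standalone program below is an editor-added illustration, not part of the commit; the variable names are arbitrary. It reproduces the rounding from the kernel comment: with 4096-byte blocks, offset 3072 and len 2048 span two blocks even though len >> blkbits is zero.

#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;			/* 4096-byte blocks */
	unsigned long long offset = 3072, len = 2048;

	/* Convert the byte range [offset, offset+len) into a block range,
	 * rounding the end up the way EXT4_BLOCK_ALIGN() does. */
	unsigned long long block = offset >> blkbits;
	unsigned long long end = (offset + len + (1ULL << blkbits) - 1) >> blkbits;
	unsigned long long max_blocks = end - block;

	/* Bytes 3072..5119 touch blocks 0 and 1, so max_blocks is 2,
	 * while len >> blkbits would have been 0. */
	printf("block=%llu max_blocks=%llu\n", block, max_blocks);
	return 0;
}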
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 07475740b51..2b1531266ee 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,6 +44,8 @@
44 * 44 *
45 * What we do is just kick off a commit and wait on it. This will snapshot the 45 * What we do is just kick off a commit and wait on it. This will snapshot the
46 * inode to disk. 46 * inode to disk.
47 *
48 * i_mutex lock is held when entering and exiting this function
47 */ 49 */
48 50
49int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
@@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
56 58
57 trace_ext4_sync_file(file, dentry, datasync); 59 trace_ext4_sync_file(file, dentry, datasync);
58 60
61 ret = flush_aio_dio_completed_IO(inode);
62 if (ret < 0)
63 goto out;
59 /* 64 /*
60 * data=writeback: 65 * data=writeback:
61 * The caller's filemap_fdatawrite()/wait will sync the data. 66 * The caller's filemap_fdatawrite()/wait will sync the data.
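The lines added to ext4_sync_file() make fsync drain the inode's list of completed but not yet converted AIO DIO writes before the journal commit is kicked off; otherwise an extent could still be marked unwritten on disk when fsync returns. The fragment below is a simplified userspace model of that ordering, with drain_completed_dio() and force_commit() as hypothetical stand-ins for flush_aio_dio_completed_IO() and the jbd2 commit that fsync waits on.

#include <stdio.h>

/* Hypothetical stand-in for flush_aio_dio_completed_IO(): convert any
 * completed-but-unconverted DIO ranges; return 0 or a negative error. */
static int drain_completed_dio(void)
{
    printf("converting completed AIO DIO extents to written\n");
    return 0;
}

/* Hypothetical stand-in for the journal commit ext4_sync_file() waits on. */
static int force_commit(void)
{
    printf("committing journal\n");
    return 0;
}

/* Model of the fsync path: the conversion must happen first, so the
 * commit that fsync waits on also covers the extent-tree updates. */
static int model_fsync(void)
{
    int ret = drain_completed_dio();

    if (ret < 0)
        return ret;    /* mirrors the new "goto out" on error */
    return force_commit();
}

int main(void)
{
    return model_fsync();
}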
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 064746fad58..2c8caa51add 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
37#include <linux/namei.h> 37#include <linux/namei.h>
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h>
40 41
41#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
42#include "xattr.h" 43#include "xattr.h"
@@ -192,7 +193,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
192 * so before we call here everything must be consistently dirtied against 193 * so before we call here everything must be consistently dirtied against
193 * this transaction. 194 * this transaction.
194 */ 195 */
195 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, 196int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
196 int nblocks) 197 int nblocks)
197{ 198{
198 int ret; 199 int ret;
@@ -208,6 +209,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
208 up_write(&EXT4_I(inode)->i_data_sem); 209 up_write(&EXT4_I(inode)->i_data_sem);
209 ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); 210 ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
210 down_write(&EXT4_I(inode)->i_data_sem); 211 down_write(&EXT4_I(inode)->i_data_sem);
212 ext4_discard_preallocations(inode);
211 213
212 return ret; 214 return ret;
213} 215}
@@ -1145,6 +1147,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
1145} 1147}
1146 1148
1147/* 1149/*
1150 * Return the number of contiguous dirty pages in a given inode
1151 * starting at page frame idx.
1152 */
1153static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1154 unsigned int max_pages)
1155{
1156 struct address_space *mapping = inode->i_mapping;
1157 pgoff_t index;
1158 struct pagevec pvec;
1159 pgoff_t num = 0;
1160 int i, nr_pages, done = 0;
1161
1162 if (max_pages == 0)
1163 return 0;
1164 pagevec_init(&pvec, 0);
1165 while (!done) {
1166 index = idx;
1167 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1168 PAGECACHE_TAG_DIRTY,
1169 (pgoff_t)PAGEVEC_SIZE);
1170 if (nr_pages == 0)
1171 break;
1172 for (i = 0; i < nr_pages; i++) {
1173 struct page *page = pvec.pages[i];
1174 struct buffer_head *bh, *head;
1175
1176 lock_page(page);
1177 if (unlikely(page->mapping != mapping) ||
1178 !PageDirty(page) ||
1179 PageWriteback(page) ||
1180 page->index != idx) {
1181 done = 1;
1182 unlock_page(page);
1183 break;
1184 }
1185 if (page_has_buffers(page)) {
1186 bh = head = page_buffers(page);
1187 do {
1188 if (!buffer_delay(bh) &&
1189 !buffer_unwritten(bh))
1190 done = 1;
1191 bh = bh->b_this_page;
1192 } while (!done && (bh != head));
1193 }
1194 unlock_page(page);
1195 if (done)
1196 break;
1197 idx++;
1198 num++;
1199 if (num >= max_pages)
1200 break;
1201 }
1202 pagevec_release(&pvec);
1203 }
1204 return num;
1205}
1206
1207/*
1148 * The ext4_get_blocks() function tries to look up the requested blocks, 1208 * The ext4_get_blocks() function tries to look up the requested blocks,
1149 * and returns if the blocks are already mapped. 1209 * and returns if the blocks are already mapped.
1150 * 1210 *
@@ -1175,6 +1235,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1175 clear_buffer_mapped(bh); 1235 clear_buffer_mapped(bh);
1176 clear_buffer_unwritten(bh); 1236 clear_buffer_unwritten(bh);
1177 1237
1238 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
1239 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1240 (unsigned long)block);
1178 /* 1241 /*
1179 * Try to see if we can get the block without requesting a new 1242 * Try to see if we can get the block without requesting a new
1180 * file system block. 1243 * file system block.
@@ -1796,11 +1859,11 @@ repeat:
1796 1859
1797 if (ext4_claim_free_blocks(sbi, total)) { 1860 if (ext4_claim_free_blocks(sbi, total)) {
1798 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1861 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1862 vfs_dq_release_reservation_block(inode, total);
1799 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1863 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1800 yield(); 1864 yield();
1801 goto repeat; 1865 goto repeat;
1802 } 1866 }
1803 vfs_dq_release_reservation_block(inode, total);
1804 return -ENOSPC; 1867 return -ENOSPC;
1805 } 1868 }
1806 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1869 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -2092,18 +2155,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2092static void ext4_print_free_blocks(struct inode *inode) 2155static void ext4_print_free_blocks(struct inode *inode)
2093{ 2156{
2094 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2157 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2095 printk(KERN_EMERG "Total free blocks count %lld\n", 2158 printk(KERN_CRIT "Total free blocks count %lld\n",
2096 ext4_count_free_blocks(inode->i_sb)); 2159 ext4_count_free_blocks(inode->i_sb));
2097 printk(KERN_EMERG "Free/Dirty block details\n"); 2160 printk(KERN_CRIT "Free/Dirty block details\n");
2098 printk(KERN_EMERG "free_blocks=%lld\n", 2161 printk(KERN_CRIT "free_blocks=%lld\n",
2099 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); 2162 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
2100 printk(KERN_EMERG "dirty_blocks=%lld\n", 2163 printk(KERN_CRIT "dirty_blocks=%lld\n",
2101 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2164 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2102 printk(KERN_EMERG "Block reservation details\n"); 2165 printk(KERN_CRIT "Block reservation details\n");
2103 printk(KERN_EMERG "i_reserved_data_blocks=%u\n", 2166 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
2104 EXT4_I(inode)->i_reserved_data_blocks); 2167 EXT4_I(inode)->i_reserved_data_blocks);
2105 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", 2168 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
2106 EXT4_I(inode)->i_reserved_meta_blocks); 2169 EXT4_I(inode)->i_reserved_meta_blocks);
2107 return; 2170 return;
2108} 2171}
2109 2172
@@ -2189,14 +2252,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2189 * writepage and writepages will again try to write 2252 * writepage and writepages will again try to write
2190 * the same. 2253 * the same.
2191 */ 2254 */
2192 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2255 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2193 "at logical offset %llu with max blocks " 2256 "delayed block allocation failed for inode %lu at "
2194 "%zd with error %d\n", 2257 "logical offset %llu with max blocks %zd with "
2195 __func__, mpd->inode->i_ino, 2258 "error %d\n", mpd->inode->i_ino,
2196 (unsigned long long)next, 2259 (unsigned long long) next,
2197 mpd->b_size >> mpd->inode->i_blkbits, err); 2260 mpd->b_size >> mpd->inode->i_blkbits, err);
2198 printk(KERN_EMERG "This should not happen.!! " 2261 printk(KERN_CRIT "This should not happen!! "
2199 "Data will be lost\n"); 2262 "Data will be lost\n");
2200 if (err == -ENOSPC) { 2263 if (err == -ENOSPC) {
2201 ext4_print_free_blocks(mpd->inode); 2264 ext4_print_free_blocks(mpd->inode);
2202 } 2265 }
@@ -2743,8 +2806,10 @@ static int ext4_da_writepages(struct address_space *mapping,
2743 int no_nrwrite_index_update; 2806 int no_nrwrite_index_update;
2744 int pages_written = 0; 2807 int pages_written = 0;
2745 long pages_skipped; 2808 long pages_skipped;
2809 unsigned int max_pages;
2746 int range_cyclic, cycled = 1, io_done = 0; 2810 int range_cyclic, cycled = 1, io_done = 0;
2747 int needed_blocks, ret = 0, nr_to_writebump = 0; 2811 int needed_blocks, ret = 0;
2812 long desired_nr_to_write, nr_to_writebump = 0;
2748 loff_t range_start = wbc->range_start; 2813 loff_t range_start = wbc->range_start;
2749 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2814 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2750 2815
@@ -2771,16 +2836,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2771 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2836 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2772 return -EROFS; 2837 return -EROFS;
2773 2838
2774 /*
2775 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2776 * This make sure small files blocks are allocated in
2777 * single attempt. This ensure that small files
2778 * get less fragmented.
2779 */
2780 if (wbc->nr_to_write < sbi->s_mb_stream_request) {
2781 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
2782 wbc->nr_to_write = sbi->s_mb_stream_request;
2783 }
2784 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2839 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2785 range_whole = 1; 2840 range_whole = 1;
2786 2841
@@ -2795,6 +2850,36 @@ static int ext4_da_writepages(struct address_space *mapping,
2795 } else 2850 } else
2796 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2851 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2797 2852
2853 /*
2854 * This works around two forms of stupidity. The first is in
2855 * the writeback code, which caps the maximum number of pages
2856 * written to be 1024 pages. This is wrong on multiple
 2857 * levels; different architectures have a different page size,
2858 * which changes the maximum amount of data which gets
2859 * written. Secondly, 4 megabytes is way too small. XFS
2860 * forces this value to be 16 megabytes by multiplying
2861 * nr_to_write parameter by four, and then relies on its
2862 * allocator to allocate larger extents to make them
2863 * contiguous. Unfortunately this brings us to the second
2864 * stupidity, which is that ext4's mballoc code only allocates
2865 * at most 2048 blocks. So we force contiguous writes up to
 2866 * the number of dirty pages in the inode, or
 2867 * sbi->s_max_writeback_mb_bump, whichever is smaller.
2868 */
2869 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2870 if (!range_cyclic && range_whole)
2871 desired_nr_to_write = wbc->nr_to_write * 8;
2872 else
2873 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2874 max_pages);
2875 if (desired_nr_to_write > max_pages)
2876 desired_nr_to_write = max_pages;
2877
2878 if (wbc->nr_to_write < desired_nr_to_write) {
2879 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2880 wbc->nr_to_write = desired_nr_to_write;
2881 }
2882
2798 mpd.wbc = wbc; 2883 mpd.wbc = wbc;
2799 mpd.inode = mapping->host; 2884 mpd.inode = mapping->host;
2800 2885
@@ -2822,10 +2907,9 @@ retry:
2822 handle = ext4_journal_start(inode, needed_blocks); 2907 handle = ext4_journal_start(inode, needed_blocks);
2823 if (IS_ERR(handle)) { 2908 if (IS_ERR(handle)) {
2824 ret = PTR_ERR(handle); 2909 ret = PTR_ERR(handle);
2825 printk(KERN_CRIT "%s: jbd2_start: " 2910 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2826 "%ld pages, ino %lu; err %d\n", __func__, 2911 "%ld pages, ino %lu; err %d\n", __func__,
2827 wbc->nr_to_write, inode->i_ino, ret); 2912 wbc->nr_to_write, inode->i_ino, ret);
2828 dump_stack();
2829 goto out_writepages; 2913 goto out_writepages;
2830 } 2914 }
2831 2915
@@ -2897,9 +2981,10 @@ retry:
2897 goto retry; 2981 goto retry;
2898 } 2982 }
2899 if (pages_skipped != wbc->pages_skipped) 2983 if (pages_skipped != wbc->pages_skipped)
2900 printk(KERN_EMERG "This should not happen leaving %s " 2984 ext4_msg(inode->i_sb, KERN_CRIT,
2901 "with nr_to_write = %ld ret = %d\n", 2985 "This should not happen leaving %s "
2902 __func__, wbc->nr_to_write, ret); 2986 "with nr_to_write = %ld ret = %d\n",
2987 __func__, wbc->nr_to_write, ret);
2903 2988
2904 /* Update index */ 2989 /* Update index */
2905 index += pages_written; 2990 index += pages_written;
@@ -2914,7 +2999,8 @@ retry:
2914out_writepages: 2999out_writepages:
2915 if (!no_nrwrite_index_update) 3000 if (!no_nrwrite_index_update)
2916 wbc->no_nrwrite_index_update = 0; 3001 wbc->no_nrwrite_index_update = 0;
2917 wbc->nr_to_write -= nr_to_writebump; 3002 if (wbc->nr_to_write > nr_to_writebump)
3003 wbc->nr_to_write -= nr_to_writebump;
2918 wbc->range_start = range_start; 3004 wbc->range_start = range_start;
2919 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3005 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2920 return ret; 3006 return ret;
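The ext4_da_writepages() hunks above replace the old s_mb_stream_request based bump with one derived from the number of contiguous dirty pages, capped by the new s_max_writeback_mb_bump tunable, and the closing hunk only subtracts the bump back out when nr_to_write is still larger than it. A small userspace sketch of that arithmetic follows; compute_bump() and its parameters are illustrative only and do not correspond to a real kernel helper.

#include <stdio.h>

#define PAGE_SHIFT 12    /* assume 4 KiB pages for this sketch */

/* Model of the nr_to_write bump: raise the writeback budget up to the
 * number of contiguous dirty pages (or 8x the budget when writing the
 * whole file), capped by max_writeback_mb, and report the bump so it can
 * be subtracted again before returning to the writeback code. */
static long compute_bump(long nr_to_write, long dirty_pages,
                         unsigned int max_writeback_mb, int whole_file)
{
    long max_pages = (long)max_writeback_mb << (20 - PAGE_SHIFT);
    long desired = whole_file ? nr_to_write * 8 : dirty_pages;

    if (desired > max_pages)
        desired = max_pages;
    if (nr_to_write < desired)
        return desired - nr_to_write;    /* nr_to_writebump */
    return 0;
}

int main(void)
{
    /* e.g. writeback hands us 1024 pages but 3000 contiguous dirty
     * pages exist and the cap is 128 MB: bump by 1976 pages. */
    long bump = compute_bump(1024, 3000, 128, 0);

    printf("nr_to_writebump = %ld\n", bump);
    return 0;
}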
@@ -3272,6 +3358,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3272} 3358}
3273 3359
3274/* 3360/*
3361 * O_DIRECT for ext3 (or indirect map) based files
3362 *
3275 * If the O_DIRECT write will extend the file then add this inode to the 3363 * If the O_DIRECT write will extend the file then add this inode to the
3276 * orphan list. So recovery will truncate it back to the original size 3364 * orphan list. So recovery will truncate it back to the original size
3277 * if the machine crashes during the write. 3365 * if the machine crashes during the write.
@@ -3280,7 +3368,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3280 * crashes then stale disk data _may_ be exposed inside the file. But current 3368 * crashes then stale disk data _may_ be exposed inside the file. But current
3281 * VFS code falls back into buffered path in that case so we are safe. 3369 * VFS code falls back into buffered path in that case so we are safe.
3282 */ 3370 */
3283static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3371static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3284 const struct iovec *iov, loff_t offset, 3372 const struct iovec *iov, loff_t offset,
3285 unsigned long nr_segs) 3373 unsigned long nr_segs)
3286{ 3374{
@@ -3291,6 +3379,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3291 ssize_t ret; 3379 ssize_t ret;
3292 int orphan = 0; 3380 int orphan = 0;
3293 size_t count = iov_length(iov, nr_segs); 3381 size_t count = iov_length(iov, nr_segs);
3382 int retries = 0;
3294 3383
3295 if (rw == WRITE) { 3384 if (rw == WRITE) {
3296 loff_t final_size = offset + count; 3385 loff_t final_size = offset + count;
@@ -3313,9 +3402,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3313 } 3402 }
3314 } 3403 }
3315 3404
3405retry:
3316 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3406 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3317 offset, nr_segs, 3407 offset, nr_segs,
3318 ext4_get_block, NULL); 3408 ext4_get_block, NULL);
3409 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3410 goto retry;
3319 3411
3320 if (orphan) { 3412 if (orphan) {
3321 int err; 3413 int err;
@@ -3354,6 +3446,364 @@ out:
3354 return ret; 3446 return ret;
3355} 3447}
3356 3448
3449static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
3450 struct buffer_head *bh_result, int create)
3451{
3452 handle_t *handle = NULL;
3453 int ret = 0;
3454 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3455 int dio_credits;
3456
3457 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
3458 inode->i_ino, create);
 3459 /*
 3460 * The DIO VFS code passes the create = 0 flag for a write to
 3461 * the middle of a file. It does this to avoid block
 3462 * allocation for holes, to prevent exposing stale data
 3463 * when there is a parallel buffered read (which does
 3464 * not hold the i_mutex lock) while the direct IO write has
 3465 * not completed. A DIO request on a hole therefore
 3466 * falls back to buffered IO.
 3467 *
 3468 * For an ext4 extent based file, since fallocate marks newly
 3469 * allocated extents as uninitialized, we can fallocate blocks
 3470 * for holes; a parallel buffered read will then zero out the
 3471 * page when it reads a hole whose parallel DIO write has
 3472 * not yet completed.
 3473 *
 3474 * When we come here, we know it's a direct IO write to
 3475 * the middle of the file (< i_size),
 3476 * so it's safe to override the create flag from the VFS.
 3477 */
3478 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
3479
3480 if (max_blocks > DIO_MAX_BLOCKS)
3481 max_blocks = DIO_MAX_BLOCKS;
3482 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3483 handle = ext4_journal_start(inode, dio_credits);
3484 if (IS_ERR(handle)) {
3485 ret = PTR_ERR(handle);
3486 goto out;
3487 }
3488 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3489 create);
3490 if (ret > 0) {
3491 bh_result->b_size = (ret << inode->i_blkbits);
3492 ret = 0;
3493 }
3494 ext4_journal_stop(handle);
3495out:
3496 return ret;
3497}
3498
3499static void ext4_free_io_end(ext4_io_end_t *io)
3500{
3501 BUG_ON(!io);
3502 iput(io->inode);
3503 kfree(io);
3504}
3505static void dump_aio_dio_list(struct inode * inode)
3506{
3507#ifdef EXT4_DEBUG
3508 struct list_head *cur, *before, *after;
3509 ext4_io_end_t *io, *io0, *io1;
3510
3511 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3512 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
3513 return;
3514 }
3515
3516 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
3517 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
3518 cur = &io->list;
3519 before = cur->prev;
3520 io0 = container_of(before, ext4_io_end_t, list);
3521 after = cur->next;
3522 io1 = container_of(after, ext4_io_end_t, list);
3523
3524 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3525 io, inode->i_ino, io0, io1);
3526 }
3527#endif
3528}
3529
3530/*
3531 * check a range of space and convert unwritten extents to written.
3532 */
3533static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3534{
3535 struct inode *inode = io->inode;
3536 loff_t offset = io->offset;
3537 size_t size = io->size;
3538 int ret = 0;
3539
 3540 ext4_debug("end_aio_dio_nolock: io 0x%p from inode %lu,list->next 0x%p,"
3541 "list->prev 0x%p\n",
3542 io, inode->i_ino, io->list.next, io->list.prev);
3543
3544 if (list_empty(&io->list))
3545 return ret;
3546
3547 if (io->flag != DIO_AIO_UNWRITTEN)
3548 return ret;
3549
3550 if (offset + size <= i_size_read(inode))
3551 ret = ext4_convert_unwritten_extents(inode, offset, size);
3552
3553 if (ret < 0) {
 3554 printk(KERN_EMERG "%s: failed to convert unwritten "
 3555 "extents to written extents, error is %d"
3556 " io is still on inode %lu aio dio list\n",
3557 __func__, ret, inode->i_ino);
3558 return ret;
3559 }
3560
3561 /* clear the DIO AIO unwritten flag */
3562 io->flag = 0;
3563 return ret;
3564}
3565/*
3566 * work on completed aio dio IO, to convert unwritten extents to extents
3567 */
3568static void ext4_end_aio_dio_work(struct work_struct *work)
3569{
3570 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3571 struct inode *inode = io->inode;
3572 int ret = 0;
3573
3574 mutex_lock(&inode->i_mutex);
3575 ret = ext4_end_aio_dio_nolock(io);
3576 if (ret >= 0) {
3577 if (!list_empty(&io->list))
3578 list_del_init(&io->list);
3579 ext4_free_io_end(io);
3580 }
3581 mutex_unlock(&inode->i_mutex);
3582}
3583/*
3584 * This function is called from ext4_sync_file().
3585 *
3586 * When AIO DIO IO is completed, the work to convert unwritten
3587 * extents to written is queued on workqueue but may not get immediately
3588 * scheduled. When fsync is called, we need to ensure the
3589 * conversion is complete before fsync returns.
 3590 * The inode keeps track of a list of completed AIO from the DIO path
 3591 * that might need the conversion. This function walks through
 3592 * the list and converts the related unwritten extents to written.
3593 */
3594int flush_aio_dio_completed_IO(struct inode *inode)
3595{
3596 ext4_io_end_t *io;
3597 int ret = 0;
3598 int ret2 = 0;
3599
3600 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
3601 return ret;
3602
3603 dump_aio_dio_list(inode);
3604 while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3605 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
3606 ext4_io_end_t, list);
3607 /*
3608 * Calling ext4_end_aio_dio_nolock() to convert completed
3609 * IO to written.
3610 *
 3611 * When ext4_sync_file() is called, run_queue() may already be
 3612 * about to flush the work corresponding to this io structure.
 3613 * It will be upset if it finds that the io structure related
 3614 * to the work to be scheduled has been freed.
 3615 *
 3616 * Thus we need to keep the io structure valid here after the
 3617 * conversion has finished. The io structure has a flag to
 3618 * avoid double conversion from both fsync and the background
 3619 * workqueue.
3620 */
3621 ret = ext4_end_aio_dio_nolock(io);
3622 if (ret < 0)
3623 ret2 = ret;
3624 else
3625 list_del_init(&io->list);
3626 }
3627 return (ret2 < 0) ? ret2 : 0;
3628}
3629
3630static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
3631{
3632 ext4_io_end_t *io = NULL;
3633
3634 io = kmalloc(sizeof(*io), GFP_NOFS);
3635
3636 if (io) {
3637 igrab(inode);
3638 io->inode = inode;
3639 io->flag = 0;
3640 io->offset = 0;
3641 io->size = 0;
3642 io->error = 0;
3643 INIT_WORK(&io->work, ext4_end_aio_dio_work);
3644 INIT_LIST_HEAD(&io->list);
3645 }
3646
3647 return io;
3648}
3649
3650static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3651 ssize_t size, void *private)
3652{
3653 ext4_io_end_t *io_end = iocb->private;
3654 struct workqueue_struct *wq;
3655
3656 /* if not async direct IO or dio with 0 bytes write, just return */
3657 if (!io_end || !size)
3658 return;
3659
 3660 ext_debug("ext4_end_io_dio(): io_end 0x%p "
3661 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3662 iocb->private, io_end->inode->i_ino, iocb, offset,
3663 size);
3664
3665 /* if not aio dio with unwritten extents, just free io and return */
3666 if (io_end->flag != DIO_AIO_UNWRITTEN){
3667 ext4_free_io_end(io_end);
3668 iocb->private = NULL;
3669 return;
3670 }
3671
3672 io_end->offset = offset;
3673 io_end->size = size;
3674 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3675
3676 /* queue the work to convert unwritten extents to written */
3677 queue_work(wq, &io_end->work);
3678
3679 /* Add the io_end to per-inode completed aio dio list*/
3680 list_add_tail(&io_end->list,
3681 &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
3682 iocb->private = NULL;
3683}
3684/*
 3685 * For ext4 extent files, ext4 will do direct-io writes to holes,
 3686 * to preallocated extents, and for writes that extend the file, with
 3687 * no need to fall back to buffered IO.
 3688 *
 3689 * For holes, we fallocate those blocks and mark them as uninitialized.
 3690 * If those blocks were preallocated, we make sure they are split, but
 3691 * still keep the range to be written as uninitialized.
 3692 *
 3693 * The unwritten extents will be converted to written when DIO is completed.
 3694 * For async direct IO, since the IO may still be pending on return, we
 3695 * set up an end_io callback function, which will do the conversion
 3696 * when the async direct IO is completed.
3697 *
3698 * If the O_DIRECT write will extend the file then add this inode to the
3699 * orphan list. So recovery will truncate it back to the original size
3700 * if the machine crashes during the write.
3701 *
3702 */
3703static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3704 const struct iovec *iov, loff_t offset,
3705 unsigned long nr_segs)
3706{
3707 struct file *file = iocb->ki_filp;
3708 struct inode *inode = file->f_mapping->host;
3709 ssize_t ret;
3710 size_t count = iov_length(iov, nr_segs);
3711
3712 loff_t final_size = offset + count;
3713 if (rw == WRITE && final_size <= inode->i_size) {
3714 /*
3715 * We could direct write to holes and fallocate.
3716 *
3717 * Allocated blocks to fill the hole are marked as uninitialized
 3718 * to prevent a parallel buffered read from exposing the stale data
 3719 * before the DIO completes the data IO.
3720 *
3721 * As to previously fallocated extents, ext4 get_block
3722 * will just simply mark the buffer mapped but still
3723 * keep the extents uninitialized.
3724 *
 3725 * For the non-AIO case, we will convert those unwritten extents
 3726 * to written after returning from blockdev_direct_IO.
3727 *
 3728 * For async DIO, the conversion needs to be deferred until
 3729 * the IO is completed. The ext4 end_io callback function
3730 * will be called to take care of the conversion work.
3731 * Here for async case, we allocate an io_end structure to
3732 * hook to the iocb.
3733 */
3734 iocb->private = NULL;
3735 EXT4_I(inode)->cur_aio_dio = NULL;
3736 if (!is_sync_kiocb(iocb)) {
3737 iocb->private = ext4_init_io_end(inode);
3738 if (!iocb->private)
3739 return -ENOMEM;
3740 /*
 3741 * we save the io structure for the current async
 3742 * direct IO, so that ext4_get_blocks() can later
 3743 * flag the io structure if there are unwritten
 3744 * extents that need to be converted when the IO
 3745 * is completed.
3746 */
3747 EXT4_I(inode)->cur_aio_dio = iocb->private;
3748 }
3749
3750 ret = blockdev_direct_IO(rw, iocb, inode,
3751 inode->i_sb->s_bdev, iov,
3752 offset, nr_segs,
3753 ext4_get_block_dio_write,
3754 ext4_end_io_dio);
3755 if (iocb->private)
3756 EXT4_I(inode)->cur_aio_dio = NULL;
3757 /*
 3758 * The io_end structure takes a reference to the inode;
 3759 * that structure needs to be destroyed and the
 3760 * reference to the inode needs to be dropped when IO is
 3761 * complete, even for a 0 byte write or a failed one.
 3762 *
 3763 * In the successful AIO DIO case, the io_end structure will be
 3764 * destroyed and the reference to the inode will be dropped
 3765 * after the end_io callback function is called.
 3766 *
 3767 * In the case of a 0 byte write or an error, since the
 3768 * VFS direct IO won't invoke the end_io callback function,
 3769 * we need to free the end_io structure here.
3770 */
3771 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3772 ext4_free_io_end(iocb->private);
3773 iocb->private = NULL;
3774 } else if (ret > 0 && (EXT4_I(inode)->i_state &
3775 EXT4_STATE_DIO_UNWRITTEN)) {
3776 int err;
3777 /*
 3778 * For the non-AIO case, since the IO has already
 3779 * completed, we can do the conversion right here.
3780 */
3781 err = ext4_convert_unwritten_extents(inode,
3782 offset, ret);
3783 if (err < 0)
3784 ret = err;
3785 EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
3786 }
3787 return ret;
3788 }
3789
 3790 /* for writes to the end of file, we fall back to the old way */
3791 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3792}
3793
3794static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3795 const struct iovec *iov, loff_t offset,
3796 unsigned long nr_segs)
3797{
3798 struct file *file = iocb->ki_filp;
3799 struct inode *inode = file->f_mapping->host;
3800
3801 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3802 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3803
3804 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3805}
3806
3357/* 3807/*
3358 * Pages can be marked dirty completely asynchronously from ext4's journalling 3808 * Pages can be marked dirty completely asynchronously from ext4's journalling
3359 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3809 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -4551,8 +5001,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
4551 */ 5001 */
4552static int ext4_do_update_inode(handle_t *handle, 5002static int ext4_do_update_inode(handle_t *handle,
4553 struct inode *inode, 5003 struct inode *inode,
4554 struct ext4_iloc *iloc, 5004 struct ext4_iloc *iloc)
4555 int do_sync)
4556{ 5005{
4557 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 5006 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4558 struct ext4_inode_info *ei = EXT4_I(inode); 5007 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4653,22 +5102,10 @@ static int ext4_do_update_inode(handle_t *handle,
4653 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 5102 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4654 } 5103 }
4655 5104
4656 /* 5105 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4657 * If we're not using a journal and we were called from 5106 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4658 * ext4_write_inode() to sync the inode (making do_sync true), 5107 if (!err)
4659 * we can just use sync_dirty_buffer() directly to do our dirty 5108 err = rc;
4660 * work. Testing s_journal here is a bit redundant but it's
4661 * worth it to avoid potential future trouble.
4662 */
4663 if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
4664 BUFFER_TRACE(bh, "call sync_dirty_buffer");
4665 sync_dirty_buffer(bh);
4666 } else {
4667 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4668 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4669 if (!err)
4670 err = rc;
4671 }
4672 ei->i_state &= ~EXT4_STATE_NEW; 5109 ei->i_state &= ~EXT4_STATE_NEW;
4673 5110
4674out_brelse: 5111out_brelse:
@@ -4736,8 +5173,16 @@ int ext4_write_inode(struct inode *inode, int wait)
4736 err = ext4_get_inode_loc(inode, &iloc); 5173 err = ext4_get_inode_loc(inode, &iloc);
4737 if (err) 5174 if (err)
4738 return err; 5175 return err;
4739 err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE, 5176 if (wait)
4740 inode, &iloc, wait); 5177 sync_dirty_buffer(iloc.bh);
5178 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5179 ext4_error(inode->i_sb, __func__,
5180 "IO error syncing inode, "
5181 "inode=%lu, block=%llu",
5182 inode->i_ino,
5183 (unsigned long long)iloc.bh->b_blocknr);
5184 err = -EIO;
5185 }
4741 } 5186 }
4742 return err; 5187 return err;
4743} 5188}
@@ -5033,7 +5478,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
5033 get_bh(iloc->bh); 5478 get_bh(iloc->bh);
5034 5479
5035 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5480 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5036 err = ext4_do_update_inode(handle, inode, iloc, 0); 5481 err = ext4_do_update_inode(handle, inode, iloc);
5037 put_bh(iloc->bh); 5482 put_bh(iloc->bh);
5038 return err; 5483 return err;
5039} 5484}
@@ -5177,27 +5622,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5177 */ 5622 */
5178void ext4_dirty_inode(struct inode *inode) 5623void ext4_dirty_inode(struct inode *inode)
5179{ 5624{
5180 handle_t *current_handle = ext4_journal_current_handle();
5181 handle_t *handle; 5625 handle_t *handle;
5182 5626
5183 if (!ext4_handle_valid(current_handle)) {
5184 ext4_mark_inode_dirty(current_handle, inode);
5185 return;
5186 }
5187
5188 handle = ext4_journal_start(inode, 2); 5627 handle = ext4_journal_start(inode, 2);
5189 if (IS_ERR(handle)) 5628 if (IS_ERR(handle))
5190 goto out; 5629 goto out;
5191 if (current_handle && 5630
5192 current_handle->h_transaction != handle->h_transaction) { 5631 ext4_mark_inode_dirty(handle, inode);
5193 /* This task has a transaction open against a different fs */ 5632
5194 printk(KERN_EMERG "%s: transactions do not match!\n",
5195 __func__);
5196 } else {
5197 jbd_debug(5, "marking dirty. outer handle=%p\n",
5198 current_handle);
5199 ext4_mark_inode_dirty(handle, inode);
5200 }
5201 ext4_journal_stop(handle); 5633 ext4_journal_stop(handle);
5202out: 5634out:
5203 return; 5635 return;
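Taken together, the inode.c additions give each async direct IO write an io_end object whose lifetime spans submission, the block-device completion callback, and a workqueue item that performs the unwritten-to-written conversion under i_mutex. The sketch below models that lifecycle in plain userspace C, calling the conversion step directly where the kernel would use queue_work(); all names are illustrative, not the kernel API.

#include <stdio.h>
#include <stdlib.h>

/* Minimal model of ext4's io_end object. */
struct io_end {
    long long offset;
    long long size;
    int unwritten;    /* models the DIO_AIO_UNWRITTEN flag */
};

/* Model of ext4_init_io_end(): allocate and zero the tracking object. */
static struct io_end *init_io_end(void)
{
    return calloc(1, sizeof(struct io_end));
}

/* Model of ext4_end_aio_dio_work(): the deferred conversion step. */
static void conversion_work(struct io_end *io)
{
    printf("converting [%lld, %lld) to written extents\n",
           io->offset, io->offset + io->size);
    free(io);    /* models ext4_free_io_end() */
}

/* Model of ext4_end_io_dio(): called when the device I/O completes. */
static void end_io(struct io_end *io, long long offset, long long size)
{
    if (!io->unwritten) {
        free(io);    /* nothing to convert */
        return;
    }
    io->offset = offset;
    io->size = size;
    conversion_work(io);    /* kernel: queue_work() on dio_unwritten_wq */
}

int main(void)
{
    struct io_end *io = init_io_end();

    if (!io)
        return 1;
    io->unwritten = 1;    /* kernel: set by ext4_get_blocks() */
    end_io(io, 4096, 8192);
    return 0;
}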
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e9c61896d60..bba12824def 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2096,207 +2096,6 @@ out:
2096 return err; 2096 return err;
2097} 2097}
2098 2098
2099#ifdef EXT4_MB_HISTORY
2100struct ext4_mb_proc_session {
2101 struct ext4_mb_history *history;
2102 struct super_block *sb;
2103 int start;
2104 int max;
2105};
2106
2107static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
2108 struct ext4_mb_history *hs,
2109 int first)
2110{
2111 if (hs == s->history + s->max)
2112 hs = s->history;
2113 if (!first && hs == s->history + s->start)
2114 return NULL;
2115 while (hs->orig.fe_len == 0) {
2116 hs++;
2117 if (hs == s->history + s->max)
2118 hs = s->history;
2119 if (hs == s->history + s->start)
2120 return NULL;
2121 }
2122 return hs;
2123}
2124
2125static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
2126{
2127 struct ext4_mb_proc_session *s = seq->private;
2128 struct ext4_mb_history *hs;
2129 int l = *pos;
2130
2131 if (l == 0)
2132 return SEQ_START_TOKEN;
2133 hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2134 if (!hs)
2135 return NULL;
2136 while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
2137 return hs;
2138}
2139
2140static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
2141 loff_t *pos)
2142{
2143 struct ext4_mb_proc_session *s = seq->private;
2144 struct ext4_mb_history *hs = v;
2145
2146 ++*pos;
2147 if (v == SEQ_START_TOKEN)
2148 return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2149 else
2150 return ext4_mb_history_skip_empty(s, ++hs, 0);
2151}
2152
2153static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2154{
2155 char buf[25], buf2[25], buf3[25], *fmt;
2156 struct ext4_mb_history *hs = v;
2157
2158 if (v == SEQ_START_TOKEN) {
2159 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2160 "%-5s %-2s %-6s %-5s %-5s %-6s\n",
2161 "pid", "inode", "original", "goal", "result", "found",
2162 "grps", "cr", "flags", "merge", "tail", "broken");
2163 return 0;
2164 }
2165
2166 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2167 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2168 "0x%04x %-5s %-5u %-6u\n";
2169 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2170 hs->result.fe_start, hs->result.fe_len,
2171 hs->result.fe_logical);
2172 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2173 hs->orig.fe_start, hs->orig.fe_len,
2174 hs->orig.fe_logical);
2175 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
2176 hs->goal.fe_start, hs->goal.fe_len,
2177 hs->goal.fe_logical);
2178 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
2179 hs->found, hs->groups, hs->cr, hs->flags,
2180 hs->merged ? "M" : "", hs->tail,
2181 hs->buddy ? 1 << hs->buddy : 0);
2182 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
2183 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
2184 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2185 hs->result.fe_start, hs->result.fe_len,
2186 hs->result.fe_logical);
2187 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2188 hs->orig.fe_start, hs->orig.fe_len,
2189 hs->orig.fe_logical);
2190 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
2191 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
2192 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2193 hs->result.fe_start, hs->result.fe_len);
2194 seq_printf(seq, "%-5u %-8u %-23s discard\n",
2195 hs->pid, hs->ino, buf2);
2196 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
2197 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2198 hs->result.fe_start, hs->result.fe_len);
2199 seq_printf(seq, "%-5u %-8u %-23s free\n",
2200 hs->pid, hs->ino, buf2);
2201 }
2202 return 0;
2203}
2204
2205static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
2206{
2207}
2208
2209static const struct seq_operations ext4_mb_seq_history_ops = {
2210 .start = ext4_mb_seq_history_start,
2211 .next = ext4_mb_seq_history_next,
2212 .stop = ext4_mb_seq_history_stop,
2213 .show = ext4_mb_seq_history_show,
2214};
2215
2216static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
2217{
2218 struct super_block *sb = PDE(inode)->data;
2219 struct ext4_sb_info *sbi = EXT4_SB(sb);
2220 struct ext4_mb_proc_session *s;
2221 int rc;
2222 int size;
2223
2224 if (unlikely(sbi->s_mb_history == NULL))
2225 return -ENOMEM;
2226 s = kmalloc(sizeof(*s), GFP_KERNEL);
2227 if (s == NULL)
2228 return -ENOMEM;
2229 s->sb = sb;
2230 size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
2231 s->history = kmalloc(size, GFP_KERNEL);
2232 if (s->history == NULL) {
2233 kfree(s);
2234 return -ENOMEM;
2235 }
2236
2237 spin_lock(&sbi->s_mb_history_lock);
2238 memcpy(s->history, sbi->s_mb_history, size);
2239 s->max = sbi->s_mb_history_max;
2240 s->start = sbi->s_mb_history_cur % s->max;
2241 spin_unlock(&sbi->s_mb_history_lock);
2242
2243 rc = seq_open(file, &ext4_mb_seq_history_ops);
2244 if (rc == 0) {
2245 struct seq_file *m = (struct seq_file *)file->private_data;
2246 m->private = s;
2247 } else {
2248 kfree(s->history);
2249 kfree(s);
2250 }
2251 return rc;
2252
2253}
2254
2255static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
2256{
2257 struct seq_file *seq = (struct seq_file *)file->private_data;
2258 struct ext4_mb_proc_session *s = seq->private;
2259 kfree(s->history);
2260 kfree(s);
2261 return seq_release(inode, file);
2262}
2263
2264static ssize_t ext4_mb_seq_history_write(struct file *file,
2265 const char __user *buffer,
2266 size_t count, loff_t *ppos)
2267{
2268 struct seq_file *seq = (struct seq_file *)file->private_data;
2269 struct ext4_mb_proc_session *s = seq->private;
2270 struct super_block *sb = s->sb;
2271 char str[32];
2272 int value;
2273
2274 if (count >= sizeof(str)) {
2275 printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
2276 "mb_history", (int)sizeof(str));
2277 return -EOVERFLOW;
2278 }
2279
2280 if (copy_from_user(str, buffer, count))
2281 return -EFAULT;
2282
2283 value = simple_strtol(str, NULL, 0);
2284 if (value < 0)
2285 return -ERANGE;
2286 EXT4_SB(sb)->s_mb_history_filter = value;
2287
2288 return count;
2289}
2290
2291static const struct file_operations ext4_mb_seq_history_fops = {
2292 .owner = THIS_MODULE,
2293 .open = ext4_mb_seq_history_open,
2294 .read = seq_read,
2295 .write = ext4_mb_seq_history_write,
2296 .llseek = seq_lseek,
2297 .release = ext4_mb_seq_history_release,
2298};
2299
2300static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2099static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2301{ 2100{
2302 struct super_block *sb = seq->private; 2101 struct super_block *sb = seq->private;
@@ -2396,82 +2195,6 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
2396 .release = seq_release, 2195 .release = seq_release,
2397}; 2196};
2398 2197
2399static void ext4_mb_history_release(struct super_block *sb)
2400{
2401 struct ext4_sb_info *sbi = EXT4_SB(sb);
2402
2403 if (sbi->s_proc != NULL) {
2404 remove_proc_entry("mb_groups", sbi->s_proc);
2405 if (sbi->s_mb_history_max)
2406 remove_proc_entry("mb_history", sbi->s_proc);
2407 }
2408 kfree(sbi->s_mb_history);
2409}
2410
2411static void ext4_mb_history_init(struct super_block *sb)
2412{
2413 struct ext4_sb_info *sbi = EXT4_SB(sb);
2414 int i;
2415
2416 if (sbi->s_proc != NULL) {
2417 if (sbi->s_mb_history_max)
2418 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2419 &ext4_mb_seq_history_fops, sb);
2420 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2421 &ext4_mb_seq_groups_fops, sb);
2422 }
2423
2424 sbi->s_mb_history_cur = 0;
2425 spin_lock_init(&sbi->s_mb_history_lock);
2426 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2427 sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
2428 /* if we can't allocate history, then we simple won't use it */
2429}
2430
2431static noinline_for_stack void
2432ext4_mb_store_history(struct ext4_allocation_context *ac)
2433{
2434 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2435 struct ext4_mb_history h;
2436
2437 if (sbi->s_mb_history == NULL)
2438 return;
2439
2440 if (!(ac->ac_op & sbi->s_mb_history_filter))
2441 return;
2442
2443 h.op = ac->ac_op;
2444 h.pid = current->pid;
2445 h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
2446 h.orig = ac->ac_o_ex;
2447 h.result = ac->ac_b_ex;
2448 h.flags = ac->ac_flags;
2449 h.found = ac->ac_found;
2450 h.groups = ac->ac_groups_scanned;
2451 h.cr = ac->ac_criteria;
2452 h.tail = ac->ac_tail;
2453 h.buddy = ac->ac_buddy;
2454 h.merged = 0;
2455 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
2456 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
2457 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
2458 h.merged = 1;
2459 h.goal = ac->ac_g_ex;
2460 h.result = ac->ac_f_ex;
2461 }
2462
2463 spin_lock(&sbi->s_mb_history_lock);
2464 memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
2465 if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
2466 sbi->s_mb_history_cur = 0;
2467 spin_unlock(&sbi->s_mb_history_lock);
2468}
2469
2470#else
2471#define ext4_mb_history_release(sb)
2472#define ext4_mb_history_init(sb)
2473#endif
2474
2475 2198
2476/* Create and initialize ext4_group_info data for the given group. */ 2199/* Create and initialize ext4_group_info data for the given group. */
2477int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2200int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
@@ -2690,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2690 sbi->s_mb_stats = MB_DEFAULT_STATS; 2413 sbi->s_mb_stats = MB_DEFAULT_STATS;
2691 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2414 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2692 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2415 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2693 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2694 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2416 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2695 2417
2696 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2418 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -2708,12 +2430,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2708 spin_lock_init(&lg->lg_prealloc_lock); 2430 spin_lock_init(&lg->lg_prealloc_lock);
2709 } 2431 }
2710 2432
2711 ext4_mb_history_init(sb); 2433 if (sbi->s_proc)
2434 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2435 &ext4_mb_seq_groups_fops, sb);
2712 2436
2713 if (sbi->s_journal) 2437 if (sbi->s_journal)
2714 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2438 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2715
2716 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2717 return 0; 2439 return 0;
2718} 2440}
2719 2441
@@ -2790,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb)
2790 } 2512 }
2791 2513
2792 free_percpu(sbi->s_locality_groups); 2514 free_percpu(sbi->s_locality_groups);
2793 ext4_mb_history_release(sb); 2515 if (sbi->s_proc)
2516 remove_proc_entry("mb_groups", sbi->s_proc);
2794 2517
2795 return 0; 2518 return 0;
2796} 2519}
@@ -3276,7 +2999,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3276 atomic_inc(&sbi->s_bal_breaks); 2999 atomic_inc(&sbi->s_bal_breaks);
3277 } 3000 }
3278 3001
3279 ext4_mb_store_history(ac); 3002 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3003 trace_ext4_mballoc_alloc(ac);
3004 else
3005 trace_ext4_mballoc_prealloc(ac);
3280} 3006}
3281 3007
3282/* 3008/*
@@ -3776,7 +3502,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3776 if (ac) { 3502 if (ac) {
3777 ac->ac_sb = sb; 3503 ac->ac_sb = sb;
3778 ac->ac_inode = pa->pa_inode; 3504 ac->ac_inode = pa->pa_inode;
3779 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3780 } 3505 }
3781 3506
3782 while (bit < end) { 3507 while (bit < end) {
@@ -3796,7 +3521,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3796 ac->ac_b_ex.fe_start = bit; 3521 ac->ac_b_ex.fe_start = bit;
3797 ac->ac_b_ex.fe_len = next - bit; 3522 ac->ac_b_ex.fe_len = next - bit;
3798 ac->ac_b_ex.fe_logical = 0; 3523 ac->ac_b_ex.fe_logical = 0;
3799 ext4_mb_store_history(ac); 3524 trace_ext4_mballoc_discard(ac);
3800 } 3525 }
3801 3526
3802 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, 3527 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
@@ -3831,9 +3556,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3831 ext4_group_t group; 3556 ext4_group_t group;
3832 ext4_grpblk_t bit; 3557 ext4_grpblk_t bit;
3833 3558
3834 if (ac)
3835 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3836
3837 trace_ext4_mb_release_group_pa(ac, pa); 3559 trace_ext4_mb_release_group_pa(ac, pa);
3838 BUG_ON(pa->pa_deleted == 0); 3560 BUG_ON(pa->pa_deleted == 0);
3839 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3561 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
@@ -3848,7 +3570,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3848 ac->ac_b_ex.fe_start = bit; 3570 ac->ac_b_ex.fe_start = bit;
3849 ac->ac_b_ex.fe_len = pa->pa_len; 3571 ac->ac_b_ex.fe_len = pa->pa_len;
3850 ac->ac_b_ex.fe_logical = 0; 3572 ac->ac_b_ex.fe_logical = 0;
3851 ext4_mb_store_history(ac); 3573 trace_ext4_mballoc_discard(ac);
3852 } 3574 }
3853 3575
3854 return 0; 3576 return 0;
@@ -4189,7 +3911,6 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4189 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 3911 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4190 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 3912 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4191 >> bsbits; 3913 >> bsbits;
4192 size = max(size, isize);
4193 3914
4194 if ((size == isize) && 3915 if ((size == isize) &&
4195 !ext4_fs_is_busy(sbi) && 3916 !ext4_fs_is_busy(sbi) &&
@@ -4199,6 +3920,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4199 } 3920 }
4200 3921
4201 /* don't use group allocation for large files */ 3922 /* don't use group allocation for large files */
3923 size = max(size, isize);
4202 if (size >= sbi->s_mb_stream_request) { 3924 if (size >= sbi->s_mb_stream_request) {
4203 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 3925 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4204 return; 3926 return;
@@ -4739,7 +4461,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4739 4461
4740 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4462 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4741 if (ac) { 4463 if (ac) {
4742 ac->ac_op = EXT4_MB_HISTORY_FREE;
4743 ac->ac_inode = inode; 4464 ac->ac_inode = inode;
4744 ac->ac_sb = sb; 4465 ac->ac_sb = sb;
4745 } 4466 }
@@ -4806,7 +4527,7 @@ do_more:
4806 ac->ac_b_ex.fe_group = block_group; 4527 ac->ac_b_ex.fe_group = block_group;
4807 ac->ac_b_ex.fe_start = bit; 4528 ac->ac_b_ex.fe_start = bit;
4808 ac->ac_b_ex.fe_len = count; 4529 ac->ac_b_ex.fe_len = count;
4809 ext4_mb_store_history(ac); 4530 trace_ext4_mballoc_free(ac);
4810 } 4531 }
4811 4532
4812 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4533 err = ext4_mb_load_buddy(sb, block_group, &e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 188d3d709b2..0ca811061bc 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -52,18 +52,8 @@ extern u8 mb_enable_debug;
52#define mb_debug(n, fmt, a...) 52#define mb_debug(n, fmt, a...)
53#endif 53#endif
54 54
55/*
56 * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
57 * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
58 */
59#define EXT4_MB_HISTORY
60#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ 55#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
61#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ 56#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
62#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
63#define EXT4_MB_HISTORY_FREE 8 /* free */
64
65#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
66 EXT4_MB_HISTORY_PREALLOC)
67 57
68/* 58/*
69 * How long mballoc can look for a best extent (in found extents) 59 * How long mballoc can look for a best extent (in found extents)
@@ -84,7 +74,7 @@ extern u8 mb_enable_debug;
84 * with 'ext4_mb_stats' allocator will collect stats that will be 74 * with 'ext4_mb_stats' allocator will collect stats that will be
85 * shown at umount. The collecting costs though! 75 * shown at umount. The collecting costs though!
86 */ 76 */
87#define MB_DEFAULT_STATS 1 77#define MB_DEFAULT_STATS 0
88 78
89/* 79/*
90 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served 80 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
@@ -217,22 +207,6 @@ struct ext4_allocation_context {
217#define AC_STATUS_FOUND 2 207#define AC_STATUS_FOUND 2
218#define AC_STATUS_BREAK 3 208#define AC_STATUS_BREAK 3
219 209
220struct ext4_mb_history {
221 struct ext4_free_extent orig; /* orig allocation */
222 struct ext4_free_extent goal; /* goal allocation */
223 struct ext4_free_extent result; /* result allocation */
224 unsigned pid;
225 unsigned ino;
226 __u16 found; /* how many extents have been found */
227 __u16 groups; /* how many groups have been scanned */
228 __u16 tail; /* what tail broke some buddy */
229 __u16 buddy; /* buddy the tail ^^^ broke */
230 __u16 flags;
231 __u8 cr:3; /* which phase the result extent was found at */
232 __u8 op:4;
233 __u8 merged:1;
234};
235
236struct ext4_buddy { 210struct ext4_buddy {
237 struct page *bd_buddy_page; 211 struct page *bd_buddy_page;
238 void *bd_buddy; 212 void *bd_buddy;
@@ -247,13 +221,6 @@ struct ext4_buddy {
247#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 221#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
248#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 222#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
249 223
250#ifndef EXT4_MB_HISTORY
251static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
252{
253 return;
254}
255#endif
256
257#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 224#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
258 225
259static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 226static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index bf519f239ae..a93d5b80f3e 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
75 goto err_out; 75 goto err_out;
76 } 76 }
77 } 77 }
78 retval = ext4_ext_insert_extent(handle, inode, path, &newext); 78 retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
79err_out: 79err_out:
80 if (path) { 80 if (path) {
81 ext4_ext_drop_refs(path); 81 ext4_ext_drop_refs(path);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c07a2915e40..25b6b145736 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -322,7 +322,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
322 goto out; 322 goto out;
323 323
324 if (ext4_ext_insert_extent(handle, orig_inode, 324 if (ext4_ext_insert_extent(handle, orig_inode,
325 orig_path, new_ext)) 325 orig_path, new_ext, 0))
326 goto out; 326 goto out;
327 } 327 }
328 328
@@ -333,7 +333,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
333 goto out; 333 goto out;
334 334
335 if (ext4_ext_insert_extent(handle, orig_inode, 335 if (ext4_ext_insert_extent(handle, orig_inode,
336 orig_path, end_ext)) 336 orig_path, end_ext, 0))
337 goto out; 337 goto out;
338 } 338 }
339out: 339out:
@@ -1001,14 +1001,6 @@ mext_check_arguments(struct inode *orig_inode,
1001 return -EINVAL; 1001 return -EINVAL;
1002 } 1002 }
1003 1003
1004 /* orig and donor should be different file */
1005 if (orig_inode->i_ino == donor_inode->i_ino) {
1006 ext4_debug("ext4 move extent: The argument files should not "
1007 "be same file [ino:orig %lu, donor %lu]\n",
1008 orig_inode->i_ino, donor_inode->i_ino);
1009 return -EINVAL;
1010 }
1011
1012 /* Ext4 move extent supports only extent based file */ 1004 /* Ext4 move extent supports only extent based file */
1013 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 1005 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
1014 ext4_debug("ext4 move extent: orig file is not extents " 1006 ext4_debug("ext4 move extent: orig file is not extents "
@@ -1232,6 +1224,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1232 int block_len_in_page; 1224 int block_len_in_page;
1233 int uninit; 1225 int uninit;
1234 1226
1227 /* orig and donor should be different file */
1228 if (orig_inode->i_ino == donor_inode->i_ino) {
1229 ext4_debug("ext4 move extent: The argument files should not "
1230 "be same file [ino:orig %lu, donor %lu]\n",
1231 orig_inode->i_ino, donor_inode->i_ino);
1232 return -EINVAL;
1233 }
1234
1235 /* protect orig and donor against a truncate */ 1235 /* protect orig and donor against a truncate */
1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1237 if (ret1 < 0) 1237 if (ret1 < 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 42f81d285cd..6d2c1b897fc 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,12 +1518,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1518 return retval; 1518 return retval;
1519 1519
1520 if (blocks == 1 && !dx_fallback && 1520 if (blocks == 1 && !dx_fallback &&
1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { 1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
1522 retval = make_indexed_dir(handle, dentry, inode, bh); 1522 return make_indexed_dir(handle, dentry, inode, bh);
1523 if (retval == -ENOSPC)
1524 brelse(bh);
1525 return retval;
1526 }
1527 brelse(bh); 1523 brelse(bh);
1528 } 1524 }
1529 bh = ext4_append(handle, dir, &block, &retval); 1525 bh = ext4_append(handle, dir, &block, &retval);
@@ -1532,10 +1528,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1532 de = (struct ext4_dir_entry_2 *) bh->b_data; 1528 de = (struct ext4_dir_entry_2 *) bh->b_data;
1533 de->inode = 0; 1529 de->inode = 0;
1534 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1535 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1531 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1536 if (retval == -ENOSPC)
1537 brelse(bh);
1538 return retval;
1539} 1532}
1540 1533
1541/* 1534/*
@@ -1664,8 +1657,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1664 if (!de) 1657 if (!de)
1665 goto cleanup; 1658 goto cleanup;
1666 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 1659 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1667 if (err != -ENOSPC) 1660 bh = NULL;
1668 bh = NULL;
1669 goto cleanup; 1661 goto cleanup;
1670 1662
1671journal_error: 1663journal_error:
@@ -2076,7 +2068,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2076 struct ext4_iloc iloc; 2068 struct ext4_iloc iloc;
2077 int err = 0; 2069 int err = 0;
2078 2070
2079 if (!ext4_handle_valid(handle)) 2071 /* ext4_handle_valid() assumes a valid handle_t pointer */
2072 if (handle && !ext4_handle_valid(handle))
2080 return 0; 2073 return 0;
2081 2074
2082 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); 2075 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index df539ba2777..d4ca92aab51 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -50,13 +50,6 @@
50#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
51#include <trace/events/ext4.h> 51#include <trace/events/ext4.h>
52 52
53static int default_mb_history_length = 1000;
54
55module_param_named(default_mb_history_length, default_mb_history_length,
56 int, 0644);
57MODULE_PARM_DESC(default_mb_history_length,
58 "Default number of entries saved for mb_history");
59
60struct proc_dir_entry *ext4_proc_root; 53struct proc_dir_entry *ext4_proc_root;
61static struct kset *ext4_kset; 54static struct kset *ext4_kset;
62 55
@@ -189,6 +182,36 @@ void ext4_itable_unused_set(struct super_block *sb,
189 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 182 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
190} 183}
191 184
185
186/* Just increment the non-pointer handle value */
187static handle_t *ext4_get_nojournal(void)
188{
189 handle_t *handle = current->journal_info;
190 unsigned long ref_cnt = (unsigned long)handle;
191
192 BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
193
194 ref_cnt++;
195 handle = (handle_t *)ref_cnt;
196
197 current->journal_info = handle;
198 return handle;
199}
200
201
202/* Decrement the non-pointer handle value */
203static void ext4_put_nojournal(handle_t *handle)
204{
205 unsigned long ref_cnt = (unsigned long)handle;
206
207 BUG_ON(ref_cnt == 0);
208
209 ref_cnt--;
210 handle = (handle_t *)ref_cnt;
211
212 current->journal_info = handle;
213}
214
192/* 215/*
193 * Wrappers for jbd2_journal_start/end. 216 * Wrappers for jbd2_journal_start/end.
194 * 217 *
@@ -215,11 +238,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
215 } 238 }
216 return jbd2_journal_start(journal, nblocks); 239 return jbd2_journal_start(journal, nblocks);
217 } 240 }
218 /* 241 return ext4_get_nojournal();
219 * We're not journaling, return the appropriate indication.
220 */
221 current->journal_info = EXT4_NOJOURNAL_HANDLE;
222 return current->journal_info;
223} 242}
224 243
225/* 244/*
@@ -235,11 +254,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
235 int rc; 254 int rc;
236 255
237 if (!ext4_handle_valid(handle)) { 256 if (!ext4_handle_valid(handle)) {
238 /* 257 ext4_put_nojournal(handle);
239 * Do this here since we don't call jbd2_journal_stop() in
240 * no-journal mode.
241 */
242 current->journal_info = NULL;
243 return 0; 258 return 0;
244 } 259 }
245 sb = handle->h_transaction->t_journal->j_private; 260 sb = handle->h_transaction->t_journal->j_private;
@@ -580,6 +595,9 @@ static void ext4_put_super(struct super_block *sb)
580 struct ext4_super_block *es = sbi->s_es; 595 struct ext4_super_block *es = sbi->s_es;
581 int i, err; 596 int i, err;
582 597
598 flush_workqueue(sbi->dio_unwritten_wq);
599 destroy_workqueue(sbi->dio_unwritten_wq);
600
583 lock_super(sb); 601 lock_super(sb);
584 lock_kernel(); 602 lock_kernel();
585 if (sb->s_dirt) 603 if (sb->s_dirt)
@@ -684,6 +702,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
684 ei->i_allocated_meta_blocks = 0; 702 ei->i_allocated_meta_blocks = 0;
685 ei->i_delalloc_reserved_flag = 0; 703 ei->i_delalloc_reserved_flag = 0;
686 spin_lock_init(&(ei->i_block_reservation_lock)); 704 spin_lock_init(&(ei->i_block_reservation_lock));
705 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
706 ei->cur_aio_dio = NULL;
687 707
688 return &ei->vfs_inode; 708 return &ei->vfs_inode;
689} 709}
@@ -1052,7 +1072,7 @@ enum {
1052 Opt_journal_update, Opt_journal_dev, 1072 Opt_journal_update, Opt_journal_dev,
1053 Opt_journal_checksum, Opt_journal_async_commit, 1073 Opt_journal_checksum, Opt_journal_async_commit,
1054 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1074 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1055 Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, 1075 Opt_data_err_abort, Opt_data_err_ignore,
1056 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1076 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1057 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1077 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1058 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, 1078 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
@@ -1099,7 +1119,6 @@ static const match_table_t tokens = {
1099 {Opt_data_writeback, "data=writeback"}, 1119 {Opt_data_writeback, "data=writeback"},
1100 {Opt_data_err_abort, "data_err=abort"}, 1120 {Opt_data_err_abort, "data_err=abort"},
1101 {Opt_data_err_ignore, "data_err=ignore"}, 1121 {Opt_data_err_ignore, "data_err=ignore"},
1102 {Opt_mb_history_length, "mb_history_length=%u"},
1103 {Opt_offusrjquota, "usrjquota="}, 1122 {Opt_offusrjquota, "usrjquota="},
1104 {Opt_usrjquota, "usrjquota=%s"}, 1123 {Opt_usrjquota, "usrjquota=%s"},
1105 {Opt_offgrpjquota, "grpjquota="}, 1124 {Opt_offgrpjquota, "grpjquota="},
@@ -1281,9 +1300,11 @@ static int parse_options(char *options, struct super_block *sb,
1281 *journal_devnum = option; 1300 *journal_devnum = option;
1282 break; 1301 break;
1283 case Opt_journal_checksum: 1302 case Opt_journal_checksum:
1284 break; /* Kept for backwards compatibility */ 1303 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1304 break;
1285 case Opt_journal_async_commit: 1305 case Opt_journal_async_commit:
1286 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); 1306 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
1307 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1287 break; 1308 break;
1288 case Opt_noload: 1309 case Opt_noload:
1289 set_opt(sbi->s_mount_opt, NOLOAD); 1310 set_opt(sbi->s_mount_opt, NOLOAD);
@@ -1340,13 +1361,6 @@ static int parse_options(char *options, struct super_block *sb,
1340 case Opt_data_err_ignore: 1361 case Opt_data_err_ignore:
1341 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1362 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1342 break; 1363 break;
1343 case Opt_mb_history_length:
1344 if (match_int(&args[0], &option))
1345 return 0;
1346 if (option < 0)
1347 return 0;
1348 sbi->s_mb_history_max = option;
1349 break;
1350#ifdef CONFIG_QUOTA 1364#ifdef CONFIG_QUOTA
1351 case Opt_usrjquota: 1365 case Opt_usrjquota:
1352 qtype = USRQUOTA; 1366 qtype = USRQUOTA;
@@ -1646,13 +1660,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1646 EXT4_INODES_PER_GROUP(sb), 1660 EXT4_INODES_PER_GROUP(sb),
1647 sbi->s_mount_opt); 1661 sbi->s_mount_opt);
1648 1662
1649 if (EXT4_SB(sb)->s_journal) {
1650 ext4_msg(sb, KERN_INFO, "%s journal on %s",
1651 EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1652 "external", EXT4_SB(sb)->s_journal->j_devname);
1653 } else {
1654 ext4_msg(sb, KERN_INFO, "no journal");
1655 }
1656 return res; 1663 return res;
1657} 1664}
1658 1665
@@ -2197,6 +2204,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2197EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2204EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2198EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2205EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2199EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2206EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2207EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
2200 2208
2201static struct attribute *ext4_attrs[] = { 2209static struct attribute *ext4_attrs[] = {
2202 ATTR_LIST(delayed_allocation_blocks), 2210 ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2218,7 @@ static struct attribute *ext4_attrs[] = {
2210 ATTR_LIST(mb_order2_req), 2218 ATTR_LIST(mb_order2_req),
2211 ATTR_LIST(mb_stream_req), 2219 ATTR_LIST(mb_stream_req),
2212 ATTR_LIST(mb_group_prealloc), 2220 ATTR_LIST(mb_group_prealloc),
2221 ATTR_LIST(max_writeback_mb_bump),
2213 NULL, 2222 NULL,
2214}; 2223};
2215 2224
@@ -2413,7 +2422,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2413 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; 2422 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2414 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2423 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2415 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2424 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2416 sbi->s_mb_history_max = default_mb_history_length;
2417 2425
2418 set_opt(sbi->s_mount_opt, BARRIER); 2426 set_opt(sbi->s_mount_opt, BARRIER);
2419 2427
@@ -2679,6 +2687,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2679 } 2687 }
2680 2688
2681 sbi->s_stripe = ext4_get_stripe_size(sbi); 2689 sbi->s_stripe = ext4_get_stripe_size(sbi);
2690 sbi->s_max_writeback_mb_bump = 128;
2682 2691
2683 /* 2692 /*
2684 * set up enough so that it can read an inode 2693 * set up enough so that it can read an inode
@@ -2752,14 +2761,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2752 goto failed_mount4; 2761 goto failed_mount4;
2753 } 2762 }
2754 2763
2755 jbd2_journal_set_features(sbi->s_journal, 2764 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
2756 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); 2765 jbd2_journal_set_features(sbi->s_journal,
2757 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) 2766 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2758 jbd2_journal_set_features(sbi->s_journal, 0, 0,
2759 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2767 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2760 else 2768 } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
2769 jbd2_journal_set_features(sbi->s_journal,
2770 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2761 jbd2_journal_clear_features(sbi->s_journal, 0, 0, 2771 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
2762 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2772 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2773 } else {
2774 jbd2_journal_clear_features(sbi->s_journal,
2775 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2776 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2777 }
2763 2778
2764 /* We have now updated the journal if required, so we can 2779 /* We have now updated the journal if required, so we can
2765 * validate the data journaling mode. */ 2780 * validate the data journaling mode. */
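Because the hunk above interleaves the old and new versions, here is the resulting mount-time feature selection written out plainly (a sketch of the new code path, not the verbatim file): async commit implies journal checksums, a bare journal_checksum mount option enables only the compat checksum feature, and otherwise both features are cleared.

if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
        jbd2_journal_set_features(sbi->s_journal,
                        JBD2_FEATURE_COMPAT_CHECKSUM, 0,
                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
} else if (test_opt(sb, JOURNAL_CHECKSUM)) {
        jbd2_journal_set_features(sbi->s_journal,
                        JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
        jbd2_journal_clear_features(sbi->s_journal, 0, 0,
                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
} else {
        jbd2_journal_clear_features(sbi->s_journal,
                        JBD2_FEATURE_COMPAT_CHECKSUM, 0,
                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
}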
@@ -2798,6 +2813,12 @@ no_journal:
2798 clear_opt(sbi->s_mount_opt, NOBH); 2813 clear_opt(sbi->s_mount_opt, NOBH);
2799 } 2814 }
2800 } 2815 }
2816 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2817 if (!EXT4_SB(sb)->dio_unwritten_wq) {
2818 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
2819 goto failed_mount_wq;
2820 }
2821
2801 /* 2822 /*
2802 * The jbd2_journal_load will have done any necessary log recovery, 2823 * The jbd2_journal_load will have done any necessary log recovery,
2803 * so we can safely mount the rest of the filesystem now. 2824 * so we can safely mount the rest of the filesystem now.
@@ -2849,12 +2870,12 @@ no_journal:
2849 "available"); 2870 "available");
2850 } 2871 }
2851 2872
2852 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 2873 if (test_opt(sb, DELALLOC) &&
2874 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
2853 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 2875 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
2854 "requested data journaling mode"); 2876 "requested data journaling mode");
2855 clear_opt(sbi->s_mount_opt, DELALLOC); 2877 clear_opt(sbi->s_mount_opt, DELALLOC);
2856 } else if (test_opt(sb, DELALLOC)) 2878 }
2857 ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
2858 2879
2859 err = ext4_setup_system_zone(sb); 2880 err = ext4_setup_system_zone(sb);
2860 if (err) { 2881 if (err) {
@@ -2910,6 +2931,8 @@ cantfind_ext4:
2910 2931
2911failed_mount4: 2932failed_mount4:
2912 ext4_msg(sb, KERN_ERR, "mount failed"); 2933 ext4_msg(sb, KERN_ERR, "mount failed");
2934 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
2935failed_mount_wq:
2913 ext4_release_system_zone(sb); 2936 ext4_release_system_zone(sb);
2914 if (sbi->s_journal) { 2937 if (sbi->s_journal) {
2915 jbd2_journal_destroy(sbi->s_journal); 2938 jbd2_journal_destroy(sbi->s_journal);
@@ -3164,9 +3187,7 @@ static int ext4_load_journal(struct super_block *sb,
3164 return -EINVAL; 3187 return -EINVAL;
3165 } 3188 }
3166 3189
3167 if (journal->j_flags & JBD2_BARRIER) 3190 if (!(journal->j_flags & JBD2_BARRIER))
3168 ext4_msg(sb, KERN_INFO, "barriers enabled");
3169 else
3170 ext4_msg(sb, KERN_INFO, "barriers disabled"); 3191 ext4_msg(sb, KERN_INFO, "barriers disabled");
3171 3192
3172 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 3193 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
@@ -3361,11 +3382,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
3361{ 3382{
3362 int ret = 0; 3383 int ret = 0;
3363 tid_t target; 3384 tid_t target;
3385 struct ext4_sb_info *sbi = EXT4_SB(sb);
3364 3386
3365 trace_ext4_sync_fs(sb, wait); 3387 trace_ext4_sync_fs(sb, wait);
3366 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { 3388 flush_workqueue(sbi->dio_unwritten_wq);
3389 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
3367 if (wait) 3390 if (wait)
3368 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); 3391 jbd2_log_wait_commit(sbi->s_journal, target);
3369 } 3392 }
3370 return ret; 3393 return ret;
3371} 3394}
@@ -3951,27 +3974,6 @@ static struct file_system_type ext4_fs_type = {
3951 .fs_flags = FS_REQUIRES_DEV, 3974 .fs_flags = FS_REQUIRES_DEV,
3952}; 3975};
3953 3976
3954#ifdef CONFIG_EXT4DEV_COMPAT
3955static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
3956 const char *dev_name, void *data,struct vfsmount *mnt)
3957{
3958 printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
3959 "to mount using ext4\n", dev_name);
3960 printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
3961 "will go away by 2.6.31\n", dev_name);
3962 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3963}
3964
3965static struct file_system_type ext4dev_fs_type = {
3966 .owner = THIS_MODULE,
3967 .name = "ext4dev",
3968 .get_sb = ext4dev_get_sb,
3969 .kill_sb = kill_block_super,
3970 .fs_flags = FS_REQUIRES_DEV,
3971};
3972MODULE_ALIAS("ext4dev");
3973#endif
3974
3975static int __init init_ext4_fs(void) 3977static int __init init_ext4_fs(void)
3976{ 3978{
3977 int err; 3979 int err;
@@ -3996,13 +3998,6 @@ static int __init init_ext4_fs(void)
3996 err = register_filesystem(&ext4_fs_type); 3998 err = register_filesystem(&ext4_fs_type);
3997 if (err) 3999 if (err)
3998 goto out; 4000 goto out;
3999#ifdef CONFIG_EXT4DEV_COMPAT
4000 err = register_filesystem(&ext4dev_fs_type);
4001 if (err) {
4002 unregister_filesystem(&ext4_fs_type);
4003 goto out;
4004 }
4005#endif
4006 return 0; 4001 return 0;
4007out: 4002out:
4008 destroy_inodecache(); 4003 destroy_inodecache();
@@ -4021,9 +4016,6 @@ out4:
4021static void __exit exit_ext4_fs(void) 4016static void __exit exit_ext4_fs(void)
4022{ 4017{
4023 unregister_filesystem(&ext4_fs_type); 4018 unregister_filesystem(&ext4_fs_type);
4024#ifdef CONFIG_EXT4DEV_COMPAT
4025 unregister_filesystem(&ext4dev_fs_type);
4026#endif
4027 destroy_inodecache(); 4019 destroy_inodecache();
4028 exit_ext4_xattr(); 4020 exit_ext4_xattr();
4029 exit_ext4_mballoc(); 4021 exit_ext4_mballoc();
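Several of the super.c hunks above wire one workqueue through the superblock's lifetime: created in ext4_fill_super(), flushed in ext4_sync_fs() so queued unwritten-extent conversions land before a commit, and flushed plus destroyed in ext4_put_super(). A compressed sketch of that lifecycle, with error paths trimmed:

/* mount (ext4_fill_super): create the queue; failure bails out through
 * the new failed_mount_wq label */
sbi->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");

/* sync (ext4_sync_fs): drain pending AIO/DIO completions before asking
 * jbd2 to start a commit */
flush_workqueue(sbi->dio_unwritten_wq);

/* unmount (ext4_put_super): drain and destroy before the journal and
 * superblock are torn down */
flush_workqueue(sbi->dio_unwritten_wq);
destroy_workqueue(sbi->dio_unwritten_wq);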
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index adb0e72a176..7db0979c6b7 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -323,7 +323,7 @@ extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
323/* fat/misc.c */ 323/* fat/misc.c */
324extern void fat_fs_error(struct super_block *s, const char *fmt, ...) 324extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
325 __attribute__ ((format (printf, 2, 3))) __cold; 325 __attribute__ ((format (printf, 2, 3))) __cold;
326extern void fat_clusters_flush(struct super_block *sb); 326extern int fat_clusters_flush(struct super_block *sb);
327extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 327extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
328extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 328extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
329 __le16 __time, __le16 __date, u8 time_cs); 329 __le16 __time, __le16 __date, u8 time_cs);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 04629d1302f..76b7961ab66 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -451,12 +451,16 @@ static void fat_write_super(struct super_block *sb)
451 451
452static int fat_sync_fs(struct super_block *sb, int wait) 452static int fat_sync_fs(struct super_block *sb, int wait)
453{ 453{
454 lock_super(sb); 454 int err = 0;
455 fat_clusters_flush(sb);
456 sb->s_dirt = 0;
457 unlock_super(sb);
458 455
459 return 0; 456 if (sb->s_dirt) {
457 lock_super(sb);
458 sb->s_dirt = 0;
459 err = fat_clusters_flush(sb);
460 unlock_super(sb);
461 }
462
463 return err;
460} 464}
461 465
462static void fat_put_super(struct super_block *sb) 466static void fat_put_super(struct super_block *sb)
@@ -812,7 +816,7 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
812 seq_puts(m, ",shortname=mixed"); 816 seq_puts(m, ",shortname=mixed");
813 break; 817 break;
814 case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95: 818 case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95:
815 /* seq_puts(m, ",shortname=lower"); */ 819 seq_puts(m, ",shortname=lower");
816 break; 820 break;
817 default: 821 default:
818 seq_puts(m, ",shortname=unknown"); 822 seq_puts(m, ",shortname=unknown");
@@ -963,7 +967,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
963 opts->codepage = fat_default_codepage; 967 opts->codepage = fat_default_codepage;
964 opts->iocharset = fat_default_iocharset; 968 opts->iocharset = fat_default_iocharset;
965 if (is_vfat) { 969 if (is_vfat) {
966 opts->shortname = VFAT_SFN_DISPLAY_LOWER|VFAT_SFN_CREATE_WIN95; 970 opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95;
967 opts->rodir = 0; 971 opts->rodir = 0;
968 } else { 972 } else {
969 opts->shortname = 0; 973 opts->shortname = 0;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 4e35be873e0..0f55f5cb732 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -43,19 +43,19 @@ EXPORT_SYMBOL_GPL(fat_fs_error);
43 43
44/* Flushes the number of free clusters on FAT32 */ 44/* Flushes the number of free clusters on FAT32 */
45/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 45/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
46void fat_clusters_flush(struct super_block *sb) 46int fat_clusters_flush(struct super_block *sb)
47{ 47{
48 struct msdos_sb_info *sbi = MSDOS_SB(sb); 48 struct msdos_sb_info *sbi = MSDOS_SB(sb);
49 struct buffer_head *bh; 49 struct buffer_head *bh;
50 struct fat_boot_fsinfo *fsinfo; 50 struct fat_boot_fsinfo *fsinfo;
51 51
52 if (sbi->fat_bits != 32) 52 if (sbi->fat_bits != 32)
53 return; 53 return 0;
54 54
55 bh = sb_bread(sb, sbi->fsinfo_sector); 55 bh = sb_bread(sb, sbi->fsinfo_sector);
56 if (bh == NULL) { 56 if (bh == NULL) {
57 printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n"); 57 printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n");
58 return; 58 return -EIO;
59 } 59 }
60 60
61 fsinfo = (struct fat_boot_fsinfo *)bh->b_data; 61 fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
@@ -74,6 +74,8 @@ void fat_clusters_flush(struct super_block *sb)
74 mark_buffer_dirty(bh); 74 mark_buffer_dirty(bh);
75 } 75 }
76 brelse(bh); 76 brelse(bh);
77
78 return 0;
77} 79}
78 80
79/* 81/*
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index cb6e8355711..f565f24019b 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -499,17 +499,10 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
499 int charlen; 499 int charlen;
500 500
501 if (utf8) { 501 if (utf8) {
502 int name_len = strlen(name); 502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
503 503 if (*outlen < 0)
504 *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname); 504 return *outlen;
505 505 else if (*outlen > 255)
506 /*
507 * We stripped '.'s before and set len appropriately,
508 * but utf8s_to_utf16s doesn't care about len
509 */
510 *outlen -= (name_len - len);
511
512 if (*outlen > 255)
513 return -ENAMETOOLONG; 506 return -ENAMETOOLONG;
514 507
515 op = &outname[*outlen * sizeof(wchar_t)]; 508 op = &outname[*outlen * sizeof(wchar_t)];
diff --git a/fs/fcntl.c b/fs/fcntl.c
index fc089f2f7f5..2cf93ec40a6 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -284,7 +284,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
284 type = PIDTYPE_PID; 284 type = PIDTYPE_PID;
285 break; 285 break;
286 286
287 case F_OWNER_GID: 287 case F_OWNER_PGRP:
288 type = PIDTYPE_PGID; 288 type = PIDTYPE_PGID;
289 break; 289 break;
290 290
@@ -321,7 +321,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
321 break; 321 break;
322 322
323 case PIDTYPE_PGID: 323 case PIDTYPE_PGID:
324 owner.type = F_OWNER_GID; 324 owner.type = F_OWNER_PGRP;
325 break; 325 break;
326 326
327 default: 327 default:
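The fcntl.c hunk above renames the misnamed F_OWNER_GID constant to F_OWNER_PGRP in the F_SETOWN_EX/F_GETOWN_EX paths. A small user-space usage sketch, assuming glibc exposes the 2.6.32-era f_owner_ex interface under _GNU_SOURCE:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

/* Direct SIGIO/SIGURG for this descriptor at the caller's process group. */
static int own_by_pgrp(int fd)
{
        struct f_owner_ex owner = {
                .type = F_OWNER_PGRP,    /* was misnamed F_OWNER_GID before this fix */
                .pid  = getpgrp(),
        };

        return fcntl(fd, F_SETOWN_EX, &owner);
}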
diff --git a/fs/file.c b/fs/file.c
index f313314f996..87e129030ab 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -10,6 +10,7 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/sched.h>
13#include <linux/slab.h> 14#include <linux/slab.h>
14#include <linux/vmalloc.h> 15#include <linux/vmalloc.h>
15#include <linux/file.h> 16#include <linux/file.h>
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
index 9bbb8ce7bea..864dac20a24 100644
--- a/fs/fscache/Kconfig
+++ b/fs/fscache/Kconfig
@@ -54,3 +54,10 @@ config FSCACHE_DEBUG
54 enabled by setting bits in /sys/modules/fscache/parameter/debug. 54 enabled by setting bits in /sys/modules/fscache/parameter/debug.
55 55
56 See Documentation/filesystems/caching/fscache.txt for more information. 56 See Documentation/filesystems/caching/fscache.txt for more information.
57
58config FSCACHE_OBJECT_LIST
59 bool "Maintain global object list for debugging purposes"
60 depends on FSCACHE && PROC_FS
61 help
62 Maintain a global list of active fscache objects that can be
 63	  retrieved through /proc/fs/fscache/objects for debugging purposes.
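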
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
index 91571b95aac..6d561531cb3 100644
--- a/fs/fscache/Makefile
+++ b/fs/fscache/Makefile
@@ -15,5 +15,6 @@ fscache-y := \
15fscache-$(CONFIG_PROC_FS) += proc.o 15fscache-$(CONFIG_PROC_FS) += proc.o
16fscache-$(CONFIG_FSCACHE_STATS) += stats.o 16fscache-$(CONFIG_FSCACHE_STATS) += stats.o
17fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o 17fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o
18fscache-$(CONFIG_FSCACHE_OBJECT_LIST) += object-list.o
18 19
19obj-$(CONFIG_FSCACHE) := fscache.o 20obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index e21985bbb1f..6a3c48abd67 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -263,6 +263,7 @@ int fscache_add_cache(struct fscache_cache *cache,
263 spin_lock(&cache->object_list_lock); 263 spin_lock(&cache->object_list_lock);
264 list_add_tail(&ifsdef->cache_link, &cache->object_list); 264 list_add_tail(&ifsdef->cache_link, &cache->object_list);
265 spin_unlock(&cache->object_list_lock); 265 spin_unlock(&cache->object_list_lock);
266 fscache_objlist_add(ifsdef);
266 267
267 /* add the cache's netfs definition index object to the top level index 268 /* add the cache's netfs definition index object to the top level index
268 * cookie as a known backing object */ 269 * cookie as a known backing object */
@@ -380,11 +381,15 @@ void fscache_withdraw_cache(struct fscache_cache *cache)
380 381
381 /* make sure all pages pinned by operations on behalf of the netfs are 382 /* make sure all pages pinned by operations on behalf of the netfs are
382 * written to disk */ 383 * written to disk */
384 fscache_stat(&fscache_n_cop_sync_cache);
383 cache->ops->sync_cache(cache); 385 cache->ops->sync_cache(cache);
386 fscache_stat_d(&fscache_n_cop_sync_cache);
384 387
385 /* dissociate all the netfs pages backed by this cache from the block 388 /* dissociate all the netfs pages backed by this cache from the block
386 * mappings in the cache */ 389 * mappings in the cache */
390 fscache_stat(&fscache_n_cop_dissociate_pages);
387 cache->ops->dissociate_pages(cache); 391 cache->ops->dissociate_pages(cache);
392 fscache_stat_d(&fscache_n_cop_dissociate_pages);
388 393
389 /* we now have to destroy all the active objects pertaining to this 394 /* we now have to destroy all the active objects pertaining to this
390 * cache - which we do by passing them off to thread pool to be 395 * cache - which we do by passing them off to thread pool to be
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 72fd18f6c71..990535071a8 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -36,6 +36,7 @@ void fscache_cookie_init_once(void *_cookie)
36 36
37 memset(cookie, 0, sizeof(*cookie)); 37 memset(cookie, 0, sizeof(*cookie));
38 spin_lock_init(&cookie->lock); 38 spin_lock_init(&cookie->lock);
39 spin_lock_init(&cookie->stores_lock);
39 INIT_HLIST_HEAD(&cookie->backing_objects); 40 INIT_HLIST_HEAD(&cookie->backing_objects);
40} 41}
41 42
@@ -102,7 +103,9 @@ struct fscache_cookie *__fscache_acquire_cookie(
102 cookie->netfs_data = netfs_data; 103 cookie->netfs_data = netfs_data;
103 cookie->flags = 0; 104 cookie->flags = 0;
104 105
105 INIT_RADIX_TREE(&cookie->stores, GFP_NOFS); 106 /* radix tree insertion won't use the preallocation pool unless it's
107 * told it may not wait */
108 INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT);
106 109
107 switch (cookie->def->type) { 110 switch (cookie->def->type) {
108 case FSCACHE_COOKIE_TYPE_INDEX: 111 case FSCACHE_COOKIE_TYPE_INDEX:
@@ -249,7 +252,9 @@ static int fscache_alloc_object(struct fscache_cache *cache,
249 252
250 /* ask the cache to allocate an object (we may end up with duplicate 253 /* ask the cache to allocate an object (we may end up with duplicate
251 * objects at this stage, but we sort that out later) */ 254 * objects at this stage, but we sort that out later) */
255 fscache_stat(&fscache_n_cop_alloc_object);
252 object = cache->ops->alloc_object(cache, cookie); 256 object = cache->ops->alloc_object(cache, cookie);
257 fscache_stat_d(&fscache_n_cop_alloc_object);
253 if (IS_ERR(object)) { 258 if (IS_ERR(object)) {
254 fscache_stat(&fscache_n_object_no_alloc); 259 fscache_stat(&fscache_n_object_no_alloc);
255 ret = PTR_ERR(object); 260 ret = PTR_ERR(object);
@@ -270,8 +275,11 @@ static int fscache_alloc_object(struct fscache_cache *cache,
270 /* only attach if we managed to allocate all we needed, otherwise 275 /* only attach if we managed to allocate all we needed, otherwise
271 * discard the object we just allocated and instead use the one 276 * discard the object we just allocated and instead use the one
272 * attached to the cookie */ 277 * attached to the cookie */
273 if (fscache_attach_object(cookie, object) < 0) 278 if (fscache_attach_object(cookie, object) < 0) {
279 fscache_stat(&fscache_n_cop_put_object);
274 cache->ops->put_object(object); 280 cache->ops->put_object(object);
281 fscache_stat_d(&fscache_n_cop_put_object);
282 }
275 283
276 _leave(" = 0"); 284 _leave(" = 0");
277 return 0; 285 return 0;
@@ -287,7 +295,9 @@ object_already_extant:
287 return 0; 295 return 0;
288 296
289error_put: 297error_put:
298 fscache_stat(&fscache_n_cop_put_object);
290 cache->ops->put_object(object); 299 cache->ops->put_object(object);
300 fscache_stat_d(&fscache_n_cop_put_object);
291error: 301error:
292 _leave(" = %d", ret); 302 _leave(" = %d", ret);
293 return ret; 303 return ret;
@@ -349,6 +359,8 @@ static int fscache_attach_object(struct fscache_cookie *cookie,
349 object->cookie = cookie; 359 object->cookie = cookie;
350 atomic_inc(&cookie->usage); 360 atomic_inc(&cookie->usage);
351 hlist_add_head(&object->cookie_link, &cookie->backing_objects); 361 hlist_add_head(&object->cookie_link, &cookie->backing_objects);
362
363 fscache_objlist_add(object);
352 ret = 0; 364 ret = 0;
353 365
354cant_attach_object: 366cant_attach_object:
@@ -403,6 +415,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
403 unsigned long event; 415 unsigned long event;
404 416
405 fscache_stat(&fscache_n_relinquishes); 417 fscache_stat(&fscache_n_relinquishes);
418 if (retire)
419 fscache_stat(&fscache_n_relinquishes_retire);
406 420
407 if (!cookie) { 421 if (!cookie) {
408 fscache_stat(&fscache_n_relinquishes_null); 422 fscache_stat(&fscache_n_relinquishes_null);
@@ -428,12 +442,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
428 442
429 event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; 443 event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
430 444
431 /* detach pointers back to the netfs */
432 spin_lock(&cookie->lock); 445 spin_lock(&cookie->lock);
433 446
434 cookie->netfs_data = NULL;
435 cookie->def = NULL;
436
437 /* break links with all the active objects */ 447 /* break links with all the active objects */
438 while (!hlist_empty(&cookie->backing_objects)) { 448 while (!hlist_empty(&cookie->backing_objects)) {
439 object = hlist_entry(cookie->backing_objects.first, 449 object = hlist_entry(cookie->backing_objects.first,
@@ -456,6 +466,10 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
456 BUG(); 466 BUG();
457 } 467 }
458 468
469 /* detach pointers back to the netfs */
470 cookie->netfs_data = NULL;
471 cookie->def = NULL;
472
459 spin_unlock(&cookie->lock); 473 spin_unlock(&cookie->lock);
460 474
461 if (cookie->parent) { 475 if (cookie->parent) {
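The INIT_RADIX_TREE change above masks __GFP_WAIT out of the stores tree's allocation mask, the stated intent being that insertions made under the new stores_lock cannot sleep and must draw nodes from the preload pool. A hedged sketch of the insertion pattern this enables, assuming the usual radix-tree preload API of that era:

/* preallocate nodes while sleeping is still allowed */
if (radix_tree_preload(GFP_NOFS))
        return -ENOMEM;

spin_lock(&cookie->stores_lock);
/* cannot sleep here: the GFP_NOFS & ~__GFP_WAIT mask forces the
 * insertion to take nodes from the per-CPU preload pool */
radix_tree_insert(&cookie->stores, page->index, page);
spin_unlock(&cookie->stores_lock);

radix_tree_preload_end();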
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 1c341304621..edd7434ab6e 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -17,6 +17,7 @@
17 * - cache->object_list_lock 17 * - cache->object_list_lock
18 * - object->lock 18 * - object->lock
19 * - object->parent->lock 19 * - object->parent->lock
20 * - cookie->stores_lock
20 * - fscache_thread_lock 21 * - fscache_thread_lock
21 * 22 *
22 */ 23 */
@@ -88,17 +89,31 @@ extern int fscache_wait_bit_interruptible(void *);
88/* 89/*
89 * object.c 90 * object.c
90 */ 91 */
92extern const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5];
93
91extern void fscache_withdrawing_object(struct fscache_cache *, 94extern void fscache_withdrawing_object(struct fscache_cache *,
92 struct fscache_object *); 95 struct fscache_object *);
93extern void fscache_enqueue_object(struct fscache_object *); 96extern void fscache_enqueue_object(struct fscache_object *);
94 97
95/* 98/*
99 * object-list.c
100 */
101#ifdef CONFIG_FSCACHE_OBJECT_LIST
102extern const struct file_operations fscache_objlist_fops;
103
104extern void fscache_objlist_add(struct fscache_object *);
105#else
106#define fscache_objlist_add(object) do {} while(0)
107#endif
108
109/*
96 * operation.c 110 * operation.c
97 */ 111 */
98extern int fscache_submit_exclusive_op(struct fscache_object *, 112extern int fscache_submit_exclusive_op(struct fscache_object *,
99 struct fscache_operation *); 113 struct fscache_operation *);
100extern int fscache_submit_op(struct fscache_object *, 114extern int fscache_submit_op(struct fscache_object *,
101 struct fscache_operation *); 115 struct fscache_operation *);
116extern int fscache_cancel_op(struct fscache_operation *);
102extern void fscache_abort_object(struct fscache_object *); 117extern void fscache_abort_object(struct fscache_object *);
103extern void fscache_start_operations(struct fscache_object *); 118extern void fscache_start_operations(struct fscache_object *);
104extern void fscache_operation_gc(struct work_struct *); 119extern void fscache_operation_gc(struct work_struct *);
@@ -127,6 +142,8 @@ extern atomic_t fscache_n_op_enqueue;
127extern atomic_t fscache_n_op_deferred_release; 142extern atomic_t fscache_n_op_deferred_release;
128extern atomic_t fscache_n_op_release; 143extern atomic_t fscache_n_op_release;
129extern atomic_t fscache_n_op_gc; 144extern atomic_t fscache_n_op_gc;
145extern atomic_t fscache_n_op_cancelled;
146extern atomic_t fscache_n_op_rejected;
130 147
131extern atomic_t fscache_n_attr_changed; 148extern atomic_t fscache_n_attr_changed;
132extern atomic_t fscache_n_attr_changed_ok; 149extern atomic_t fscache_n_attr_changed_ok;
@@ -138,6 +155,8 @@ extern atomic_t fscache_n_allocs;
138extern atomic_t fscache_n_allocs_ok; 155extern atomic_t fscache_n_allocs_ok;
139extern atomic_t fscache_n_allocs_wait; 156extern atomic_t fscache_n_allocs_wait;
140extern atomic_t fscache_n_allocs_nobufs; 157extern atomic_t fscache_n_allocs_nobufs;
158extern atomic_t fscache_n_allocs_intr;
159extern atomic_t fscache_n_allocs_object_dead;
141extern atomic_t fscache_n_alloc_ops; 160extern atomic_t fscache_n_alloc_ops;
142extern atomic_t fscache_n_alloc_op_waits; 161extern atomic_t fscache_n_alloc_op_waits;
143 162
@@ -148,6 +167,7 @@ extern atomic_t fscache_n_retrievals_nodata;
148extern atomic_t fscache_n_retrievals_nobufs; 167extern atomic_t fscache_n_retrievals_nobufs;
149extern atomic_t fscache_n_retrievals_intr; 168extern atomic_t fscache_n_retrievals_intr;
150extern atomic_t fscache_n_retrievals_nomem; 169extern atomic_t fscache_n_retrievals_nomem;
170extern atomic_t fscache_n_retrievals_object_dead;
151extern atomic_t fscache_n_retrieval_ops; 171extern atomic_t fscache_n_retrieval_ops;
152extern atomic_t fscache_n_retrieval_op_waits; 172extern atomic_t fscache_n_retrieval_op_waits;
153 173
@@ -158,6 +178,14 @@ extern atomic_t fscache_n_stores_nobufs;
158extern atomic_t fscache_n_stores_oom; 178extern atomic_t fscache_n_stores_oom;
159extern atomic_t fscache_n_store_ops; 179extern atomic_t fscache_n_store_ops;
160extern atomic_t fscache_n_store_calls; 180extern atomic_t fscache_n_store_calls;
181extern atomic_t fscache_n_store_pages;
182extern atomic_t fscache_n_store_radix_deletes;
183extern atomic_t fscache_n_store_pages_over_limit;
184
185extern atomic_t fscache_n_store_vmscan_not_storing;
186extern atomic_t fscache_n_store_vmscan_gone;
187extern atomic_t fscache_n_store_vmscan_busy;
188extern atomic_t fscache_n_store_vmscan_cancelled;
161 189
162extern atomic_t fscache_n_marks; 190extern atomic_t fscache_n_marks;
163extern atomic_t fscache_n_uncaches; 191extern atomic_t fscache_n_uncaches;
@@ -176,6 +204,7 @@ extern atomic_t fscache_n_updates_run;
176extern atomic_t fscache_n_relinquishes; 204extern atomic_t fscache_n_relinquishes;
177extern atomic_t fscache_n_relinquishes_null; 205extern atomic_t fscache_n_relinquishes_null;
178extern atomic_t fscache_n_relinquishes_waitcrt; 206extern atomic_t fscache_n_relinquishes_waitcrt;
207extern atomic_t fscache_n_relinquishes_retire;
179 208
180extern atomic_t fscache_n_cookie_index; 209extern atomic_t fscache_n_cookie_index;
181extern atomic_t fscache_n_cookie_data; 210extern atomic_t fscache_n_cookie_data;
@@ -186,6 +215,7 @@ extern atomic_t fscache_n_object_no_alloc;
186extern atomic_t fscache_n_object_lookups; 215extern atomic_t fscache_n_object_lookups;
187extern atomic_t fscache_n_object_lookups_negative; 216extern atomic_t fscache_n_object_lookups_negative;
188extern atomic_t fscache_n_object_lookups_positive; 217extern atomic_t fscache_n_object_lookups_positive;
218extern atomic_t fscache_n_object_lookups_timed_out;
189extern atomic_t fscache_n_object_created; 219extern atomic_t fscache_n_object_created;
190extern atomic_t fscache_n_object_avail; 220extern atomic_t fscache_n_object_avail;
191extern atomic_t fscache_n_object_dead; 221extern atomic_t fscache_n_object_dead;
@@ -195,15 +225,41 @@ extern atomic_t fscache_n_checkaux_okay;
195extern atomic_t fscache_n_checkaux_update; 225extern atomic_t fscache_n_checkaux_update;
196extern atomic_t fscache_n_checkaux_obsolete; 226extern atomic_t fscache_n_checkaux_obsolete;
197 227
228extern atomic_t fscache_n_cop_alloc_object;
229extern atomic_t fscache_n_cop_lookup_object;
230extern atomic_t fscache_n_cop_lookup_complete;
231extern atomic_t fscache_n_cop_grab_object;
232extern atomic_t fscache_n_cop_update_object;
233extern atomic_t fscache_n_cop_drop_object;
234extern atomic_t fscache_n_cop_put_object;
235extern atomic_t fscache_n_cop_sync_cache;
236extern atomic_t fscache_n_cop_attr_changed;
237extern atomic_t fscache_n_cop_read_or_alloc_page;
238extern atomic_t fscache_n_cop_read_or_alloc_pages;
239extern atomic_t fscache_n_cop_allocate_page;
240extern atomic_t fscache_n_cop_allocate_pages;
241extern atomic_t fscache_n_cop_write_page;
242extern atomic_t fscache_n_cop_uncache_page;
243extern atomic_t fscache_n_cop_dissociate_pages;
244
198static inline void fscache_stat(atomic_t *stat) 245static inline void fscache_stat(atomic_t *stat)
199{ 246{
200 atomic_inc(stat); 247 atomic_inc(stat);
201} 248}
202 249
250static inline void fscache_stat_d(atomic_t *stat)
251{
252 atomic_dec(stat);
253}
254
255#define __fscache_stat(stat) (stat)
256
203extern const struct file_operations fscache_stats_fops; 257extern const struct file_operations fscache_stats_fops;
204#else 258#else
205 259
260#define __fscache_stat(stat) (NULL)
206#define fscache_stat(stat) do {} while (0) 261#define fscache_stat(stat) do {} while (0)
262#define fscache_stat_d(stat) do {} while (0)
207#endif 263#endif
208 264
209/* 265/*
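The new fscache_n_cop_* counters declared above are incremented immediately before a call into the cache backend and decremented on return via fscache_stat_d(), so each one reads as the number of that backend operation currently in flight. A one-call sketch of the bracketing, mirroring the cookie.c hunk earlier in this diff:

fscache_stat(&fscache_n_cop_alloc_object);        /* op enters the backend */
object = cache->ops->alloc_object(cache, cookie);
fscache_stat_d(&fscache_n_cop_alloc_object);      /* op has returned */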
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 4de41b59749..add6bdb53f0 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -48,7 +48,7 @@ static int __init fscache_init(void)
48{ 48{
49 int ret; 49 int ret;
50 50
51 ret = slow_work_register_user(); 51 ret = slow_work_register_user(THIS_MODULE);
52 if (ret < 0) 52 if (ret < 0)
53 goto error_slow_work; 53 goto error_slow_work;
54 54
@@ -80,7 +80,7 @@ error_kobj:
80error_cookie_jar: 80error_cookie_jar:
81 fscache_proc_cleanup(); 81 fscache_proc_cleanup();
82error_proc: 82error_proc:
83 slow_work_unregister_user(); 83 slow_work_unregister_user(THIS_MODULE);
84error_slow_work: 84error_slow_work:
85 return ret; 85 return ret;
86} 86}
@@ -97,7 +97,7 @@ static void __exit fscache_exit(void)
97 kobject_put(fscache_root); 97 kobject_put(fscache_root);
98 kmem_cache_destroy(fscache_cookie_jar); 98 kmem_cache_destroy(fscache_cookie_jar);
99 fscache_proc_cleanup(); 99 fscache_proc_cleanup();
100 slow_work_unregister_user(); 100 slow_work_unregister_user(THIS_MODULE);
101 printk(KERN_NOTICE "FS-Cache: Unloaded\n"); 101 printk(KERN_NOTICE "FS-Cache: Unloaded\n");
102} 102}
103 103
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
new file mode 100644
index 00000000000..e590242fa41
--- /dev/null
+++ b/fs/fscache/object-list.c
@@ -0,0 +1,432 @@
1/* Global fscache object list maintainer and viewer
2 *
3 * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#define FSCACHE_DEBUG_LEVEL COOKIE
13#include <linux/module.h>
14#include <linux/seq_file.h>
15#include <linux/key.h>
16#include <keys/user-type.h>
17#include "internal.h"
18
19static struct rb_root fscache_object_list;
20static DEFINE_RWLOCK(fscache_object_list_lock);
21
22struct fscache_objlist_data {
23 unsigned long config; /* display configuration */
24#define FSCACHE_OBJLIST_CONFIG_KEY 0x00000001 /* show object keys */
25#define FSCACHE_OBJLIST_CONFIG_AUX 0x00000002 /* show object auxdata */
26#define FSCACHE_OBJLIST_CONFIG_COOKIE 0x00000004 /* show objects with cookies */
27#define FSCACHE_OBJLIST_CONFIG_NOCOOKIE 0x00000008 /* show objects without cookies */
28#define FSCACHE_OBJLIST_CONFIG_BUSY 0x00000010 /* show busy objects */
29#define FSCACHE_OBJLIST_CONFIG_IDLE 0x00000020 /* show idle objects */
30#define FSCACHE_OBJLIST_CONFIG_PENDWR 0x00000040 /* show objects with pending writes */
31#define FSCACHE_OBJLIST_CONFIG_NOPENDWR 0x00000080 /* show objects without pending writes */
32#define FSCACHE_OBJLIST_CONFIG_READS 0x00000100 /* show objects with active reads */
33#define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */
34#define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */
 35#define FSCACHE_OBJLIST_CONFIG_NOEVENTS	0x00000800 /* show objects without events */
36#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with slow work */
37#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without slow work */
38
39 u8 buf[512]; /* key and aux data buffer */
40};
41
42/*
43 * Add an object to the object list
44 * - we use the address of the fscache_object structure as the key into the
45 * tree
46 */
47void fscache_objlist_add(struct fscache_object *obj)
48{
49 struct fscache_object *xobj;
50 struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL;
51
52 write_lock(&fscache_object_list_lock);
53
54 while (*p) {
55 parent = *p;
56 xobj = rb_entry(parent, struct fscache_object, objlist_link);
57
58 if (obj < xobj)
59 p = &(*p)->rb_left;
60 else if (obj > xobj)
61 p = &(*p)->rb_right;
62 else
63 BUG();
64 }
65
66 rb_link_node(&obj->objlist_link, parent, p);
67 rb_insert_color(&obj->objlist_link, &fscache_object_list);
68
69 write_unlock(&fscache_object_list_lock);
70}
71
72/**
73 * fscache_object_destroy - Note that a cache object is about to be destroyed
74 * @object: The object to be destroyed
75 *
76 * Note the imminent destruction and deallocation of a cache object record.
77 */
78void fscache_object_destroy(struct fscache_object *obj)
79{
80 write_lock(&fscache_object_list_lock);
81
82 BUG_ON(RB_EMPTY_ROOT(&fscache_object_list));
83 rb_erase(&obj->objlist_link, &fscache_object_list);
84
85 write_unlock(&fscache_object_list_lock);
86}
87EXPORT_SYMBOL(fscache_object_destroy);
88
89/*
90 * find the object in the tree on or after the specified index
91 */
92static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
93{
94 struct fscache_object *pobj, *obj, *minobj = NULL;
95 struct rb_node *p;
96 unsigned long pos;
97
98 if (*_pos >= (unsigned long) ERR_PTR(-ENOENT))
99 return NULL;
100 pos = *_pos;
101
102 /* banners (can't represent line 0 by pos 0 as that would involve
103 * returning a NULL pointer) */
104 if (pos == 0)
105 return (struct fscache_object *) ++(*_pos);
106 if (pos < 3)
107 return (struct fscache_object *)pos;
108
109 pobj = (struct fscache_object *)pos;
110 p = fscache_object_list.rb_node;
111 while (p) {
112 obj = rb_entry(p, struct fscache_object, objlist_link);
113 if (pobj < obj) {
114 if (!minobj || minobj > obj)
115 minobj = obj;
116 p = p->rb_left;
117 } else if (pobj > obj) {
118 p = p->rb_right;
119 } else {
120 minobj = obj;
121 break;
122 }
123 obj = NULL;
124 }
125
126 if (!minobj)
127 *_pos = (unsigned long) ERR_PTR(-ENOENT);
128 else if (minobj != obj)
129 *_pos = (unsigned long) minobj;
130 return minobj;
131}
132
133/*
134 * set up the iterator to start reading from the first line
135 */
136static void *fscache_objlist_start(struct seq_file *m, loff_t *_pos)
137 __acquires(&fscache_object_list_lock)
138{
139 read_lock(&fscache_object_list_lock);
140 return fscache_objlist_lookup(_pos);
141}
142
143/*
144 * move to the next line
145 */
146static void *fscache_objlist_next(struct seq_file *m, void *v, loff_t *_pos)
147{
148 (*_pos)++;
149 return fscache_objlist_lookup(_pos);
150}
151
152/*
153 * clean up after reading
154 */
155static void fscache_objlist_stop(struct seq_file *m, void *v)
156 __releases(&fscache_object_list_lock)
157{
158 read_unlock(&fscache_object_list_lock);
159}
160
161/*
162 * display an object
163 */
164static int fscache_objlist_show(struct seq_file *m, void *v)
165{
166 struct fscache_objlist_data *data = m->private;
167 struct fscache_object *obj = v;
168 unsigned long config = data->config;
169 uint16_t keylen, auxlen;
170 char _type[3], *type;
171 bool no_cookie;
172 u8 *buf = data->buf, *p;
173
174 if ((unsigned long) v == 1) {
175 seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS"
176 " EM EV F S"
177 " | NETFS_COOKIE_DEF TY FL NETFS_DATA");
178 if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
179 FSCACHE_OBJLIST_CONFIG_AUX))
180 seq_puts(m, " ");
181 if (config & FSCACHE_OBJLIST_CONFIG_KEY)
182 seq_puts(m, "OBJECT_KEY");
183 if ((config & (FSCACHE_OBJLIST_CONFIG_KEY |
184 FSCACHE_OBJLIST_CONFIG_AUX)) ==
185 (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX))
186 seq_puts(m, ", ");
187 if (config & FSCACHE_OBJLIST_CONFIG_AUX)
188 seq_puts(m, "AUX_DATA");
189 seq_puts(m, "\n");
190 return 0;
191 }
192
193 if ((unsigned long) v == 2) {
194 seq_puts(m, "======== ======== ==== ===== === === === == ====="
195 " == == = ="
196 " | ================ == == ================");
197 if (config & (FSCACHE_OBJLIST_CONFIG_KEY |
198 FSCACHE_OBJLIST_CONFIG_AUX))
199 seq_puts(m, " ================");
200 seq_puts(m, "\n");
201 return 0;
202 }
203
204 /* filter out any unwanted objects */
205#define FILTER(criterion, _yes, _no) \
206 do { \
207 unsigned long yes = FSCACHE_OBJLIST_CONFIG_##_yes; \
208 unsigned long no = FSCACHE_OBJLIST_CONFIG_##_no; \
209 if (criterion) { \
210 if (!(config & yes)) \
211 return 0; \
212 } else { \
213 if (!(config & no)) \
214 return 0; \
215 } \
216 } while(0)
217
218 if (~config) {
219 FILTER(obj->cookie,
220 COOKIE, NOCOOKIE);
221 FILTER(obj->state != FSCACHE_OBJECT_ACTIVE ||
222 obj->n_ops != 0 ||
223 obj->n_obj_ops != 0 ||
224 obj->flags ||
225 !list_empty(&obj->dependents),
226 BUSY, IDLE);
227 FILTER(test_bit(FSCACHE_OBJECT_PENDING_WRITE, &obj->flags),
228 PENDWR, NOPENDWR);
229 FILTER(atomic_read(&obj->n_reads),
230 READS, NOREADS);
231 FILTER(obj->events & obj->event_mask,
232 EVENTS, NOEVENTS);
233 FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW),
234 WORK, NOWORK);
235 }
236
237 seq_printf(m,
238 "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ",
239 obj->debug_id,
240 obj->parent ? obj->parent->debug_id : -1,
241 fscache_object_states_short[obj->state],
242 obj->n_children,
243 obj->n_ops,
244 obj->n_obj_ops,
245 obj->n_in_progress,
246 obj->n_exclusive,
247 atomic_read(&obj->n_reads),
248 obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
249 obj->events,
250 obj->flags,
251 obj->work.flags);
252
253 no_cookie = true;
254 keylen = auxlen = 0;
255 if (obj->cookie) {
256 spin_lock(&obj->lock);
257 if (obj->cookie) {
258 switch (obj->cookie->def->type) {
259 case 0:
260 type = "IX";
261 break;
262 case 1:
263 type = "DT";
264 break;
265 default:
266 sprintf(_type, "%02u",
267 obj->cookie->def->type);
268 type = _type;
269 break;
270 }
271
272 seq_printf(m, "%-16s %s %2lx %16p",
273 obj->cookie->def->name,
274 type,
275 obj->cookie->flags,
276 obj->cookie->netfs_data);
277
278 if (obj->cookie->def->get_key &&
279 config & FSCACHE_OBJLIST_CONFIG_KEY)
280 keylen = obj->cookie->def->get_key(
281 obj->cookie->netfs_data,
282 buf, 400);
283
284 if (obj->cookie->def->get_aux &&
285 config & FSCACHE_OBJLIST_CONFIG_AUX)
286 auxlen = obj->cookie->def->get_aux(
287 obj->cookie->netfs_data,
288 buf + keylen, 512 - keylen);
289
290 no_cookie = false;
291 }
292 spin_unlock(&obj->lock);
293
294 if (!no_cookie && (keylen > 0 || auxlen > 0)) {
295 seq_printf(m, " ");
296 for (p = buf; keylen > 0; keylen--)
297 seq_printf(m, "%02x", *p++);
298 if (auxlen > 0) {
299 if (config & FSCACHE_OBJLIST_CONFIG_KEY)
300 seq_printf(m, ", ");
301 for (; auxlen > 0; auxlen--)
302 seq_printf(m, "%02x", *p++);
303 }
304 }
305 }
306
307 if (no_cookie)
308 seq_printf(m, "<no_cookie>\n");
309 else
310 seq_printf(m, "\n");
311 return 0;
312}
313
314static const struct seq_operations fscache_objlist_ops = {
315 .start = fscache_objlist_start,
316 .stop = fscache_objlist_stop,
317 .next = fscache_objlist_next,
318 .show = fscache_objlist_show,
319};
320
321/*
322 * get the configuration for filtering the list
323 */
324static void fscache_objlist_config(struct fscache_objlist_data *data)
325{
326#ifdef CONFIG_KEYS
327 struct user_key_payload *confkey;
328 unsigned long config;
329 struct key *key;
330 const char *buf;
331 int len;
332
333 key = request_key(&key_type_user, "fscache:objlist", NULL);
334 if (IS_ERR(key))
335 goto no_config;
336
337 config = 0;
338 rcu_read_lock();
339
340 confkey = key->payload.data;
341 buf = confkey->data;
342
343 for (len = confkey->datalen - 1; len >= 0; len--) {
344 switch (buf[len]) {
345 case 'K': config |= FSCACHE_OBJLIST_CONFIG_KEY; break;
346 case 'A': config |= FSCACHE_OBJLIST_CONFIG_AUX; break;
347 case 'C': config |= FSCACHE_OBJLIST_CONFIG_COOKIE; break;
348 case 'c': config |= FSCACHE_OBJLIST_CONFIG_NOCOOKIE; break;
349 case 'B': config |= FSCACHE_OBJLIST_CONFIG_BUSY; break;
350 case 'b': config |= FSCACHE_OBJLIST_CONFIG_IDLE; break;
351 case 'W': config |= FSCACHE_OBJLIST_CONFIG_PENDWR; break;
352 case 'w': config |= FSCACHE_OBJLIST_CONFIG_NOPENDWR; break;
353 case 'R': config |= FSCACHE_OBJLIST_CONFIG_READS; break;
354 case 'r': config |= FSCACHE_OBJLIST_CONFIG_NOREADS; break;
355 case 'S': config |= FSCACHE_OBJLIST_CONFIG_WORK; break;
356 case 's': config |= FSCACHE_OBJLIST_CONFIG_NOWORK; break;
357 }
358 }
359
360 rcu_read_unlock();
361 key_put(key);
362
363 if (!(config & (FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE)))
364 config |= FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE;
365 if (!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE)))
366 config |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE;
367 if (!(config & (FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR)))
368 config |= FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR;
369 if (!(config & (FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS)))
370 config |= FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS;
371 if (!(config & (FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS)))
372 config |= FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS;
373 if (!(config & (FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK)))
374 config |= FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK;
375
376 data->config = config;
377 return;
378
379no_config:
380#endif
381 data->config = ULONG_MAX;
382}
383
384/*
385 * open "/proc/fs/fscache/objects" to provide a list of active objects
386 * - can be configured by a user-defined key added to the caller's keyrings
387 */
388static int fscache_objlist_open(struct inode *inode, struct file *file)
389{
390 struct fscache_objlist_data *data;
391 struct seq_file *m;
392 int ret;
393
394 ret = seq_open(file, &fscache_objlist_ops);
395 if (ret < 0)
396 return ret;
397
398 m = file->private_data;
399
400 /* buffer for key extraction */
401 data = kmalloc(sizeof(struct fscache_objlist_data), GFP_KERNEL);
402 if (!data) {
403 seq_release(inode, file);
404 return -ENOMEM;
405 }
406
407 /* get the configuration key */
408 fscache_objlist_config(data);
409
410 m->private = data;
411 return 0;
412}
413
414/*
415 * clean up on close
416 */
417static int fscache_objlist_release(struct inode *inode, struct file *file)
418{
419 struct seq_file *m = file->private_data;
420
421 kfree(m->private);
422 m->private = NULL;
423 return seq_release(inode, file);
424}
425
426const struct file_operations fscache_objlist_fops = {
427 .owner = THIS_MODULE,
428 .open = fscache_objlist_open,
429 .read = seq_read,
430 .llseek = seq_lseek,
431 .release = fscache_objlist_release,
432};
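fscache_objlist_config() above builds its display filter from a user key named "fscache:objlist" in the caller's keyrings, one letter per FSCACHE_OBJLIST_CONFIG_* flag (e.g. 'K' to show object keys, 'B' to show only busy objects); if no key is found, everything is shown. A hedged user-space sketch of installing such a key before reading /proc/fs/fscache/objects, assuming the keyutils library is available:

#include <keyutils.h>
#include <stdio.h>

int main(void)
{
        /* ask for busy objects only, with their keys, in the listing */
        if (add_key("user", "fscache:objlist", "KB", 2,
                    KEY_SPEC_SESSION_KEYRING) < 0) {
                perror("add_key");
                return 1;
        }
        /* /proc/fs/fscache/objects can now be read as usual */
        return 0;
}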
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 392a41b1b79..e513ac599c8 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,9 +14,10 @@
14 14
15#define FSCACHE_DEBUG_LEVEL COOKIE 15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/seq_file.h>
17#include "internal.h" 18#include "internal.h"
18 19
19const char *fscache_object_states[] = { 20const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
20 [FSCACHE_OBJECT_INIT] = "OBJECT_INIT", 21 [FSCACHE_OBJECT_INIT] = "OBJECT_INIT",
21 [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP", 22 [FSCACHE_OBJECT_LOOKING_UP] = "OBJECT_LOOKING_UP",
22 [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING", 23 [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
@@ -33,9 +34,28 @@ const char *fscache_object_states[] = {
33}; 34};
34EXPORT_SYMBOL(fscache_object_states); 35EXPORT_SYMBOL(fscache_object_states);
35 36
37const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
38 [FSCACHE_OBJECT_INIT] = "INIT",
39 [FSCACHE_OBJECT_LOOKING_UP] = "LOOK",
40 [FSCACHE_OBJECT_CREATING] = "CRTN",
41 [FSCACHE_OBJECT_AVAILABLE] = "AVBL",
42 [FSCACHE_OBJECT_ACTIVE] = "ACTV",
43 [FSCACHE_OBJECT_UPDATING] = "UPDT",
44 [FSCACHE_OBJECT_DYING] = "DYNG",
45 [FSCACHE_OBJECT_LC_DYING] = "LCDY",
46 [FSCACHE_OBJECT_ABORT_INIT] = "ABTI",
47 [FSCACHE_OBJECT_RELEASING] = "RELS",
48 [FSCACHE_OBJECT_RECYCLING] = "RCYC",
49 [FSCACHE_OBJECT_WITHDRAWING] = "WTHD",
50 [FSCACHE_OBJECT_DEAD] = "DEAD",
51};
52
36static void fscache_object_slow_work_put_ref(struct slow_work *); 53static void fscache_object_slow_work_put_ref(struct slow_work *);
37static int fscache_object_slow_work_get_ref(struct slow_work *); 54static int fscache_object_slow_work_get_ref(struct slow_work *);
38static void fscache_object_slow_work_execute(struct slow_work *); 55static void fscache_object_slow_work_execute(struct slow_work *);
56#ifdef CONFIG_SLOW_WORK_PROC
57static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *);
58#endif
39static void fscache_initialise_object(struct fscache_object *); 59static void fscache_initialise_object(struct fscache_object *);
40static void fscache_lookup_object(struct fscache_object *); 60static void fscache_lookup_object(struct fscache_object *);
41static void fscache_object_available(struct fscache_object *); 61static void fscache_object_available(struct fscache_object *);
@@ -45,9 +65,13 @@ static void fscache_enqueue_dependents(struct fscache_object *);
45static void fscache_dequeue_object(struct fscache_object *); 65static void fscache_dequeue_object(struct fscache_object *);
46 66
47const struct slow_work_ops fscache_object_slow_work_ops = { 67const struct slow_work_ops fscache_object_slow_work_ops = {
68 .owner = THIS_MODULE,
48 .get_ref = fscache_object_slow_work_get_ref, 69 .get_ref = fscache_object_slow_work_get_ref,
49 .put_ref = fscache_object_slow_work_put_ref, 70 .put_ref = fscache_object_slow_work_put_ref,
50 .execute = fscache_object_slow_work_execute, 71 .execute = fscache_object_slow_work_execute,
72#ifdef CONFIG_SLOW_WORK_PROC
73 .desc = fscache_object_slow_work_desc,
74#endif
51}; 75};
52EXPORT_SYMBOL(fscache_object_slow_work_ops); 76EXPORT_SYMBOL(fscache_object_slow_work_ops);
53 77
@@ -81,6 +105,7 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
81static void fscache_object_state_machine(struct fscache_object *object) 105static void fscache_object_state_machine(struct fscache_object *object)
82{ 106{
83 enum fscache_object_state new_state; 107 enum fscache_object_state new_state;
108 struct fscache_cookie *cookie;
84 109
85 ASSERT(object != NULL); 110 ASSERT(object != NULL);
86 111
@@ -120,20 +145,31 @@ static void fscache_object_state_machine(struct fscache_object *object)
120 case FSCACHE_OBJECT_UPDATING: 145 case FSCACHE_OBJECT_UPDATING:
121 clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events); 146 clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
122 fscache_stat(&fscache_n_updates_run); 147 fscache_stat(&fscache_n_updates_run);
148 fscache_stat(&fscache_n_cop_update_object);
123 object->cache->ops->update_object(object); 149 object->cache->ops->update_object(object);
150 fscache_stat_d(&fscache_n_cop_update_object);
124 goto active_transit; 151 goto active_transit;
125 152
126 /* handle an object dying during lookup or creation */ 153 /* handle an object dying during lookup or creation */
127 case FSCACHE_OBJECT_LC_DYING: 154 case FSCACHE_OBJECT_LC_DYING:
128 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE); 155 object->event_mask &= ~(1 << FSCACHE_OBJECT_EV_UPDATE);
156 fscache_stat(&fscache_n_cop_lookup_complete);
129 object->cache->ops->lookup_complete(object); 157 object->cache->ops->lookup_complete(object);
158 fscache_stat_d(&fscache_n_cop_lookup_complete);
130 159
131 spin_lock(&object->lock); 160 spin_lock(&object->lock);
132 object->state = FSCACHE_OBJECT_DYING; 161 object->state = FSCACHE_OBJECT_DYING;
133 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, 162 cookie = object->cookie;
134 &object->cookie->flags)) 163 if (cookie) {
135 wake_up_bit(&object->cookie->flags, 164 if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP,
136 FSCACHE_COOKIE_CREATING); 165 &cookie->flags))
166 wake_up_bit(&cookie->flags,
167 FSCACHE_COOKIE_LOOKING_UP);
168 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING,
169 &cookie->flags))
170 wake_up_bit(&cookie->flags,
171 FSCACHE_COOKIE_CREATING);
172 }
137 spin_unlock(&object->lock); 173 spin_unlock(&object->lock);
138 174
139 fscache_done_parent_op(object); 175 fscache_done_parent_op(object);
@@ -165,6 +201,7 @@ static void fscache_object_state_machine(struct fscache_object *object)
165 } 201 }
166 spin_unlock(&object->lock); 202 spin_unlock(&object->lock);
167 fscache_enqueue_dependents(object); 203 fscache_enqueue_dependents(object);
204 fscache_start_operations(object);
168 goto terminal_transit; 205 goto terminal_transit;
169 206
170 /* handle an abort during initialisation */ 207 /* handle an abort during initialisation */
@@ -316,14 +353,29 @@ static void fscache_object_slow_work_execute(struct slow_work *work)
316 353
317 _enter("{OBJ%x}", object->debug_id); 354 _enter("{OBJ%x}", object->debug_id);
318 355
319 clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
320
321 start = jiffies; 356 start = jiffies;
322 fscache_object_state_machine(object); 357 fscache_object_state_machine(object);
323 fscache_hist(fscache_objs_histogram, start); 358 fscache_hist(fscache_objs_histogram, start);
324 if (object->events & object->event_mask) 359 if (object->events & object->event_mask)
325 fscache_enqueue_object(object); 360 fscache_enqueue_object(object);
361 clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
362}
363
364/*
365 * describe an object for slow-work debugging
366 */
367#ifdef CONFIG_SLOW_WORK_PROC
368static void fscache_object_slow_work_desc(struct slow_work *work,
369 struct seq_file *m)
370{
371 struct fscache_object *object =
372 container_of(work, struct fscache_object, work);
373
374 seq_printf(m, "FSC: OBJ%x: %s",
375 object->debug_id,
376 fscache_object_states_short[object->state]);
326} 377}
378#endif
327 379
328/* 380/*
329 * initialise an object 381 * initialise an object
@@ -376,7 +428,9 @@ static void fscache_initialise_object(struct fscache_object *object)
376 * binding on to us, so we need to make sure we don't 428 * binding on to us, so we need to make sure we don't
377 * add ourself to the list multiple times */ 429 * add ourself to the list multiple times */
378 if (list_empty(&object->dep_link)) { 430 if (list_empty(&object->dep_link)) {
431 fscache_stat(&fscache_n_cop_grab_object);
379 object->cache->ops->grab_object(object); 432 object->cache->ops->grab_object(object);
433 fscache_stat_d(&fscache_n_cop_grab_object);
380 list_add(&object->dep_link, 434 list_add(&object->dep_link,
381 &parent->dependents); 435 &parent->dependents);
382 436
@@ -414,6 +468,7 @@ static void fscache_lookup_object(struct fscache_object *object)
414{ 468{
415 struct fscache_cookie *cookie = object->cookie; 469 struct fscache_cookie *cookie = object->cookie;
416 struct fscache_object *parent; 470 struct fscache_object *parent;
471 int ret;
417 472
418 _enter(""); 473 _enter("");
419 474
@@ -438,11 +493,20 @@ static void fscache_lookup_object(struct fscache_object *object)
438 object->cache->tag->name); 493 object->cache->tag->name);
439 494
440 fscache_stat(&fscache_n_object_lookups); 495 fscache_stat(&fscache_n_object_lookups);
441 object->cache->ops->lookup_object(object); 496 fscache_stat(&fscache_n_cop_lookup_object);
497 ret = object->cache->ops->lookup_object(object);
498 fscache_stat_d(&fscache_n_cop_lookup_object);
442 499
443 if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events)) 500 if (test_bit(FSCACHE_OBJECT_EV_ERROR, &object->events))
444 set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); 501 set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
445 502
503 if (ret == -ETIMEDOUT) {
504 /* probably stuck behind another object, so move this one to
505 * the back of the queue */
506 fscache_stat(&fscache_n_object_lookups_timed_out);
507 set_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events);
508 }
509
446 _leave(""); 510 _leave("");
447} 511}
448 512
@@ -546,7 +610,8 @@ static void fscache_object_available(struct fscache_object *object)
546 610
547 spin_lock(&object->lock); 611 spin_lock(&object->lock);
548 612
549 if (test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags)) 613 if (object->cookie &&
614 test_and_clear_bit(FSCACHE_COOKIE_CREATING, &object->cookie->flags))
550 wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING); 615 wake_up_bit(&object->cookie->flags, FSCACHE_COOKIE_CREATING);
551 616
552 fscache_done_parent_op(object); 617 fscache_done_parent_op(object);
@@ -562,7 +627,9 @@ static void fscache_object_available(struct fscache_object *object)
562 } 627 }
563 spin_unlock(&object->lock); 628 spin_unlock(&object->lock);
564 629
630 fscache_stat(&fscache_n_cop_lookup_complete);
565 object->cache->ops->lookup_complete(object); 631 object->cache->ops->lookup_complete(object);
632 fscache_stat_d(&fscache_n_cop_lookup_complete);
566 fscache_enqueue_dependents(object); 633 fscache_enqueue_dependents(object);
567 634
568 fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); 635 fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif);
@@ -581,11 +648,16 @@ static void fscache_drop_object(struct fscache_object *object)
581 648
582 _enter("{OBJ%x,%d}", object->debug_id, object->n_children); 649 _enter("{OBJ%x,%d}", object->debug_id, object->n_children);
583 650
651 ASSERTCMP(object->cookie, ==, NULL);
652 ASSERT(hlist_unhashed(&object->cookie_link));
653
584 spin_lock(&cache->object_list_lock); 654 spin_lock(&cache->object_list_lock);
585 list_del_init(&object->cache_link); 655 list_del_init(&object->cache_link);
586 spin_unlock(&cache->object_list_lock); 656 spin_unlock(&cache->object_list_lock);
587 657
658 fscache_stat(&fscache_n_cop_drop_object);
588 cache->ops->drop_object(object); 659 cache->ops->drop_object(object);
660 fscache_stat_d(&fscache_n_cop_drop_object);
589 661
590 if (parent) { 662 if (parent) {
591 _debug("release parent OBJ%x {%d}", 663 _debug("release parent OBJ%x {%d}",
@@ -600,7 +672,9 @@ static void fscache_drop_object(struct fscache_object *object)
600 } 672 }
601 673
602 /* this just shifts the object release to the slow work processor */ 674 /* this just shifts the object release to the slow work processor */
675 fscache_stat(&fscache_n_cop_put_object);
603 object->cache->ops->put_object(object); 676 object->cache->ops->put_object(object);
677 fscache_stat_d(&fscache_n_cop_put_object);
604 678
605 _leave(""); 679 _leave("");
606} 680}
@@ -690,8 +764,12 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work)
690{ 764{
691 struct fscache_object *object = 765 struct fscache_object *object =
692 container_of(work, struct fscache_object, work); 766 container_of(work, struct fscache_object, work);
767 int ret;
693 768
694 return object->cache->ops->grab_object(object) ? 0 : -EAGAIN; 769 fscache_stat(&fscache_n_cop_grab_object);
770 ret = object->cache->ops->grab_object(object) ? 0 : -EAGAIN;
771 fscache_stat_d(&fscache_n_cop_grab_object);
772 return ret;
695} 773}
696 774
697/* 775/*
@@ -702,7 +780,9 @@ static void fscache_object_slow_work_put_ref(struct slow_work *work)
702 struct fscache_object *object = 780 struct fscache_object *object =
703 container_of(work, struct fscache_object, work); 781 container_of(work, struct fscache_object, work);
704 782
705 return object->cache->ops->put_object(object); 783 fscache_stat(&fscache_n_cop_put_object);
784 object->cache->ops->put_object(object);
785 fscache_stat_d(&fscache_n_cop_put_object);
706} 786}
707 787
708/* 788/*
@@ -739,7 +819,9 @@ static void fscache_enqueue_dependents(struct fscache_object *object)
739 819
740 /* sort onto appropriate lists */ 820 /* sort onto appropriate lists */
741 fscache_enqueue_object(dep); 821 fscache_enqueue_object(dep);
822 fscache_stat(&fscache_n_cop_put_object);
742 dep->cache->ops->put_object(dep); 823 dep->cache->ops->put_object(dep);
824 fscache_stat_d(&fscache_n_cop_put_object);
743 825
744 if (!list_empty(&object->dependents)) 826 if (!list_empty(&object->dependents))
745 cond_resched_lock(&object->lock); 827 cond_resched_lock(&object->lock);
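The fscache_lookup_object() change above lets the cache backend's ->lookup_object() method return an error; -ETIMEDOUT in particular now bumps the new lookups-timed-out counter and sets FSCACHE_OBJECT_EV_REQUEUE, moving the object to the back of the queue on the assumption that it is stuck behind another object. A minimal sketch of a backend using that, not part of this diff (my_backend_lookup_busy() is a hypothetical helper):

	/* hypothetical congestion test supplied by the backend */
	static bool my_backend_lookup_busy(struct fscache_object *object);

	static int my_backend_lookup_object(struct fscache_object *object)
	{
		/* if this lookup would have to wait behind another object,
		 * hand the thread back and let FS-Cache requeue the object */
		if (my_backend_lookup_busy(object))
			return -ETIMEDOUT;

		/* ... perform the real index/data lookup here ... */
		return 0;
	}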
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index e7f8d53b8b6..313e79a1426 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -13,6 +13,7 @@
13 13
14#define FSCACHE_DEBUG_LEVEL OPERATION 14#define FSCACHE_DEBUG_LEVEL OPERATION
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/seq_file.h>
16#include "internal.h" 17#include "internal.h"
17 18
18atomic_t fscache_op_debug_id; 19atomic_t fscache_op_debug_id;
@@ -31,32 +32,33 @@ void fscache_enqueue_operation(struct fscache_operation *op)
31 _enter("{OBJ%x OP%x,%u}", 32 _enter("{OBJ%x OP%x,%u}",
32 op->object->debug_id, op->debug_id, atomic_read(&op->usage)); 33 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
33 34
35 fscache_set_op_state(op, "EnQ");
36
37 ASSERT(list_empty(&op->pend_link));
34 ASSERT(op->processor != NULL); 38 ASSERT(op->processor != NULL);
35 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); 39 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
36 ASSERTCMP(atomic_read(&op->usage), >, 0); 40 ASSERTCMP(atomic_read(&op->usage), >, 0);
37 41
38 if (list_empty(&op->pend_link)) { 42 fscache_stat(&fscache_n_op_enqueue);
39 switch (op->flags & FSCACHE_OP_TYPE) { 43 switch (op->flags & FSCACHE_OP_TYPE) {
40 case FSCACHE_OP_FAST: 44 case FSCACHE_OP_FAST:
41 _debug("queue fast"); 45 _debug("queue fast");
42 atomic_inc(&op->usage); 46 atomic_inc(&op->usage);
43 if (!schedule_work(&op->fast_work)) 47 if (!schedule_work(&op->fast_work))
44 fscache_put_operation(op); 48 fscache_put_operation(op);
45 break; 49 break;
46 case FSCACHE_OP_SLOW: 50 case FSCACHE_OP_SLOW:
47 _debug("queue slow"); 51 _debug("queue slow");
48 slow_work_enqueue(&op->slow_work); 52 slow_work_enqueue(&op->slow_work);
49 break; 53 break;
50 case FSCACHE_OP_MYTHREAD: 54 case FSCACHE_OP_MYTHREAD:
51 _debug("queue for caller's attention"); 55 _debug("queue for caller's attention");
52 break; 56 break;
53 default: 57 default:
54 printk(KERN_ERR "FS-Cache: Unexpected op type %lx", 58 printk(KERN_ERR "FS-Cache: Unexpected op type %lx",
55 op->flags); 59 op->flags);
56 BUG(); 60 BUG();
57 break; 61 break;
58 }
59 fscache_stat(&fscache_n_op_enqueue);
60 } 62 }
61} 63}
62EXPORT_SYMBOL(fscache_enqueue_operation); 64EXPORT_SYMBOL(fscache_enqueue_operation);
@@ -67,6 +69,8 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
67static void fscache_run_op(struct fscache_object *object, 69static void fscache_run_op(struct fscache_object *object,
68 struct fscache_operation *op) 70 struct fscache_operation *op)
69{ 71{
72 fscache_set_op_state(op, "Run");
73
70 object->n_in_progress++; 74 object->n_in_progress++;
71 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) 75 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
72 wake_up_bit(&op->flags, FSCACHE_OP_WAITING); 76 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -87,9 +91,12 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
87 91
88 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); 92 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
89 93
94 fscache_set_op_state(op, "SubmitX");
95
90 spin_lock(&object->lock); 96 spin_lock(&object->lock);
91 ASSERTCMP(object->n_ops, >=, object->n_in_progress); 97 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
92 ASSERTCMP(object->n_ops, >=, object->n_exclusive); 98 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
99 ASSERT(list_empty(&op->pend_link));
93 100
94 ret = -ENOBUFS; 101 ret = -ENOBUFS;
95 if (fscache_object_is_active(object)) { 102 if (fscache_object_is_active(object)) {
@@ -190,9 +197,12 @@ int fscache_submit_op(struct fscache_object *object,
190 197
191 ASSERTCMP(atomic_read(&op->usage), >, 0); 198 ASSERTCMP(atomic_read(&op->usage), >, 0);
192 199
200 fscache_set_op_state(op, "Submit");
201
193 spin_lock(&object->lock); 202 spin_lock(&object->lock);
194 ASSERTCMP(object->n_ops, >=, object->n_in_progress); 203 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
195 ASSERTCMP(object->n_ops, >=, object->n_exclusive); 204 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
205 ASSERT(list_empty(&op->pend_link));
196 206
197 ostate = object->state; 207 ostate = object->state;
198 smp_rmb(); 208 smp_rmb();
@@ -222,6 +232,11 @@ int fscache_submit_op(struct fscache_object *object,
222 list_add_tail(&op->pend_link, &object->pending_ops); 232 list_add_tail(&op->pend_link, &object->pending_ops);
223 fscache_stat(&fscache_n_op_pend); 233 fscache_stat(&fscache_n_op_pend);
224 ret = 0; 234 ret = 0;
235 } else if (object->state == FSCACHE_OBJECT_DYING ||
236 object->state == FSCACHE_OBJECT_LC_DYING ||
237 object->state == FSCACHE_OBJECT_WITHDRAWING) {
238 fscache_stat(&fscache_n_op_rejected);
239 ret = -ENOBUFS;
225 } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { 240 } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
226 fscache_report_unexpected_submission(object, op, ostate); 241 fscache_report_unexpected_submission(object, op, ostate);
227 ASSERT(!fscache_object_is_active(object)); 242 ASSERT(!fscache_object_is_active(object));
@@ -264,12 +279,7 @@ void fscache_start_operations(struct fscache_object *object)
264 stop = true; 279 stop = true;
265 } 280 }
266 list_del_init(&op->pend_link); 281 list_del_init(&op->pend_link);
267 object->n_in_progress++; 282 fscache_run_op(object, op);
268
269 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
270 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
271 if (op->processor)
272 fscache_enqueue_operation(op);
273 283
274 /* the pending queue was holding a ref on the object */ 284 /* the pending queue was holding a ref on the object */
275 fscache_put_operation(op); 285 fscache_put_operation(op);
@@ -282,6 +292,36 @@ void fscache_start_operations(struct fscache_object *object)
282} 292}
283 293
284/* 294/*
295 * cancel an operation that's pending on an object
296 */
297int fscache_cancel_op(struct fscache_operation *op)
298{
299 struct fscache_object *object = op->object;
300 int ret;
301
302 _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id);
303
304 spin_lock(&object->lock);
305
306 ret = -EBUSY;
307 if (!list_empty(&op->pend_link)) {
308 fscache_stat(&fscache_n_op_cancelled);
309 list_del_init(&op->pend_link);
310 object->n_ops--;
311 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
312 object->n_exclusive--;
313 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
314 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
315 fscache_put_operation(op);
316 ret = 0;
317 }
318
319 spin_unlock(&object->lock);
320 _leave(" = %d", ret);
321 return ret;
322}
323
324/*
285 * release an operation 325 * release an operation
286 * - queues pending ops if this is the last in-progress op 326 * - queues pending ops if this is the last in-progress op
287 */ 327 */
@@ -298,6 +338,8 @@ void fscache_put_operation(struct fscache_operation *op)
298 if (!atomic_dec_and_test(&op->usage)) 338 if (!atomic_dec_and_test(&op->usage))
299 return; 339 return;
300 340
341 fscache_set_op_state(op, "Put");
342
301 _debug("PUT OP"); 343 _debug("PUT OP");
302 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) 344 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
303 BUG(); 345 BUG();
@@ -311,6 +353,9 @@ void fscache_put_operation(struct fscache_operation *op)
311 353
312 object = op->object; 354 object = op->object;
313 355
356 if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
357 atomic_dec(&object->n_reads);
358
314 /* now... we may get called with the object spinlock held, so we 359 /* now... we may get called with the object spinlock held, so we
315 * complete the cleanup here only if we can immediately acquire the 360 * complete the cleanup here only if we can immediately acquire the
316 * lock, and defer it otherwise */ 361 * lock, and defer it otherwise */
@@ -452,8 +497,27 @@ static void fscache_op_execute(struct slow_work *work)
452 _leave(""); 497 _leave("");
453} 498}
454 499
500/*
501 * describe an operation for slow-work debugging
502 */
503#ifdef CONFIG_SLOW_WORK_PROC
504static void fscache_op_desc(struct slow_work *work, struct seq_file *m)
505{
506 struct fscache_operation *op =
507 container_of(work, struct fscache_operation, slow_work);
508
509 seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx",
510 op->object->debug_id, op->debug_id,
511 op->name, op->state, op->flags);
512}
513#endif
514
455const struct slow_work_ops fscache_op_slow_work_ops = { 515const struct slow_work_ops fscache_op_slow_work_ops = {
516 .owner = THIS_MODULE,
456 .get_ref = fscache_op_get_ref, 517 .get_ref = fscache_op_get_ref,
457 .put_ref = fscache_op_put_ref, 518 .put_ref = fscache_op_put_ref,
458 .execute = fscache_op_execute, 519 .execute = fscache_op_execute,
520#ifdef CONFIG_SLOW_WORK_PROC
521 .desc = fscache_op_desc,
522#endif
459}; 523};
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 2568e0eb644..c598ea4c4e7 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -43,18 +43,102 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
43EXPORT_SYMBOL(__fscache_wait_on_page_write); 43EXPORT_SYMBOL(__fscache_wait_on_page_write);
44 44
45/* 45/*
46 * note that a page has finished being written to the cache 46 * decide whether a page can be released, possibly by cancelling a store to it
47 * - we're allowed to sleep if __GFP_WAIT is flagged
47 */ 48 */
48static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *page) 49bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
50 struct page *page,
51 gfp_t gfp)
49{ 52{
50 struct page *xpage; 53 struct page *xpage;
54 void *val;
55
56 _enter("%p,%p,%x", cookie, page, gfp);
57
58 rcu_read_lock();
59 val = radix_tree_lookup(&cookie->stores, page->index);
60 if (!val) {
61 rcu_read_unlock();
62 fscache_stat(&fscache_n_store_vmscan_not_storing);
63 __fscache_uncache_page(cookie, page);
64 return true;
65 }
66
67 /* see if the page is actually undergoing storage - if so we can't get
68 * rid of it till the cache has finished with it */
69 if (radix_tree_tag_get(&cookie->stores, page->index,
70 FSCACHE_COOKIE_STORING_TAG)) {
71 rcu_read_unlock();
72 goto page_busy;
73 }
74
75 /* the page is pending storage, so we attempt to cancel the store and
76 * discard the store request so that the page can be reclaimed */
77 spin_lock(&cookie->stores_lock);
78 rcu_read_unlock();
79
80 if (radix_tree_tag_get(&cookie->stores, page->index,
81 FSCACHE_COOKIE_STORING_TAG)) {
82 /* the page started to undergo storage whilst we were looking,
83 * so now we can only wait or return */
84 spin_unlock(&cookie->stores_lock);
85 goto page_busy;
86 }
51 87
52 spin_lock(&cookie->lock);
53 xpage = radix_tree_delete(&cookie->stores, page->index); 88 xpage = radix_tree_delete(&cookie->stores, page->index);
54 spin_unlock(&cookie->lock); 89 spin_unlock(&cookie->stores_lock);
55 ASSERT(xpage != NULL); 90
91 if (xpage) {
92 fscache_stat(&fscache_n_store_vmscan_cancelled);
93 fscache_stat(&fscache_n_store_radix_deletes);
94 ASSERTCMP(xpage, ==, page);
95 } else {
96 fscache_stat(&fscache_n_store_vmscan_gone);
97 }
56 98
57 wake_up_bit(&cookie->flags, 0); 99 wake_up_bit(&cookie->flags, 0);
100 if (xpage)
101 page_cache_release(xpage);
102 __fscache_uncache_page(cookie, page);
103 return true;
104
105page_busy:
106 /* we might want to wait here, but that could deadlock the allocator as
107 * the slow-work threads writing to the cache may all end up sleeping
108 * on memory allocation */
109 fscache_stat(&fscache_n_store_vmscan_busy);
110 return false;
111}
112EXPORT_SYMBOL(__fscache_maybe_release_page);
113
114/*
115 * note that a page has finished being written to the cache
116 */
117static void fscache_end_page_write(struct fscache_object *object,
118 struct page *page)
119{
120 struct fscache_cookie *cookie;
121 struct page *xpage = NULL;
122
123 spin_lock(&object->lock);
124 cookie = object->cookie;
125 if (cookie) {
126 /* delete the page from the tree if it is now no longer
127 * pending */
128 spin_lock(&cookie->stores_lock);
129 radix_tree_tag_clear(&cookie->stores, page->index,
130 FSCACHE_COOKIE_STORING_TAG);
131 if (!radix_tree_tag_get(&cookie->stores, page->index,
132 FSCACHE_COOKIE_PENDING_TAG)) {
133 fscache_stat(&fscache_n_store_radix_deletes);
134 xpage = radix_tree_delete(&cookie->stores, page->index);
135 }
136 spin_unlock(&cookie->stores_lock);
137 wake_up_bit(&cookie->flags, 0);
138 }
139 spin_unlock(&object->lock);
140 if (xpage)
141 page_cache_release(xpage);
58} 142}
59 143
60/* 144/*
@@ -63,14 +147,21 @@ static void fscache_end_page_write(struct fscache_cookie *cookie, struct page *p
63static void fscache_attr_changed_op(struct fscache_operation *op) 147static void fscache_attr_changed_op(struct fscache_operation *op)
64{ 148{
65 struct fscache_object *object = op->object; 149 struct fscache_object *object = op->object;
150 int ret;
66 151
67 _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id); 152 _enter("{OBJ%x OP%x}", object->debug_id, op->debug_id);
68 153
69 fscache_stat(&fscache_n_attr_changed_calls); 154 fscache_stat(&fscache_n_attr_changed_calls);
70 155
71 if (fscache_object_is_active(object) && 156 if (fscache_object_is_active(object)) {
72 object->cache->ops->attr_changed(object) < 0) 157 fscache_set_op_state(op, "CallFS");
73 fscache_abort_object(object); 158 fscache_stat(&fscache_n_cop_attr_changed);
159 ret = object->cache->ops->attr_changed(object);
160 fscache_stat_d(&fscache_n_cop_attr_changed);
161 fscache_set_op_state(op, "Done");
162 if (ret < 0)
163 fscache_abort_object(object);
164 }
74 165
75 _leave(""); 166 _leave("");
76} 167}
@@ -99,6 +190,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
99 fscache_operation_init(op, NULL); 190 fscache_operation_init(op, NULL);
100 fscache_operation_init_slow(op, fscache_attr_changed_op); 191 fscache_operation_init_slow(op, fscache_attr_changed_op);
101 op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE); 192 op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE);
193 fscache_set_op_name(op, "Attr");
102 194
103 spin_lock(&cookie->lock); 195 spin_lock(&cookie->lock);
104 196
@@ -184,6 +276,7 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
184 op->start_time = jiffies; 276 op->start_time = jiffies;
185 INIT_WORK(&op->op.fast_work, fscache_retrieval_work); 277 INIT_WORK(&op->op.fast_work, fscache_retrieval_work);
186 INIT_LIST_HEAD(&op->to_do); 278 INIT_LIST_HEAD(&op->to_do);
279 fscache_set_op_name(&op->op, "Retr");
187 return op; 280 return op;
188} 281}
189 282
@@ -221,6 +314,43 @@ static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
221} 314}
222 315
223/* 316/*
317 * wait for an object to become active (or dead)
318 */
319static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
320 struct fscache_retrieval *op,
321 atomic_t *stat_op_waits,
322 atomic_t *stat_object_dead)
323{
324 int ret;
325
326 if (!test_bit(FSCACHE_OP_WAITING, &op->op.flags))
327 goto check_if_dead;
328
329 _debug(">>> WT");
330 fscache_stat(stat_op_waits);
331 if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
332 fscache_wait_bit_interruptible,
333 TASK_INTERRUPTIBLE) < 0) {
334 ret = fscache_cancel_op(&op->op);
335 if (ret == 0)
336 return -ERESTARTSYS;
337
338 /* it's been removed from the pending queue by another party,
339 * so we should get to run shortly */
340 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
341 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
342 }
343 _debug("<<< GO");
344
345check_if_dead:
346 if (unlikely(fscache_object_is_dead(object))) {
347 fscache_stat(stat_object_dead);
348 return -ENOBUFS;
349 }
350 return 0;
351}
352
353/*
224 * read a page from the cache or allocate a block in which to store it 354 * read a page from the cache or allocate a block in which to store it
225 * - we return: 355 * - we return:
226 * -ENOMEM - out of memory, nothing done 356 * -ENOMEM - out of memory, nothing done
@@ -257,6 +387,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
257 _leave(" = -ENOMEM"); 387 _leave(" = -ENOMEM");
258 return -ENOMEM; 388 return -ENOMEM;
259 } 389 }
390 fscache_set_op_name(&op->op, "RetrRA1");
260 391
261 spin_lock(&cookie->lock); 392 spin_lock(&cookie->lock);
262 393
@@ -267,6 +398,9 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
267 398
268 ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); 399 ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
269 400
401 atomic_inc(&object->n_reads);
402 set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
403
270 if (fscache_submit_op(object, &op->op) < 0) 404 if (fscache_submit_op(object, &op->op) < 0)
271 goto nobufs_unlock; 405 goto nobufs_unlock;
272 spin_unlock(&cookie->lock); 406 spin_unlock(&cookie->lock);
@@ -279,23 +413,27 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
279 413
280 /* we wait for the operation to become active, and then process it 414 /* we wait for the operation to become active, and then process it
281 * *here*, in this thread, and not in the thread pool */ 415 * *here*, in this thread, and not in the thread pool */
282 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { 416 ret = fscache_wait_for_retrieval_activation(
283 _debug(">>> WT"); 417 object, op,
284 fscache_stat(&fscache_n_retrieval_op_waits); 418 __fscache_stat(&fscache_n_retrieval_op_waits),
285 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, 419 __fscache_stat(&fscache_n_retrievals_object_dead));
286 fscache_wait_bit, TASK_UNINTERRUPTIBLE); 420 if (ret < 0)
287 _debug("<<< GO"); 421 goto error;
288 }
289 422
290 /* ask the cache to honour the operation */ 423 /* ask the cache to honour the operation */
291 if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) { 424 if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
425 fscache_stat(&fscache_n_cop_allocate_page);
292 ret = object->cache->ops->allocate_page(op, page, gfp); 426 ret = object->cache->ops->allocate_page(op, page, gfp);
427 fscache_stat_d(&fscache_n_cop_allocate_page);
293 if (ret == 0) 428 if (ret == 0)
294 ret = -ENODATA; 429 ret = -ENODATA;
295 } else { 430 } else {
431 fscache_stat(&fscache_n_cop_read_or_alloc_page);
296 ret = object->cache->ops->read_or_alloc_page(op, page, gfp); 432 ret = object->cache->ops->read_or_alloc_page(op, page, gfp);
433 fscache_stat_d(&fscache_n_cop_read_or_alloc_page);
297 } 434 }
298 435
436error:
299 if (ret == -ENOMEM) 437 if (ret == -ENOMEM)
300 fscache_stat(&fscache_n_retrievals_nomem); 438 fscache_stat(&fscache_n_retrievals_nomem);
301 else if (ret == -ERESTARTSYS) 439 else if (ret == -ERESTARTSYS)
@@ -347,7 +485,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
347 void *context, 485 void *context,
348 gfp_t gfp) 486 gfp_t gfp)
349{ 487{
350 fscache_pages_retrieval_func_t func;
351 struct fscache_retrieval *op; 488 struct fscache_retrieval *op;
352 struct fscache_object *object; 489 struct fscache_object *object;
353 int ret; 490 int ret;
@@ -369,6 +506,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
369 op = fscache_alloc_retrieval(mapping, end_io_func, context); 506 op = fscache_alloc_retrieval(mapping, end_io_func, context);
370 if (!op) 507 if (!op)
371 return -ENOMEM; 508 return -ENOMEM;
509 fscache_set_op_name(&op->op, "RetrRAN");
372 510
373 spin_lock(&cookie->lock); 511 spin_lock(&cookie->lock);
374 512
@@ -377,6 +515,9 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
377 object = hlist_entry(cookie->backing_objects.first, 515 object = hlist_entry(cookie->backing_objects.first,
378 struct fscache_object, cookie_link); 516 struct fscache_object, cookie_link);
379 517
518 atomic_inc(&object->n_reads);
519 set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
520
380 if (fscache_submit_op(object, &op->op) < 0) 521 if (fscache_submit_op(object, &op->op) < 0)
381 goto nobufs_unlock; 522 goto nobufs_unlock;
382 spin_unlock(&cookie->lock); 523 spin_unlock(&cookie->lock);
@@ -389,21 +530,27 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
389 530
390 /* we wait for the operation to become active, and then process it 531 /* we wait for the operation to become active, and then process it
391 * *here*, in this thread, and not in the thread pool */ 532 * *here*, in this thread, and not in the thread pool */
392 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { 533 ret = fscache_wait_for_retrieval_activation(
393 _debug(">>> WT"); 534 object, op,
394 fscache_stat(&fscache_n_retrieval_op_waits); 535 __fscache_stat(&fscache_n_retrieval_op_waits),
395 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, 536 __fscache_stat(&fscache_n_retrievals_object_dead));
396 fscache_wait_bit, TASK_UNINTERRUPTIBLE); 537 if (ret < 0)
397 _debug("<<< GO"); 538 goto error;
398 }
399 539
400 /* ask the cache to honour the operation */ 540 /* ask the cache to honour the operation */
401 if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) 541 if (test_bit(FSCACHE_COOKIE_NO_DATA_YET, &object->cookie->flags)) {
402 func = object->cache->ops->allocate_pages; 542 fscache_stat(&fscache_n_cop_allocate_pages);
403 else 543 ret = object->cache->ops->allocate_pages(
404 func = object->cache->ops->read_or_alloc_pages; 544 op, pages, nr_pages, gfp);
405 ret = func(op, pages, nr_pages, gfp); 545 fscache_stat_d(&fscache_n_cop_allocate_pages);
546 } else {
547 fscache_stat(&fscache_n_cop_read_or_alloc_pages);
548 ret = object->cache->ops->read_or_alloc_pages(
549 op, pages, nr_pages, gfp);
550 fscache_stat_d(&fscache_n_cop_read_or_alloc_pages);
551 }
406 552
553error:
407 if (ret == -ENOMEM) 554 if (ret == -ENOMEM)
408 fscache_stat(&fscache_n_retrievals_nomem); 555 fscache_stat(&fscache_n_retrievals_nomem);
409 else if (ret == -ERESTARTSYS) 556 else if (ret == -ERESTARTSYS)
@@ -461,6 +608,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
461 op = fscache_alloc_retrieval(page->mapping, NULL, NULL); 608 op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
462 if (!op) 609 if (!op)
463 return -ENOMEM; 610 return -ENOMEM;
611 fscache_set_op_name(&op->op, "RetrAL1");
464 612
465 spin_lock(&cookie->lock); 613 spin_lock(&cookie->lock);
466 614
@@ -475,18 +623,22 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
475 623
476 fscache_stat(&fscache_n_alloc_ops); 624 fscache_stat(&fscache_n_alloc_ops);
477 625
478 if (test_bit(FSCACHE_OP_WAITING, &op->op.flags)) { 626 ret = fscache_wait_for_retrieval_activation(
479 _debug(">>> WT"); 627 object, op,
480 fscache_stat(&fscache_n_alloc_op_waits); 628 __fscache_stat(&fscache_n_alloc_op_waits),
481 wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, 629 __fscache_stat(&fscache_n_allocs_object_dead));
482 fscache_wait_bit, TASK_UNINTERRUPTIBLE); 630 if (ret < 0)
483 _debug("<<< GO"); 631 goto error;
484 }
485 632
486 /* ask the cache to honour the operation */ 633 /* ask the cache to honour the operation */
634 fscache_stat(&fscache_n_cop_allocate_page);
487 ret = object->cache->ops->allocate_page(op, page, gfp); 635 ret = object->cache->ops->allocate_page(op, page, gfp);
636 fscache_stat_d(&fscache_n_cop_allocate_page);
488 637
489 if (ret < 0) 638error:
639 if (ret == -ERESTARTSYS)
640 fscache_stat(&fscache_n_allocs_intr);
641 else if (ret < 0)
490 fscache_stat(&fscache_n_allocs_nobufs); 642 fscache_stat(&fscache_n_allocs_nobufs);
491 else 643 else
492 fscache_stat(&fscache_n_allocs_ok); 644 fscache_stat(&fscache_n_allocs_ok);
@@ -521,7 +673,7 @@ static void fscache_write_op(struct fscache_operation *_op)
521 struct fscache_storage *op = 673 struct fscache_storage *op =
522 container_of(_op, struct fscache_storage, op); 674 container_of(_op, struct fscache_storage, op);
523 struct fscache_object *object = op->op.object; 675 struct fscache_object *object = op->op.object;
524 struct fscache_cookie *cookie = object->cookie; 676 struct fscache_cookie *cookie;
525 struct page *page; 677 struct page *page;
526 unsigned n; 678 unsigned n;
527 void *results[1]; 679 void *results[1];
@@ -529,16 +681,19 @@ static void fscache_write_op(struct fscache_operation *_op)
529 681
530 _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); 682 _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
531 683
532 spin_lock(&cookie->lock); 684 fscache_set_op_state(&op->op, "GetPage");
685
533 spin_lock(&object->lock); 686 spin_lock(&object->lock);
687 cookie = object->cookie;
534 688
535 if (!fscache_object_is_active(object)) { 689 if (!fscache_object_is_active(object) || !cookie) {
536 spin_unlock(&object->lock); 690 spin_unlock(&object->lock);
537 spin_unlock(&cookie->lock);
538 _leave(""); 691 _leave("");
539 return; 692 return;
540 } 693 }
541 694
695 spin_lock(&cookie->stores_lock);
696
542 fscache_stat(&fscache_n_store_calls); 697 fscache_stat(&fscache_n_store_calls);
543 698
544 /* find a page to store */ 699 /* find a page to store */
@@ -549,23 +704,35 @@ static void fscache_write_op(struct fscache_operation *_op)
549 goto superseded; 704 goto superseded;
550 page = results[0]; 705 page = results[0];
551 _debug("gang %d [%lx]", n, page->index); 706 _debug("gang %d [%lx]", n, page->index);
552 if (page->index > op->store_limit) 707 if (page->index > op->store_limit) {
708 fscache_stat(&fscache_n_store_pages_over_limit);
553 goto superseded; 709 goto superseded;
710 }
554 711
555 radix_tree_tag_clear(&cookie->stores, page->index, 712 if (page) {
556 FSCACHE_COOKIE_PENDING_TAG); 713 radix_tree_tag_set(&cookie->stores, page->index,
714 FSCACHE_COOKIE_STORING_TAG);
715 radix_tree_tag_clear(&cookie->stores, page->index,
716 FSCACHE_COOKIE_PENDING_TAG);
717 }
557 718
719 spin_unlock(&cookie->stores_lock);
558 spin_unlock(&object->lock); 720 spin_unlock(&object->lock);
559 spin_unlock(&cookie->lock);
560 721
561 if (page) { 722 if (page) {
723 fscache_set_op_state(&op->op, "Store");
724 fscache_stat(&fscache_n_store_pages);
725 fscache_stat(&fscache_n_cop_write_page);
562 ret = object->cache->ops->write_page(op, page); 726 ret = object->cache->ops->write_page(op, page);
563 fscache_end_page_write(cookie, page); 727 fscache_stat_d(&fscache_n_cop_write_page);
564 page_cache_release(page); 728 fscache_set_op_state(&op->op, "EndWrite");
565 if (ret < 0) 729 fscache_end_page_write(object, page);
730 if (ret < 0) {
731 fscache_set_op_state(&op->op, "Abort");
566 fscache_abort_object(object); 732 fscache_abort_object(object);
567 else 733 } else {
568 fscache_enqueue_operation(&op->op); 734 fscache_enqueue_operation(&op->op);
735 }
569 } 736 }
570 737
571 _leave(""); 738 _leave("");
@@ -575,9 +742,9 @@ superseded:
575 /* this writer is going away and there aren't any more things to 742 /* this writer is going away and there aren't any more things to
576 * write */ 743 * write */
577 _debug("cease"); 744 _debug("cease");
745 spin_unlock(&cookie->stores_lock);
578 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); 746 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
579 spin_unlock(&object->lock); 747 spin_unlock(&object->lock);
580 spin_unlock(&cookie->lock);
581 _leave(""); 748 _leave("");
582} 749}
583 750
@@ -634,6 +801,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
634 fscache_operation_init(&op->op, fscache_release_write_op); 801 fscache_operation_init(&op->op, fscache_release_write_op);
635 fscache_operation_init_slow(&op->op, fscache_write_op); 802 fscache_operation_init_slow(&op->op, fscache_write_op);
636 op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING); 803 op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING);
804 fscache_set_op_name(&op->op, "Write1");
637 805
638 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 806 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
639 if (ret < 0) 807 if (ret < 0)
@@ -652,6 +820,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
652 /* add the page to the pending-storage radix tree on the backing 820 /* add the page to the pending-storage radix tree on the backing
653 * object */ 821 * object */
654 spin_lock(&object->lock); 822 spin_lock(&object->lock);
823 spin_lock(&cookie->stores_lock);
655 824
656 _debug("store limit %llx", (unsigned long long) object->store_limit); 825 _debug("store limit %llx", (unsigned long long) object->store_limit);
657 826
@@ -672,6 +841,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
672 if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags)) 841 if (test_and_set_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags))
673 goto already_pending; 842 goto already_pending;
674 843
844 spin_unlock(&cookie->stores_lock);
675 spin_unlock(&object->lock); 845 spin_unlock(&object->lock);
676 846
677 op->op.debug_id = atomic_inc_return(&fscache_op_debug_id); 847 op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
@@ -693,6 +863,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
693already_queued: 863already_queued:
694 fscache_stat(&fscache_n_stores_again); 864 fscache_stat(&fscache_n_stores_again);
695already_pending: 865already_pending:
866 spin_unlock(&cookie->stores_lock);
696 spin_unlock(&object->lock); 867 spin_unlock(&object->lock);
697 spin_unlock(&cookie->lock); 868 spin_unlock(&cookie->lock);
698 radix_tree_preload_end(); 869 radix_tree_preload_end();
@@ -702,7 +873,9 @@ already_pending:
702 return 0; 873 return 0;
703 874
704submit_failed: 875submit_failed:
876 spin_lock(&cookie->stores_lock);
705 radix_tree_delete(&cookie->stores, page->index); 877 radix_tree_delete(&cookie->stores, page->index);
878 spin_unlock(&cookie->stores_lock);
706 page_cache_release(page); 879 page_cache_release(page);
707 ret = -ENOBUFS; 880 ret = -ENOBUFS;
708 goto nobufs; 881 goto nobufs;
@@ -763,7 +936,9 @@ void __fscache_uncache_page(struct fscache_cookie *cookie, struct page *page)
763 if (TestClearPageFsCache(page) && 936 if (TestClearPageFsCache(page) &&
764 object->cache->ops->uncache_page) { 937 object->cache->ops->uncache_page) {
765 /* the cache backend releases the cookie lock */ 938 /* the cache backend releases the cookie lock */
939 fscache_stat(&fscache_n_cop_uncache_page);
766 object->cache->ops->uncache_page(object, page); 940 object->cache->ops->uncache_page(object, page);
941 fscache_stat_d(&fscache_n_cop_uncache_page);
767 goto done; 942 goto done;
768 } 943 }
769 944
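The __fscache_maybe_release_page() helper added above is exported so a network filesystem can ask, from its ->releasepage() address-space operation, whether a page pinned for caching may be reclaimed: a store that is merely pending is cancelled and the page freed, while a store already in progress makes the helper return false. A rough sketch of a caller, not taken from this diff (MY_FS_COOKIE() stands in for however the filesystem reaches its FS-Cache cookie, and real code would normally go through a wrapper in the FS-Cache headers):

	static int my_fs_release_page(struct page *page, gfp_t gfp)
	{
		struct fscache_cookie *cookie = MY_FS_COOKIE(page->mapping->host);

		if (cookie && PageFsCache(page) &&
		    !__fscache_maybe_release_page(cookie, page, gfp))
			return 0;	/* still pinned by an active store */

		return 1;		/* the VM may release the page */
	}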
diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c
index beeab44bc31..1d9e4951a59 100644
--- a/fs/fscache/proc.c
+++ b/fs/fscache/proc.c
@@ -37,10 +37,20 @@ int __init fscache_proc_init(void)
37 goto error_histogram; 37 goto error_histogram;
38#endif 38#endif
39 39
40#ifdef CONFIG_FSCACHE_OBJECT_LIST
41 if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL,
42 &fscache_objlist_fops))
43 goto error_objects;
44#endif
45
40 _leave(" = 0"); 46 _leave(" = 0");
41 return 0; 47 return 0;
42 48
49#ifdef CONFIG_FSCACHE_OBJECT_LIST
50error_objects:
51#endif
43#ifdef CONFIG_FSCACHE_HISTOGRAM 52#ifdef CONFIG_FSCACHE_HISTOGRAM
53 remove_proc_entry("fs/fscache/histogram", NULL);
44error_histogram: 54error_histogram:
45#endif 55#endif
46#ifdef CONFIG_FSCACHE_STATS 56#ifdef CONFIG_FSCACHE_STATS
@@ -58,6 +68,9 @@ error_dir:
58 */ 68 */
59void fscache_proc_cleanup(void) 69void fscache_proc_cleanup(void)
60{ 70{
71#ifdef CONFIG_FSCACHE_OBJECT_LIST
72 remove_proc_entry("fs/fscache/objects", NULL);
73#endif
61#ifdef CONFIG_FSCACHE_HISTOGRAM 74#ifdef CONFIG_FSCACHE_HISTOGRAM
62 remove_proc_entry("fs/fscache/histogram", NULL); 75 remove_proc_entry("fs/fscache/histogram", NULL);
63#endif 76#endif
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 65deb99e756..46435f3aae6 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -25,6 +25,8 @@ atomic_t fscache_n_op_requeue;
25atomic_t fscache_n_op_deferred_release; 25atomic_t fscache_n_op_deferred_release;
26atomic_t fscache_n_op_release; 26atomic_t fscache_n_op_release;
27atomic_t fscache_n_op_gc; 27atomic_t fscache_n_op_gc;
28atomic_t fscache_n_op_cancelled;
29atomic_t fscache_n_op_rejected;
28 30
29atomic_t fscache_n_attr_changed; 31atomic_t fscache_n_attr_changed;
30atomic_t fscache_n_attr_changed_ok; 32atomic_t fscache_n_attr_changed_ok;
@@ -36,6 +38,8 @@ atomic_t fscache_n_allocs;
36atomic_t fscache_n_allocs_ok; 38atomic_t fscache_n_allocs_ok;
37atomic_t fscache_n_allocs_wait; 39atomic_t fscache_n_allocs_wait;
38atomic_t fscache_n_allocs_nobufs; 40atomic_t fscache_n_allocs_nobufs;
41atomic_t fscache_n_allocs_intr;
42atomic_t fscache_n_allocs_object_dead;
39atomic_t fscache_n_alloc_ops; 43atomic_t fscache_n_alloc_ops;
40atomic_t fscache_n_alloc_op_waits; 44atomic_t fscache_n_alloc_op_waits;
41 45
@@ -46,6 +50,7 @@ atomic_t fscache_n_retrievals_nodata;
46atomic_t fscache_n_retrievals_nobufs; 50atomic_t fscache_n_retrievals_nobufs;
47atomic_t fscache_n_retrievals_intr; 51atomic_t fscache_n_retrievals_intr;
48atomic_t fscache_n_retrievals_nomem; 52atomic_t fscache_n_retrievals_nomem;
53atomic_t fscache_n_retrievals_object_dead;
49atomic_t fscache_n_retrieval_ops; 54atomic_t fscache_n_retrieval_ops;
50atomic_t fscache_n_retrieval_op_waits; 55atomic_t fscache_n_retrieval_op_waits;
51 56
@@ -56,6 +61,14 @@ atomic_t fscache_n_stores_nobufs;
56atomic_t fscache_n_stores_oom; 61atomic_t fscache_n_stores_oom;
57atomic_t fscache_n_store_ops; 62atomic_t fscache_n_store_ops;
58atomic_t fscache_n_store_calls; 63atomic_t fscache_n_store_calls;
64atomic_t fscache_n_store_pages;
65atomic_t fscache_n_store_radix_deletes;
66atomic_t fscache_n_store_pages_over_limit;
67
68atomic_t fscache_n_store_vmscan_not_storing;
69atomic_t fscache_n_store_vmscan_gone;
70atomic_t fscache_n_store_vmscan_busy;
71atomic_t fscache_n_store_vmscan_cancelled;
59 72
60atomic_t fscache_n_marks; 73atomic_t fscache_n_marks;
61atomic_t fscache_n_uncaches; 74atomic_t fscache_n_uncaches;
@@ -74,6 +87,7 @@ atomic_t fscache_n_updates_run;
74atomic_t fscache_n_relinquishes; 87atomic_t fscache_n_relinquishes;
75atomic_t fscache_n_relinquishes_null; 88atomic_t fscache_n_relinquishes_null;
76atomic_t fscache_n_relinquishes_waitcrt; 89atomic_t fscache_n_relinquishes_waitcrt;
90atomic_t fscache_n_relinquishes_retire;
77 91
78atomic_t fscache_n_cookie_index; 92atomic_t fscache_n_cookie_index;
79atomic_t fscache_n_cookie_data; 93atomic_t fscache_n_cookie_data;
@@ -84,6 +98,7 @@ atomic_t fscache_n_object_no_alloc;
84atomic_t fscache_n_object_lookups; 98atomic_t fscache_n_object_lookups;
85atomic_t fscache_n_object_lookups_negative; 99atomic_t fscache_n_object_lookups_negative;
86atomic_t fscache_n_object_lookups_positive; 100atomic_t fscache_n_object_lookups_positive;
101atomic_t fscache_n_object_lookups_timed_out;
87atomic_t fscache_n_object_created; 102atomic_t fscache_n_object_created;
88atomic_t fscache_n_object_avail; 103atomic_t fscache_n_object_avail;
89atomic_t fscache_n_object_dead; 104atomic_t fscache_n_object_dead;
@@ -93,6 +108,23 @@ atomic_t fscache_n_checkaux_okay;
93atomic_t fscache_n_checkaux_update; 108atomic_t fscache_n_checkaux_update;
94atomic_t fscache_n_checkaux_obsolete; 109atomic_t fscache_n_checkaux_obsolete;
95 110
111atomic_t fscache_n_cop_alloc_object;
112atomic_t fscache_n_cop_lookup_object;
113atomic_t fscache_n_cop_lookup_complete;
114atomic_t fscache_n_cop_grab_object;
115atomic_t fscache_n_cop_update_object;
116atomic_t fscache_n_cop_drop_object;
117atomic_t fscache_n_cop_put_object;
118atomic_t fscache_n_cop_sync_cache;
119atomic_t fscache_n_cop_attr_changed;
120atomic_t fscache_n_cop_read_or_alloc_page;
121atomic_t fscache_n_cop_read_or_alloc_pages;
122atomic_t fscache_n_cop_allocate_page;
123atomic_t fscache_n_cop_allocate_pages;
124atomic_t fscache_n_cop_write_page;
125atomic_t fscache_n_cop_uncache_page;
126atomic_t fscache_n_cop_dissociate_pages;
127
96/* 128/*
97 * display the general statistics 129 * display the general statistics
98 */ 130 */
@@ -129,10 +161,11 @@ static int fscache_stats_show(struct seq_file *m, void *v)
129 atomic_read(&fscache_n_acquires_nobufs), 161 atomic_read(&fscache_n_acquires_nobufs),
130 atomic_read(&fscache_n_acquires_oom)); 162 atomic_read(&fscache_n_acquires_oom));
131 163
132 seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u\n", 164 seq_printf(m, "Lookups: n=%u neg=%u pos=%u crt=%u tmo=%u\n",
133 atomic_read(&fscache_n_object_lookups), 165 atomic_read(&fscache_n_object_lookups),
134 atomic_read(&fscache_n_object_lookups_negative), 166 atomic_read(&fscache_n_object_lookups_negative),
135 atomic_read(&fscache_n_object_lookups_positive), 167 atomic_read(&fscache_n_object_lookups_positive),
168 atomic_read(&fscache_n_object_lookups_timed_out),
136 atomic_read(&fscache_n_object_created)); 169 atomic_read(&fscache_n_object_created));
137 170
138 seq_printf(m, "Updates: n=%u nul=%u run=%u\n", 171 seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
@@ -140,10 +173,11 @@ static int fscache_stats_show(struct seq_file *m, void *v)
140 atomic_read(&fscache_n_updates_null), 173 atomic_read(&fscache_n_updates_null),
141 atomic_read(&fscache_n_updates_run)); 174 atomic_read(&fscache_n_updates_run));
142 175
143 seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u\n", 176 seq_printf(m, "Relinqs: n=%u nul=%u wcr=%u rtr=%u\n",
144 atomic_read(&fscache_n_relinquishes), 177 atomic_read(&fscache_n_relinquishes),
145 atomic_read(&fscache_n_relinquishes_null), 178 atomic_read(&fscache_n_relinquishes_null),
146 atomic_read(&fscache_n_relinquishes_waitcrt)); 179 atomic_read(&fscache_n_relinquishes_waitcrt),
180 atomic_read(&fscache_n_relinquishes_retire));
147 181
148 seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n", 182 seq_printf(m, "AttrChg: n=%u ok=%u nbf=%u oom=%u run=%u\n",
149 atomic_read(&fscache_n_attr_changed), 183 atomic_read(&fscache_n_attr_changed),
@@ -152,14 +186,16 @@ static int fscache_stats_show(struct seq_file *m, void *v)
152 atomic_read(&fscache_n_attr_changed_nomem), 186 atomic_read(&fscache_n_attr_changed_nomem),
153 atomic_read(&fscache_n_attr_changed_calls)); 187 atomic_read(&fscache_n_attr_changed_calls));
154 188
155 seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u\n", 189 seq_printf(m, "Allocs : n=%u ok=%u wt=%u nbf=%u int=%u\n",
156 atomic_read(&fscache_n_allocs), 190 atomic_read(&fscache_n_allocs),
157 atomic_read(&fscache_n_allocs_ok), 191 atomic_read(&fscache_n_allocs_ok),
158 atomic_read(&fscache_n_allocs_wait), 192 atomic_read(&fscache_n_allocs_wait),
159 atomic_read(&fscache_n_allocs_nobufs)); 193 atomic_read(&fscache_n_allocs_nobufs),
160 seq_printf(m, "Allocs : ops=%u owt=%u\n", 194 atomic_read(&fscache_n_allocs_intr));
195 seq_printf(m, "Allocs : ops=%u owt=%u abt=%u\n",
161 atomic_read(&fscache_n_alloc_ops), 196 atomic_read(&fscache_n_alloc_ops),
162 atomic_read(&fscache_n_alloc_op_waits)); 197 atomic_read(&fscache_n_alloc_op_waits),
198 atomic_read(&fscache_n_allocs_object_dead));
163 199
164 seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u" 200 seq_printf(m, "Retrvls: n=%u ok=%u wt=%u nod=%u nbf=%u"
165 " int=%u oom=%u\n", 201 " int=%u oom=%u\n",
@@ -170,9 +206,10 @@ static int fscache_stats_show(struct seq_file *m, void *v)
170 atomic_read(&fscache_n_retrievals_nobufs), 206 atomic_read(&fscache_n_retrievals_nobufs),
171 atomic_read(&fscache_n_retrievals_intr), 207 atomic_read(&fscache_n_retrievals_intr),
172 atomic_read(&fscache_n_retrievals_nomem)); 208 atomic_read(&fscache_n_retrievals_nomem));
173 seq_printf(m, "Retrvls: ops=%u owt=%u\n", 209 seq_printf(m, "Retrvls: ops=%u owt=%u abt=%u\n",
174 atomic_read(&fscache_n_retrieval_ops), 210 atomic_read(&fscache_n_retrieval_ops),
175 atomic_read(&fscache_n_retrieval_op_waits)); 211 atomic_read(&fscache_n_retrieval_op_waits),
212 atomic_read(&fscache_n_retrievals_object_dead));
176 213
177 seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n", 214 seq_printf(m, "Stores : n=%u ok=%u agn=%u nbf=%u oom=%u\n",
178 atomic_read(&fscache_n_stores), 215 atomic_read(&fscache_n_stores),
@@ -180,18 +217,49 @@ static int fscache_stats_show(struct seq_file *m, void *v)
180 atomic_read(&fscache_n_stores_again), 217 atomic_read(&fscache_n_stores_again),
181 atomic_read(&fscache_n_stores_nobufs), 218 atomic_read(&fscache_n_stores_nobufs),
182 atomic_read(&fscache_n_stores_oom)); 219 atomic_read(&fscache_n_stores_oom));
183 seq_printf(m, "Stores : ops=%u run=%u\n", 220 seq_printf(m, "Stores : ops=%u run=%u pgs=%u rxd=%u olm=%u\n",
184 atomic_read(&fscache_n_store_ops), 221 atomic_read(&fscache_n_store_ops),
185 atomic_read(&fscache_n_store_calls)); 222 atomic_read(&fscache_n_store_calls),
223 atomic_read(&fscache_n_store_pages),
224 atomic_read(&fscache_n_store_radix_deletes),
225 atomic_read(&fscache_n_store_pages_over_limit));
186 226
187 seq_printf(m, "Ops : pend=%u run=%u enq=%u\n", 227 seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n",
228 atomic_read(&fscache_n_store_vmscan_not_storing),
229 atomic_read(&fscache_n_store_vmscan_gone),
230 atomic_read(&fscache_n_store_vmscan_busy),
231 atomic_read(&fscache_n_store_vmscan_cancelled));
232
233 seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n",
188 atomic_read(&fscache_n_op_pend), 234 atomic_read(&fscache_n_op_pend),
189 atomic_read(&fscache_n_op_run), 235 atomic_read(&fscache_n_op_run),
190 atomic_read(&fscache_n_op_enqueue)); 236 atomic_read(&fscache_n_op_enqueue),
237 atomic_read(&fscache_n_op_cancelled),
238 atomic_read(&fscache_n_op_rejected));
191 seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", 239 seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n",
192 atomic_read(&fscache_n_op_deferred_release), 240 atomic_read(&fscache_n_op_deferred_release),
193 atomic_read(&fscache_n_op_release), 241 atomic_read(&fscache_n_op_release),
194 atomic_read(&fscache_n_op_gc)); 242 atomic_read(&fscache_n_op_gc));
243
244 seq_printf(m, "CacheOp: alo=%d luo=%d luc=%d gro=%d\n",
245 atomic_read(&fscache_n_cop_alloc_object),
246 atomic_read(&fscache_n_cop_lookup_object),
247 atomic_read(&fscache_n_cop_lookup_complete),
248 atomic_read(&fscache_n_cop_grab_object));
249 seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n",
250 atomic_read(&fscache_n_cop_update_object),
251 atomic_read(&fscache_n_cop_drop_object),
252 atomic_read(&fscache_n_cop_put_object),
253 atomic_read(&fscache_n_cop_attr_changed),
254 atomic_read(&fscache_n_cop_sync_cache));
255 seq_printf(m, "CacheOp: rap=%d ras=%d alp=%d als=%d wrp=%d ucp=%d dsp=%d\n",
256 atomic_read(&fscache_n_cop_read_or_alloc_page),
257 atomic_read(&fscache_n_cop_read_or_alloc_pages),
258 atomic_read(&fscache_n_cop_allocate_page),
259 atomic_read(&fscache_n_cop_allocate_pages),
260 atomic_read(&fscache_n_cop_write_page),
261 atomic_read(&fscache_n_cop_uncache_page),
262 atomic_read(&fscache_n_cop_dissociate_pages));
195 return 0; 263 return 0;
196} 264}
197 265
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 992f6c9410b..4787ae6c5c1 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -385,6 +385,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
385 if (fc->no_create) 385 if (fc->no_create)
386 return -ENOSYS; 386 return -ENOSYS;
387 387
388 if (flags & O_DIRECT)
389 return -EINVAL;
390
388 forget_req = fuse_get_req(fc); 391 forget_req = fuse_get_req(fc);
389 if (IS_ERR(forget_req)) 392 if (IS_ERR(forget_req))
390 return PTR_ERR(forget_req); 393 return PTR_ERR(forget_req);
@@ -712,8 +715,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
712 fuse_invalidate_attr(newdir); 715 fuse_invalidate_attr(newdir);
713 716
714 /* newent will end up negative */ 717 /* newent will end up negative */
715 if (newent->d_inode) 718 if (newent->d_inode) {
719 fuse_invalidate_attr(newent->d_inode);
716 fuse_invalidate_entry_cache(newent); 720 fuse_invalidate_entry_cache(newent);
721 }
717 } else if (err == -EINTR) { 722 } else if (err == -EINTR) {
718 /* If request was interrupted, DEITY only knows if the 723 /* If request was interrupted, DEITY only knows if the
719 rename actually took place. If the invalidation 724 rename actually took place. If the invalidation
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a3492f7d207..c18913a777a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1063,7 +1063,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
1063 break; 1063 break;
1064 } 1064 }
1065 } 1065 }
1066 fuse_put_request(fc, req); 1066 if (!IS_ERR(req))
1067 fuse_put_request(fc, req);
1067 if (res > 0) 1068 if (res > 0)
1068 *ppos = pos; 1069 *ppos = pos;
1069 1070
@@ -1599,7 +1600,7 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1599 kaddr += copy; 1600 kaddr += copy;
1600 } 1601 }
1601 1602
1602 kunmap(map); 1603 kunmap(page);
1603 } 1604 }
1604 1605
1605 return 0; 1606 return 0;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index eacd78a5d08..5b31f7741a8 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -114,7 +114,7 @@ static int __init init_gfs2_fs(void)
114 if (error) 114 if (error)
115 goto fail_unregister; 115 goto fail_unregister;
116 116
117 error = slow_work_register_user(); 117 error = slow_work_register_user(THIS_MODULE);
118 if (error) 118 if (error)
119 goto fail_slow; 119 goto fail_slow;
120 120
@@ -163,7 +163,7 @@ static void __exit exit_gfs2_fs(void)
163 gfs2_unregister_debugfs(); 163 gfs2_unregister_debugfs();
164 unregister_filesystem(&gfs2_fs_type); 164 unregister_filesystem(&gfs2_fs_type);
165 unregister_filesystem(&gfs2meta_fs_type); 165 unregister_filesystem(&gfs2meta_fs_type);
166 slow_work_unregister_user(); 166 slow_work_unregister_user(THIS_MODULE);
167 167
168 kmem_cache_destroy(gfs2_quotad_cachep); 168 kmem_cache_destroy(gfs2_quotad_cachep);
169 kmem_cache_destroy(gfs2_rgrpd_cachep); 169 kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 59d2695509d..09fa3196557 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -7,6 +7,7 @@
7 * of the GNU General Public License version 2. 7 * of the GNU General Public License version 2.
8 */ 8 */
9 9
10#include <linux/module.h>
10#include <linux/slab.h> 11#include <linux/slab.h>
11#include <linux/spinlock.h> 12#include <linux/spinlock.h>
12#include <linux/completion.h> 13#include <linux/completion.h>
@@ -593,6 +594,7 @@ fail:
593} 594}
594 595
595struct slow_work_ops gfs2_recover_ops = { 596struct slow_work_ops gfs2_recover_ops = {
597 .owner = THIS_MODULE,
596 .get_ref = gfs2_recover_get_ref, 598 .get_ref = gfs2_recover_get_ref,
597 .put_ref = gfs2_recover_put_ref, 599 .put_ref = gfs2_recover_put_ref,
598 .execute = gfs2_recover_work, 600 .execute = gfs2_recover_work,
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 9b9d6395bad..052f214ea6f 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -58,6 +58,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
58 } 58 }
59 unlock_new_inode(tree->inode); 59 unlock_new_inode(tree->inode);
60 60
61 if (!HFS_I(tree->inode)->first_blocks) {
62 printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n");
63 goto free_inode;
64 }
65
61 mapping = tree->inode->i_mapping; 66 mapping = tree->inode->i_mapping;
62 page = read_mapping_page(mapping, 0, NULL); 67 page = read_mapping_page(mapping, 0, NULL);
63 if (IS_ERR(page)) 68 if (IS_ERR(page))
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 175d08eacc8..bed78ac8f6d 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -99,6 +99,10 @@ int hfsplus_read_wrapper(struct super_block *sb)
99 99
100 if (hfsplus_get_last_session(sb, &part_start, &part_size)) 100 if (hfsplus_get_last_session(sb, &part_start, &part_size))
101 return -EINVAL; 101 return -EINVAL;
102 if ((u64)part_start + part_size > 0x100000000ULL) {
103 pr_err("hfs: volumes larger than 2TB are not supported yet\n");
104 return -EINVAL;
105 }
102 while (1) { 106 while (1) {
103 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); 107 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
104 if (!bh) 108 if (!bh)
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7b17a14396f..6c751106c2e 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -254,7 +254,7 @@ int __generic_block_fiemap(struct inode *inode,
254 u64 len, get_block_t *get_block) 254 u64 len, get_block_t *get_block)
255{ 255{
256 struct buffer_head tmp; 256 struct buffer_head tmp;
257 unsigned int start_blk; 257 unsigned long long start_blk;
258 long long length = 0, map_len = 0; 258 long long length = 0, map_len = 0;
259 u64 logical = 0, phys = 0, size = 0; 259 u64 logical = 0, phys = 0, size = 0;
260 u32 flags = FIEMAP_EXTENT_MERGED; 260 u32 flags = FIEMAP_EXTENT_MERGED;
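The one-line __generic_block_fiemap() change above widens start_blk because the starting block number is derived from a 64-bit byte offset; with 512-byte blocks, any offset of 2 TiB or more no longer fits in 32 bits once shifted by the block size. A small userspace illustration of the wraparound the wider type avoids (assumed values, not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t start = 3ULL << 40;		/* FIEMAP request 3 TiB into a file */
		unsigned int blkbits = 9;		/* 512-byte filesystem blocks */

		uint32_t as_uint = start >> blkbits;	/* old: unsigned int start_blk */
		uint64_t as_ull  = start >> blkbits;	/* new: unsigned long long start_blk */

		/* prints "2147483648 vs 6442450944": the 32-bit value has wrapped */
		printf("%u vs %llu\n", as_uint, (unsigned long long)as_ull);
		return 0;
	}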
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index bd3c073b485..4160afad6d0 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -73,6 +73,7 @@ EXPORT_SYMBOL(journal_errno);
73EXPORT_SYMBOL(journal_ack_err); 73EXPORT_SYMBOL(journal_ack_err);
74EXPORT_SYMBOL(journal_clear_err); 74EXPORT_SYMBOL(journal_clear_err);
75EXPORT_SYMBOL(log_wait_commit); 75EXPORT_SYMBOL(log_wait_commit);
76EXPORT_SYMBOL(log_start_commit);
76EXPORT_SYMBOL(journal_start_commit); 77EXPORT_SYMBOL(journal_start_commit);
77EXPORT_SYMBOL(journal_force_commit_nested); 78EXPORT_SYMBOL(journal_force_commit_nested);
78EXPORT_SYMBOL(journal_wipe); 79EXPORT_SYMBOL(journal_wipe);
@@ -756,6 +757,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
756 757
757 return journal; 758 return journal;
758out_err: 759out_err:
760 kfree(journal->j_wbuf);
759 kfree(journal); 761 kfree(journal);
760 return NULL; 762 return NULL;
761} 763}
@@ -820,6 +822,7 @@ journal_t * journal_init_inode (struct inode *inode)
820 822
821 return journal; 823 return journal;
822out_err: 824out_err:
825 kfree(journal->j_wbuf);
823 kfree(journal); 826 kfree(journal);
824 return NULL; 827 return NULL;
825} 828}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5d70b3e6d49..ca0f5eb62b2 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -643,6 +643,7 @@ out:
643 643
644int __jbd2_journal_remove_checkpoint(struct journal_head *jh) 644int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
645{ 645{
646 struct transaction_chp_stats_s *stats;
646 transaction_t *transaction; 647 transaction_t *transaction;
647 journal_t *journal; 648 journal_t *journal;
648 int ret = 0; 649 int ret = 0;
@@ -679,6 +680,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
679 680
680 /* OK, that was the last buffer for the transaction: we can now 681 /* OK, that was the last buffer for the transaction: we can now
681 safely remove this transaction from the log */ 682 safely remove this transaction from the log */
683 stats = &transaction->t_chp_stats;
684 if (stats->cs_chp_time)
685 stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
686 jiffies);
687 trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev,
688 transaction->t_tid, stats);
682 689
683 __jbd2_journal_drop_transaction(journal, transaction); 690 __jbd2_journal_drop_transaction(journal, transaction);
684 kfree(transaction); 691 kfree(transaction);
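
Here the checkpoint statistics are finalized (start timestamp turned into an elapsed time) and handed to a tracepoint just before the transaction is dropped, instead of being filed into an in-memory history. A userspace sketch of that "finish the timer, then emit one event" pattern; the time helper and trace function are stand-ins, not the real jbd2_time_diff()/tracepoint definitions:

#include <stdio.h>

struct chp_stats {
        unsigned long cs_chp_time;   /* start time, then elapsed time */
        unsigned int  cs_dropped;
};

/* Stand-in for jbd2_time_diff(): elapsed ticks between start and end. */
static unsigned long time_diff(unsigned long start, unsigned long end)
{
        return end - start;
}

/* Stand-in for the trace_jbd2_checkpoint_stats() tracepoint. */
static void trace_checkpoint_stats(unsigned int tid, const struct chp_stats *s)
{
        printf("checkpoint: tid=%u chp_time=%lu dropped=%u\n",
               tid, s->cs_chp_time, s->cs_dropped);
}

int main(void)
{
        struct chp_stats stats = { .cs_chp_time = 100, .cs_dropped = 3 };
        unsigned long now = 160;            /* pretend "jiffies" */

        if (stats.cs_chp_time)              /* only if checkpointing ever started */
                stats.cs_chp_time = time_diff(stats.cs_chp_time, now);
        trace_checkpoint_stats(42, &stats);
        return 0;
}
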
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 26d991ddc1e..d4cfd6d2779 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -410,10 +410,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
410 if (commit_transaction->t_synchronous_commit) 410 if (commit_transaction->t_synchronous_commit)
411 write_op = WRITE_SYNC_PLUG; 411 write_op = WRITE_SYNC_PLUG;
412 trace_jbd2_commit_locking(journal, commit_transaction); 412 trace_jbd2_commit_locking(journal, commit_transaction);
413 stats.u.run.rs_wait = commit_transaction->t_max_wait; 413 stats.run.rs_wait = commit_transaction->t_max_wait;
414 stats.u.run.rs_locked = jiffies; 414 stats.run.rs_locked = jiffies;
415 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 415 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
416 stats.u.run.rs_locked); 416 stats.run.rs_locked);
417 417
418 spin_lock(&commit_transaction->t_handle_lock); 418 spin_lock(&commit_transaction->t_handle_lock);
419 while (commit_transaction->t_updates) { 419 while (commit_transaction->t_updates) {
@@ -486,9 +486,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
486 jbd2_journal_switch_revoke_table(journal); 486 jbd2_journal_switch_revoke_table(journal);
487 487
488 trace_jbd2_commit_flushing(journal, commit_transaction); 488 trace_jbd2_commit_flushing(journal, commit_transaction);
489 stats.u.run.rs_flushing = jiffies; 489 stats.run.rs_flushing = jiffies;
490 stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, 490 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
491 stats.u.run.rs_flushing); 491 stats.run.rs_flushing);
492 492
493 commit_transaction->t_state = T_FLUSH; 493 commit_transaction->t_state = T_FLUSH;
494 journal->j_committing_transaction = commit_transaction; 494 journal->j_committing_transaction = commit_transaction;
@@ -523,11 +523,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
523 spin_unlock(&journal->j_state_lock); 523 spin_unlock(&journal->j_state_lock);
524 524
525 trace_jbd2_commit_logging(journal, commit_transaction); 525 trace_jbd2_commit_logging(journal, commit_transaction);
526 stats.u.run.rs_logging = jiffies; 526 stats.run.rs_logging = jiffies;
527 stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, 527 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
528 stats.u.run.rs_logging); 528 stats.run.rs_logging);
529 stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; 529 stats.run.rs_blocks = commit_transaction->t_outstanding_credits;
530 stats.u.run.rs_blocks_logged = 0; 530 stats.run.rs_blocks_logged = 0;
531 531
532 J_ASSERT(commit_transaction->t_nr_buffers <= 532 J_ASSERT(commit_transaction->t_nr_buffers <=
533 commit_transaction->t_outstanding_credits); 533 commit_transaction->t_outstanding_credits);
@@ -695,7 +695,7 @@ start_journal_io:
695 submit_bh(write_op, bh); 695 submit_bh(write_op, bh);
696 } 696 }
697 cond_resched(); 697 cond_resched();
698 stats.u.run.rs_blocks_logged += bufs; 698 stats.run.rs_blocks_logged += bufs;
699 699
700 /* Force a new descriptor to be generated next 700 /* Force a new descriptor to be generated next
701 time round the loop. */ 701 time round the loop. */
@@ -988,33 +988,30 @@ restart_loop:
988 J_ASSERT(commit_transaction->t_state == T_COMMIT); 988 J_ASSERT(commit_transaction->t_state == T_COMMIT);
989 989
990 commit_transaction->t_start = jiffies; 990 commit_transaction->t_start = jiffies;
991 stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, 991 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
992 commit_transaction->t_start); 992 commit_transaction->t_start);
993 993
994 /* 994 /*
995 * File the transaction for history 995 * File the transaction statistics
996 */ 996 */
997 stats.ts_type = JBD2_STATS_RUN;
998 stats.ts_tid = commit_transaction->t_tid; 997 stats.ts_tid = commit_transaction->t_tid;
999 stats.u.run.rs_handle_count = commit_transaction->t_handle_count; 998 stats.run.rs_handle_count = commit_transaction->t_handle_count;
1000 spin_lock(&journal->j_history_lock); 999 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1001 memcpy(journal->j_history + journal->j_history_cur, &stats, 1000 commit_transaction->t_tid, &stats.run);
1002 sizeof(stats));
1003 if (++journal->j_history_cur == journal->j_history_max)
1004 journal->j_history_cur = 0;
1005 1001
1006 /* 1002 /*
1007 * Calculate overall stats 1003 * Calculate overall stats
1008 */ 1004 */
1005 spin_lock(&journal->j_history_lock);
1009 journal->j_stats.ts_tid++; 1006 journal->j_stats.ts_tid++;
1010 journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; 1007 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1011 journal->j_stats.u.run.rs_running += stats.u.run.rs_running; 1008 journal->j_stats.run.rs_running += stats.run.rs_running;
1012 journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; 1009 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1013 journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; 1010 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1014 journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; 1011 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1015 journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; 1012 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1016 journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; 1013 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1017 journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; 1014 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1018 spin_unlock(&journal->j_history_lock); 1015 spin_unlock(&journal->j_history_lock);
1019 1016
1020 commit_transaction->t_state = T_FINISHED; 1017 commit_transaction->t_state = T_FINISHED;
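
The per-transaction run statistics are no longer copied into the journal's history ring buffer; each commit is reported through trace_jbd2_run_stats(), and only the cumulative totals in journal->j_stats are kept under j_history_lock (with the former union member u.run flattened to a plain run member, so ts_type is gone too). A compact sketch of that restructuring, with invented names:

#include <stdio.h>

struct run_stats { unsigned long rs_wait, rs_running, rs_blocks; };

struct totals {
        unsigned long ts_tid;        /* number of committed transactions */
        struct run_stats run;        /* cumulative sums                  */
};

/* Stand-in for the trace_jbd2_run_stats() tracepoint: one event per commit. */
static void trace_run_stats(unsigned long tid, const struct run_stats *r)
{
        printf("commit tid=%lu wait=%lu running=%lu blocks=%lu\n",
               tid, r->rs_wait, r->rs_running, r->rs_blocks);
}

static void commit_done(struct totals *t, unsigned long tid,
                        const struct run_stats *r)
{
        trace_run_stats(tid, r);     /* replaces the in-memory history copy */

        /* keep only running totals (would be under j_history_lock) */
        t->ts_tid++;
        t->run.rs_wait    += r->rs_wait;
        t->run.rs_running += r->rs_running;
        t->run.rs_blocks  += r->rs_blocks;
}

int main(void)
{
        struct totals t = { 0 };
        struct run_stats r = { .rs_wait = 2, .rs_running = 10, .rs_blocks = 64 };

        commit_done(&t, 1, &r);
        printf("avg blocks per tid = %lu\n", t.run.rs_blocks / t.ts_tid);
        return 0;
}
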
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 53b86e16e5f..fed85388ee8 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -136,10 +136,6 @@ static int kjournald2(void *arg)
136 journal->j_task = current; 136 journal->j_task = current;
137 wake_up(&journal->j_wait_done_commit); 137 wake_up(&journal->j_wait_done_commit);
138 138
139 printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
140 "commit interval %ld seconds\n", current->pid,
141 journal->j_devname, journal->j_commit_interval / HZ);
142
143 /* 139 /*
144 * And now, wait forever for commit wakeup events. 140 * And now, wait forever for commit wakeup events.
145 */ 141 */
@@ -223,7 +219,8 @@ static int jbd2_journal_start_thread(journal_t *journal)
223{ 219{
224 struct task_struct *t; 220 struct task_struct *t;
225 221
226 t = kthread_run(kjournald2, journal, "kjournald2"); 222 t = kthread_run(kjournald2, journal, "jbd2/%s",
223 journal->j_devname);
227 if (IS_ERR(t)) 224 if (IS_ERR(t))
228 return PTR_ERR(t); 225 return PTR_ERR(t);
229 226
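
The journal thread is now started with a printf-style name, "jbd2/<device>", so each filesystem's commit thread is identifiable in ps/top rather than all appearing as "kjournald2". A rough userspace analogy using pthreads; pthread_setname_np() is a Linux/glibc extension and limits names to 15 characters, so this is only an illustration of the idea, not the kthread API:

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static void *worker(void *arg)
{
        (void)arg;
        sleep(1);          /* stands in for the commit loop */
        return NULL;
}

int main(void)
{
        pthread_t t;
        char name[16];     /* glibc limit: 15 chars + NUL */

        /* same idea as kthread_run(..., "jbd2/%s", j_devname): bake the
         * device name into the thread name */
        snprintf(name, sizeof(name), "jbd2/%s", "sda1-8");
        pthread_create(&t, NULL, worker, NULL);
        pthread_setname_np(t, name);
        pthread_join(t, NULL);
        return 0;
}
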
@@ -679,153 +676,6 @@ struct jbd2_stats_proc_session {
679 int max; 676 int max;
680}; 677};
681 678
682static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
683 struct transaction_stats_s *ts,
684 int first)
685{
686 if (ts == s->stats + s->max)
687 ts = s->stats;
688 if (!first && ts == s->stats + s->start)
689 return NULL;
690 while (ts->ts_type == 0) {
691 ts++;
692 if (ts == s->stats + s->max)
693 ts = s->stats;
694 if (ts == s->stats + s->start)
695 return NULL;
696 }
697 return ts;
698
699}
700
701static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
702{
703 struct jbd2_stats_proc_session *s = seq->private;
704 struct transaction_stats_s *ts;
705 int l = *pos;
706
707 if (l == 0)
708 return SEQ_START_TOKEN;
709 ts = jbd2_history_skip_empty(s, s->stats + s->start, 1);
710 if (!ts)
711 return NULL;
712 l--;
713 while (l) {
714 ts = jbd2_history_skip_empty(s, ++ts, 0);
715 if (!ts)
716 break;
717 l--;
718 }
719 return ts;
720}
721
722static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
723{
724 struct jbd2_stats_proc_session *s = seq->private;
725 struct transaction_stats_s *ts = v;
726
727 ++*pos;
728 if (v == SEQ_START_TOKEN)
729 return jbd2_history_skip_empty(s, s->stats + s->start, 1);
730 else
731 return jbd2_history_skip_empty(s, ++ts, 0);
732}
733
734static int jbd2_seq_history_show(struct seq_file *seq, void *v)
735{
736 struct transaction_stats_s *ts = v;
737 if (v == SEQ_START_TOKEN) {
738 seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
739 "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
740 "wait", "run", "lock", "flush", "log", "hndls",
741 "block", "inlog", "ctime", "write", "drop",
742 "close");
743 return 0;
744 }
745 if (ts->ts_type == JBD2_STATS_RUN)
746 seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u "
747 "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
748 jiffies_to_msecs(ts->u.run.rs_wait),
749 jiffies_to_msecs(ts->u.run.rs_running),
750 jiffies_to_msecs(ts->u.run.rs_locked),
751 jiffies_to_msecs(ts->u.run.rs_flushing),
752 jiffies_to_msecs(ts->u.run.rs_logging),
753 ts->u.run.rs_handle_count,
754 ts->u.run.rs_blocks,
755 ts->u.run.rs_blocks_logged);
756 else if (ts->ts_type == JBD2_STATS_CHECKPOINT)
757 seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n",
758 "C", ts->ts_tid, " ",
759 jiffies_to_msecs(ts->u.chp.cs_chp_time),
760 ts->u.chp.cs_written, ts->u.chp.cs_dropped,
761 ts->u.chp.cs_forced_to_close);
762 else
763 J_ASSERT(0);
764 return 0;
765}
766
767static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
768{
769}
770
771static const struct seq_operations jbd2_seq_history_ops = {
772 .start = jbd2_seq_history_start,
773 .next = jbd2_seq_history_next,
774 .stop = jbd2_seq_history_stop,
775 .show = jbd2_seq_history_show,
776};
777
778static int jbd2_seq_history_open(struct inode *inode, struct file *file)
779{
780 journal_t *journal = PDE(inode)->data;
781 struct jbd2_stats_proc_session *s;
782 int rc, size;
783
784 s = kmalloc(sizeof(*s), GFP_KERNEL);
785 if (s == NULL)
786 return -ENOMEM;
787 size = sizeof(struct transaction_stats_s) * journal->j_history_max;
788 s->stats = kmalloc(size, GFP_KERNEL);
789 if (s->stats == NULL) {
790 kfree(s);
791 return -ENOMEM;
792 }
793 spin_lock(&journal->j_history_lock);
794 memcpy(s->stats, journal->j_history, size);
795 s->max = journal->j_history_max;
796 s->start = journal->j_history_cur % s->max;
797 spin_unlock(&journal->j_history_lock);
798
799 rc = seq_open(file, &jbd2_seq_history_ops);
800 if (rc == 0) {
801 struct seq_file *m = file->private_data;
802 m->private = s;
803 } else {
804 kfree(s->stats);
805 kfree(s);
806 }
807 return rc;
808
809}
810
811static int jbd2_seq_history_release(struct inode *inode, struct file *file)
812{
813 struct seq_file *seq = file->private_data;
814 struct jbd2_stats_proc_session *s = seq->private;
815
816 kfree(s->stats);
817 kfree(s);
818 return seq_release(inode, file);
819}
820
821static struct file_operations jbd2_seq_history_fops = {
822 .owner = THIS_MODULE,
823 .open = jbd2_seq_history_open,
824 .read = seq_read,
825 .llseek = seq_lseek,
826 .release = jbd2_seq_history_release,
827};
828
829static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) 679static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
830{ 680{
831 return *pos ? NULL : SEQ_START_TOKEN; 681 return *pos ? NULL : SEQ_START_TOKEN;
@@ -842,29 +692,29 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
842 692
843 if (v != SEQ_START_TOKEN) 693 if (v != SEQ_START_TOKEN)
844 return 0; 694 return 0;
845 seq_printf(seq, "%lu transaction, each upto %u blocks\n", 695 seq_printf(seq, "%lu transaction, each up to %u blocks\n",
846 s->stats->ts_tid, 696 s->stats->ts_tid,
847 s->journal->j_max_transaction_buffers); 697 s->journal->j_max_transaction_buffers);
848 if (s->stats->ts_tid == 0) 698 if (s->stats->ts_tid == 0)
849 return 0; 699 return 0;
850 seq_printf(seq, "average: \n %ums waiting for transaction\n", 700 seq_printf(seq, "average: \n %ums waiting for transaction\n",
851 jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid)); 701 jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
852 seq_printf(seq, " %ums running transaction\n", 702 seq_printf(seq, " %ums running transaction\n",
853 jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid)); 703 jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
854 seq_printf(seq, " %ums transaction was being locked\n", 704 seq_printf(seq, " %ums transaction was being locked\n",
855 jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid)); 705 jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
856 seq_printf(seq, " %ums flushing data (in ordered mode)\n", 706 seq_printf(seq, " %ums flushing data (in ordered mode)\n",
857 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); 707 jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
858 seq_printf(seq, " %ums logging transaction\n", 708 seq_printf(seq, " %ums logging transaction\n",
859 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); 709 jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
860 seq_printf(seq, " %lluus average transaction commit time\n", 710 seq_printf(seq, " %lluus average transaction commit time\n",
861 div_u64(s->journal->j_average_commit_time, 1000)); 711 div_u64(s->journal->j_average_commit_time, 1000));
862 seq_printf(seq, " %lu handles per transaction\n", 712 seq_printf(seq, " %lu handles per transaction\n",
863 s->stats->u.run.rs_handle_count / s->stats->ts_tid); 713 s->stats->run.rs_handle_count / s->stats->ts_tid);
864 seq_printf(seq, " %lu blocks per transaction\n", 714 seq_printf(seq, " %lu blocks per transaction\n",
865 s->stats->u.run.rs_blocks / s->stats->ts_tid); 715 s->stats->run.rs_blocks / s->stats->ts_tid);
866 seq_printf(seq, " %lu logged blocks per transaction\n", 716 seq_printf(seq, " %lu logged blocks per transaction\n",
867 s->stats->u.run.rs_blocks_logged / s->stats->ts_tid); 717 s->stats->run.rs_blocks_logged / s->stats->ts_tid);
868 return 0; 718 return 0;
869} 719}
870 720
@@ -920,7 +770,7 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file)
920 return seq_release(inode, file); 770 return seq_release(inode, file);
921} 771}
922 772
923static struct file_operations jbd2_seq_info_fops = { 773static const struct file_operations jbd2_seq_info_fops = {
924 .owner = THIS_MODULE, 774 .owner = THIS_MODULE,
925 .open = jbd2_seq_info_open, 775 .open = jbd2_seq_info_open,
926 .read = seq_read, 776 .read = seq_read,
@@ -934,8 +784,6 @@ static void jbd2_stats_proc_init(journal_t *journal)
934{ 784{
935 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); 785 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
936 if (journal->j_proc_entry) { 786 if (journal->j_proc_entry) {
937 proc_create_data("history", S_IRUGO, journal->j_proc_entry,
938 &jbd2_seq_history_fops, journal);
939 proc_create_data("info", S_IRUGO, journal->j_proc_entry, 787 proc_create_data("info", S_IRUGO, journal->j_proc_entry,
940 &jbd2_seq_info_fops, journal); 788 &jbd2_seq_info_fops, journal);
941 } 789 }
@@ -944,27 +792,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
944static void jbd2_stats_proc_exit(journal_t *journal) 792static void jbd2_stats_proc_exit(journal_t *journal)
945{ 793{
946 remove_proc_entry("info", journal->j_proc_entry); 794 remove_proc_entry("info", journal->j_proc_entry);
947 remove_proc_entry("history", journal->j_proc_entry);
948 remove_proc_entry(journal->j_devname, proc_jbd2_stats); 795 remove_proc_entry(journal->j_devname, proc_jbd2_stats);
949} 796}
950 797
951static void journal_init_stats(journal_t *journal)
952{
953 int size;
954
955 if (!proc_jbd2_stats)
956 return;
957
958 journal->j_history_max = 100;
959 size = sizeof(struct transaction_stats_s) * journal->j_history_max;
960 journal->j_history = kzalloc(size, GFP_KERNEL);
961 if (!journal->j_history) {
962 journal->j_history_max = 0;
963 return;
964 }
965 spin_lock_init(&journal->j_history_lock);
966}
967
968/* 798/*
969 * Management for journal control blocks: functions to create and 799 * Management for journal control blocks: functions to create and
970 * destroy journal_t structures, and to initialise and read existing 800 * destroy journal_t structures, and to initialise and read existing
@@ -1009,7 +839,7 @@ static journal_t * journal_init_common (void)
1009 goto fail; 839 goto fail;
1010 } 840 }
1011 841
1012 journal_init_stats(journal); 842 spin_lock_init(&journal->j_history_lock);
1013 843
1014 return journal; 844 return journal;
1015fail: 845fail:
@@ -1083,6 +913,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1083 913
1084 return journal; 914 return journal;
1085out_err: 915out_err:
916 kfree(journal->j_wbuf);
1086 jbd2_stats_proc_exit(journal); 917 jbd2_stats_proc_exit(journal);
1087 kfree(journal); 918 kfree(journal);
1088 return NULL; 919 return NULL;
@@ -1115,7 +946,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1115 while ((p = strchr(p, '/'))) 946 while ((p = strchr(p, '/')))
1116 *p = '!'; 947 *p = '!';
1117 p = journal->j_devname + strlen(journal->j_devname); 948 p = journal->j_devname + strlen(journal->j_devname);
1118 sprintf(p, ":%lu", journal->j_inode->i_ino); 949 sprintf(p, "-%lu", journal->j_inode->i_ino);
1119 jbd_debug(1, 950 jbd_debug(1,
1120 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", 951 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
1121 journal, inode->i_sb->s_id, inode->i_ino, 952 journal, inode->i_sb->s_id, inode->i_ino,
@@ -1156,6 +987,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1156 987
1157 return journal; 988 return journal;
1158out_err: 989out_err:
990 kfree(journal->j_wbuf);
1159 jbd2_stats_proc_exit(journal); 991 jbd2_stats_proc_exit(journal);
1160 kfree(journal); 992 kfree(journal);
1161 return NULL; 993 return NULL;
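
For inode-backed journals the device name is built by replacing '/' with '!' (a '/' would not work as a single proc directory name) and appending the journal inode number, now with a '-' separator instead of ':'. A userspace sketch of that name construction; the input device string is just an example:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char devname[64];
        unsigned long ino = 8;                 /* the journal inode number */
        char *p;

        snprintf(devname, sizeof(devname), "%s", "cciss/c0d0p1");

        /* '/' cannot appear in a proc entry name, so turn it into '!' */
        for (p = devname; (p = strchr(p, '/')); )
                *p = '!';

        /* append the inode number, '-' separated as in the new code */
        p = devname + strlen(devname);
        snprintf(p, sizeof(devname) - (p - devname), "-%lu", ino);

        printf("%s\n", devname);               /* cciss!c0d0p1-8 */
        return 0;
}
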
diff --git a/fs/jffs2/read.c b/fs/jffs2/read.c
index cfe05c1966a..3f39be1b045 100644
--- a/fs/jffs2/read.c
+++ b/fs/jffs2/read.c
@@ -164,12 +164,15 @@ int jffs2_read_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
164 164
165 /* XXX FIXME: Where a single physical node actually shows up in two 165 /* XXX FIXME: Where a single physical node actually shows up in two
166 frags, we read it twice. Don't do that. */ 166 frags, we read it twice. Don't do that. */
167 /* Now we're pointing at the first frag which overlaps our page */ 167 /* Now we're pointing at the first frag which overlaps our page
168 * (or perhaps is before it, if we've been asked to read off the
169 * end of the file). */
168 while(offset < end) { 170 while(offset < end) {
169 D2(printk(KERN_DEBUG "jffs2_read_inode_range: offset %d, end %d\n", offset, end)); 171 D2(printk(KERN_DEBUG "jffs2_read_inode_range: offset %d, end %d\n", offset, end));
170 if (unlikely(!frag || frag->ofs > offset)) { 172 if (unlikely(!frag || frag->ofs > offset ||
173 frag->ofs + frag->size <= offset)) {
171 uint32_t holesize = end - offset; 174 uint32_t holesize = end - offset;
172 if (frag) { 175 if (frag && frag->ofs > offset) {
173 D1(printk(KERN_NOTICE "Eep. Hole in ino #%u fraglist. frag->ofs = 0x%08x, offset = 0x%08x\n", f->inocache->ino, frag->ofs, offset)); 176 D1(printk(KERN_NOTICE "Eep. Hole in ino #%u fraglist. frag->ofs = 0x%08x, offset = 0x%08x\n", f->inocache->ino, frag->ofs, offset));
174 holesize = min(holesize, frag->ofs - offset); 177 holesize = min(holesize, frag->ofs - offset);
175 } 178 }
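
The loop now also treats a fragment that ends at or before the requested offset as a hole (a read off the end of the file), not only a fragment that starts beyond it, and the hole is only clamped to the fragment start in the latter case. A simplified sketch of that decision; the fragment structure is reduced to the two fields the test needs:

#include <stdint.h>
#include <stdio.h>

struct frag { uint32_t ofs; uint32_t size; };

/* How many bytes from 'offset' must be zero-filled before real data can
 * be copied, given the current fragment (or NULL if there is none).
 * Mirrors the updated test in jffs2_read_inode_range(). */
static uint32_t hole_bytes(const struct frag *frag, uint32_t offset, uint32_t end)
{
        if (!frag || frag->ofs > offset || frag->ofs + frag->size <= offset) {
                uint32_t holesize = end - offset;

                if (frag && frag->ofs > offset)
                        holesize = holesize < frag->ofs - offset ?
                                   holesize : frag->ofs - offset;
                return holesize;
        }
        return 0;   /* fragment covers 'offset': no hole here */
}

int main(void)
{
        struct frag f = { .ofs = 0, .size = 4096 };

        /* read past the end of the last fragment: the whole range is a hole */
        printf("%u\n", hole_bytes(&f, 8192, 12288));   /* -> 4096 */
        return 0;
}
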
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 63976c0ccc2..99ea196f071 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1180,7 +1180,7 @@ static int nfs4_init_client(struct nfs_client *clp,
1180 1, flags & NFS_MOUNT_NORESVPORT); 1180 1, flags & NFS_MOUNT_NORESVPORT);
1181 if (error < 0) 1181 if (error < 0)
1182 goto error; 1182 goto error;
1183 memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); 1183 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
1184 1184
1185 error = nfs_idmap_new(clp); 1185 error = nfs_idmap_new(clp);
1186 if (error < 0) { 1186 if (error < 0) {
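
memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)) reads the full destination size from a source string that may be shorter, and does not guarantee termination if it is longer; strlcpy copies at most size-1 bytes and always NUL-terminates. A sketch with a local stand-in, since strlcpy is a BSD/kernel helper rather than standard C:

#include <stdio.h>
#include <string.h>

/* Minimal stand-in for strlcpy(): copy at most size-1 bytes and always
 * NUL-terminate; returns the length of src like the real helper. */
static size_t my_strlcpy(char *dst, const char *src, size_t size)
{
        size_t len = strlen(src);

        if (size) {
                size_t n = len < size - 1 ? len : size - 1;
                memcpy(dst, src, n);
                dst[n] = '\0';
        }
        return len;
}

int main(void)
{
        char ipaddr[16];

        /* memcpy(ipaddr, "10.0.0.1", sizeof(ipaddr)) would read 16 bytes
         * from a 9-byte string; the bounded copy stops at the NUL */
        my_strlcpy(ipaddr, "10.0.0.1", sizeof(ipaddr));
        printf("%s\n", ipaddr);
        return 0;
}
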
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 32062c33c85..7cb298525ee 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1536,6 +1536,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1536 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1536 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1537 dentry->d_parent->d_name.name, dentry->d_name.name); 1537 dentry->d_parent->d_name.name, dentry->d_name.name);
1538 1538
1539 nfs_inode_return_delegation(inode);
1540
1539 d_drop(dentry); 1541 d_drop(dentry);
1540 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); 1542 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
1541 if (error == 0) { 1543 if (error == 0) {
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 6c3210099d5..e1d415e9784 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -457,6 +457,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
457 }; 457 };
458 struct rpc_task_setup task_setup_data = { 458 struct rpc_task_setup task_setup_data = {
459 .rpc_client = NFS_CLIENT(inode), 459 .rpc_client = NFS_CLIENT(inode),
460 .rpc_message = &msg,
460 .callback_ops = &nfs_write_direct_ops, 461 .callback_ops = &nfs_write_direct_ops,
461 .workqueue = nfsiod_workqueue, 462 .workqueue = nfsiod_workqueue,
462 .flags = RPC_TASK_ASYNC, 463 .flags = RPC_TASK_ASYNC,
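
The task setup block previously omitted .rpc_message, and in a C designated initializer any omitted member is zero, so the async task presumably carried a NULL message pointer. A tiny sketch of that initializer behaviour with made-up structures (not the rpc types):

#include <stdio.h>

struct msg { const char *proc; };

struct task_setup {
        struct msg *message;       /* omitted initializers are zeroed */
        int         flags;
};

int main(void)
{
        struct msg msg = { .proc = "WRITE" };

        struct task_setup bad  = { .flags = 1 };                  /* message == NULL */
        struct task_setup good = { .message = &msg, .flags = 1 }; /* field wired in  */

        printf("bad=%p good=%s\n", (void *)bad.message, good.message->proc);
        return 0;
}
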
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 70fad69eb95..fa588006588 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -359,17 +359,13 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp)
359 359
360 BUG_ON(!cookie); 360 BUG_ON(!cookie);
361 361
362 if (fscache_check_page_write(cookie, page)) {
363 if (!(gfp & __GFP_WAIT))
364 return 0;
365 fscache_wait_on_page_write(cookie, page);
366 }
367
368 if (PageFsCache(page)) { 362 if (PageFsCache(page)) {
369 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n", 363 dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
370 cookie, page, nfsi); 364 cookie, page, nfsi);
371 365
372 fscache_uncache_page(cookie, page); 366 if (!fscache_maybe_release_page(cookie, page, gfp))
367 return 0;
368
373 nfs_add_fscache_stats(page->mapping->host, 369 nfs_add_fscache_stats(page->mapping->host,
374 NFSIOS_FSCACHE_PAGES_UNCACHED, 1); 370 NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
375 } 371 }
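
The open-coded sequence "if cache I/O is still writing the page, either fail or wait, then uncache it" is folded into fscache_maybe_release_page(), whose return value says whether the page may be released. A sketch of such a consolidated helper with invented names and a fake gfp flag; it only models the control flow, not the real fscache API:

#include <stdbool.h>
#include <stdio.h>

struct page { bool write_in_progress; bool cached; };

#define GFP_CAN_WAIT 0x1

/* May this page be released?  If caching I/O is still in flight, either
 * wait (when the caller may sleep) or refuse; otherwise drop it from the
 * cache and allow the release. */
static bool maybe_release_page(struct page *p, unsigned gfp)
{
        if (p->write_in_progress) {
                if (!(gfp & GFP_CAN_WAIT))
                        return false;          /* caller cannot sleep: keep it */
                p->write_in_progress = false;  /* pretend we waited for the I/O */
        }
        p->cached = false;                     /* uncache, then let it go */
        return true;
}

int main(void)
{
        struct page p = { .write_in_progress = true, .cached = true };

        printf("%d\n", maybe_release_page(&p, 0));            /* 0: still busy */
        printf("%d\n", maybe_release_page(&p, GFP_CAN_WAIT)); /* 1: waited     */
        return 0;
}
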
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2636c26d56f..fa3408f2011 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -121,7 +121,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
121 121
122 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); 122 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
123 if (IS_ERR(mnt_path)) 123 if (IS_ERR(mnt_path))
124 return mnt; 124 return ERR_CAST(mnt_path);
125 mountdata->mnt_path = mnt_path; 125 mountdata->mnt_path = mnt_path;
126 maxbuflen = mnt_path - 1 - page2; 126 maxbuflen = mnt_path - 1 - page2;
127 127
@@ -132,15 +132,15 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
132 if (buf->len <= 0 || buf->len >= maxbuflen) 132 if (buf->len <= 0 || buf->len >= maxbuflen)
133 continue; 133 continue;
134 134
135 mountdata->addr = (struct sockaddr *)&addr;
136
137 if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) 135 if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
138 continue; 136 continue;
139 mountdata->addrlen = nfs_parse_server_name(buf->data, 137
140 buf->len, 138 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
141 mountdata->addr, mountdata->addrlen); 139 (struct sockaddr *)&addr, sizeof(addr));
142 if (mountdata->addrlen == 0) 140 if (mountdata->addrlen == 0)
143 continue; 141 continue;
142
143 mountdata->addr = (struct sockaddr *)&addr;
144 rpc_set_port(mountdata->addr, NFS_PORT); 144 rpc_set_port(mountdata->addr, NFS_PORT);
145 145
146 memcpy(page2, buf->data, buf->len); 146 memcpy(page2, buf->data, buf->len);
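
Returning mnt when mnt_path held an encoded error handed the caller the wrong pointer instead of the failure; ERR_CAST re-types an ERR_PTR-encoded error so it can be propagated through a function returning a different pointer type. A userspace sketch of the ERR_PTR/IS_ERR/ERR_CAST convention using simplified re-implementations, not the kernel macros:

#include <stdio.h>
#include <stdint.h>

/* Simplified versions of the kernel's ERR_PTR()/PTR_ERR()/IS_ERR(). */
static void *err_ptr(long err)      { return (void *)err; }
static long  ptr_err(const void *p) { return (long)p; }
static int   is_err(const void *p)  { return (uintptr_t)p >= (uintptr_t)-4095; }
#define err_cast(p) ((void *)(p))

struct path  { int dummy; };
struct mount { int dummy; };

static struct path *get_path(int fail)
{
        static struct path pa;

        return fail ? err_ptr(-12) : &pa;   /* -ENOMEM for the sketch */
}

static struct mount *try_location(int fail)
{
        struct path *p = get_path(fail);

        if (is_err(p))
                return err_cast(p);   /* propagate the error, re-typed */
        return NULL;                  /* real work elided */
}

int main(void)
{
        struct mount *m = try_location(1);

        if (is_err(m))
                printf("error %ld propagated\n", ptr_err(m));
        return 0;
}
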
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ed7c269e251..741a562177f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -72,12 +72,17 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
72/* Prevent leaks of NFSv4 errors into userland */ 72/* Prevent leaks of NFSv4 errors into userland */
73static int nfs4_map_errors(int err) 73static int nfs4_map_errors(int err)
74{ 74{
75 if (err < -1000) { 75 if (err >= -1000)
76 return err;
77 switch (err) {
78 case -NFS4ERR_RESOURCE:
79 return -EREMOTEIO;
80 default:
76 dprintk("%s could not handle NFSv4 error %d\n", 81 dprintk("%s could not handle NFSv4 error %d\n",
77 __func__, -err); 82 __func__, -err);
78 return -EIO; 83 break;
79 } 84 }
80 return err; 85 return -EIO;
81} 86}
82 87
83/* 88/*
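
The rewritten nfs4_map_errors() passes through anything that is already a local errno (err >= -1000), translates NFS4ERR_RESOURCE to EREMOTEIO explicitly (it is dropped from the generic XDR table later in this diff), and squashes any other unexpected protocol error to EIO. A sketch of the same shape; the numeric constants are the usual RFC 3530/Linux values, supplied here only for the example:

#include <stdio.h>

#define EIO                 5
#define EREMOTEIO         121
#define NFS4ERR_RESOURCE 10018

/* Keep local errnos, translate known protocol-only errors, and map
 * anything else to EIO so it never leaks to userland. */
static int map_errors(int err)
{
        if (err >= -1000)
                return err;
        switch (err) {
        case -NFS4ERR_RESOURCE:
                return -EREMOTEIO;
        default:
                fprintf(stderr, "could not handle NFSv4 error %d\n", -err);
                break;
        }
        return -EIO;
}

int main(void)
{
        printf("%d %d %d\n", map_errors(-13),
               map_errors(-NFS4ERR_RESOURCE), map_errors(-10099));
        return 0;
}
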
@@ -2762,7 +2767,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2762 .pages = &page, 2767 .pages = &page,
2763 .pgbase = 0, 2768 .pgbase = 0,
2764 .count = count, 2769 .count = count,
2765 .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask, 2770 .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
2766 }; 2771 };
2767 struct nfs4_readdir_res res; 2772 struct nfs4_readdir_res res;
2768 struct rpc_message msg = { 2773 struct rpc_message msg = {
@@ -3060,9 +3065,6 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
3060 if (time_before(clp->cl_last_renewal,timestamp)) 3065 if (time_before(clp->cl_last_renewal,timestamp))
3061 clp->cl_last_renewal = timestamp; 3066 clp->cl_last_renewal = timestamp;
3062 spin_unlock(&clp->cl_lock); 3067 spin_unlock(&clp->cl_lock);
3063 dprintk("%s calling put_rpccred on rpc_cred %p\n", __func__,
3064 task->tk_msg.rpc_cred);
3065 put_rpccred(task->tk_msg.rpc_cred);
3066} 3068}
3067 3069
3068static const struct rpc_call_ops nfs4_renew_ops = { 3070static const struct rpc_call_ops nfs4_renew_ops = {
@@ -4877,7 +4879,6 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
4877 nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp); 4879 nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp);
4878 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); 4880 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
4879 4881
4880 put_rpccred(task->tk_msg.rpc_cred);
4881 kfree(task->tk_msg.rpc_argp); 4882 kfree(task->tk_msg.rpc_argp);
4882 kfree(task->tk_msg.rpc_resp); 4883 kfree(task->tk_msg.rpc_resp);
4883 4884
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index e27c6cef18f..0156c01c212 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -127,12 +127,6 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
127} 127}
128 128
129void 129void
130nfs4_renewd_prepare_shutdown(struct nfs_server *server)
131{
132 cancel_delayed_work(&server->nfs_client->cl_renewd);
133}
134
135void
136nfs4_kill_renewd(struct nfs_client *clp) 130nfs4_kill_renewd(struct nfs_client *clp)
137{ 131{
138 cancel_delayed_work_sync(&clp->cl_renewd); 132 cancel_delayed_work_sync(&clp->cl_renewd);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 83ad47cbdd8..20b4e30e6c8 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5681,7 +5681,6 @@ static struct {
5681 { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, 5681 { NFS4ERR_SERVERFAULT, -ESERVERFAULT },
5682 { NFS4ERR_BADTYPE, -EBADTYPE }, 5682 { NFS4ERR_BADTYPE, -EBADTYPE },
5683 { NFS4ERR_LOCKED, -EAGAIN }, 5683 { NFS4ERR_LOCKED, -EAGAIN },
5684 { NFS4ERR_RESOURCE, -EREMOTEIO },
5685 { NFS4ERR_SYMLINK, -ELOOP }, 5684 { NFS4ERR_SYMLINK, -ELOOP },
5686 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, 5685 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
5687 { NFS4ERR_DEADLOCK, -EDEADLK }, 5686 { NFS4ERR_DEADLOCK, -EDEADLK },
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 29786d3b932..90be551b80c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -728,22 +728,24 @@ static void nfs_umount_begin(struct super_block *sb)
728 unlock_kernel(); 728 unlock_kernel();
729} 729}
730 730
731static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(int flags) 731static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version)
732{ 732{
733 struct nfs_parsed_mount_data *data; 733 struct nfs_parsed_mount_data *data;
734 734
735 data = kzalloc(sizeof(*data), GFP_KERNEL); 735 data = kzalloc(sizeof(*data), GFP_KERNEL);
736 if (data) { 736 if (data) {
737 data->flags = flags;
738 data->rsize = NFS_MAX_FILE_IO_SIZE; 737 data->rsize = NFS_MAX_FILE_IO_SIZE;
739 data->wsize = NFS_MAX_FILE_IO_SIZE; 738 data->wsize = NFS_MAX_FILE_IO_SIZE;
740 data->acregmin = NFS_DEF_ACREGMIN; 739 data->acregmin = NFS_DEF_ACREGMIN;
741 data->acregmax = NFS_DEF_ACREGMAX; 740 data->acregmax = NFS_DEF_ACREGMAX;
742 data->acdirmin = NFS_DEF_ACDIRMIN; 741 data->acdirmin = NFS_DEF_ACDIRMIN;
743 data->acdirmax = NFS_DEF_ACDIRMAX; 742 data->acdirmax = NFS_DEF_ACDIRMAX;
743 data->mount_server.port = NFS_UNSPEC_PORT;
744 data->nfs_server.port = NFS_UNSPEC_PORT; 744 data->nfs_server.port = NFS_UNSPEC_PORT;
745 data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
745 data->auth_flavors[0] = RPC_AUTH_UNIX; 746 data->auth_flavors[0] = RPC_AUTH_UNIX;
746 data->auth_flavor_len = 1; 747 data->auth_flavor_len = 1;
748 data->version = version;
747 data->minorversion = 0; 749 data->minorversion = 0;
748 } 750 }
749 return data; 751 return data;
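
The constructor now takes the NFS version directly and bakes in every transport default itself (unspecified ports, TCP), rather than having callers pass pre-built flags; later code can then tell "unspecified" apart from a real user-supplied value. A sketch of the allocate-and-default pattern with invented fields:

#include <stdio.h>
#include <stdlib.h>

#define PORT_UNSPEC (-1)

struct mount_data {
        unsigned int version;
        int nfs_port;
        int mount_port;
        int use_tcp;
};

/* Allocate the parsed-options structure with defaults baked in, keyed
 * only by protocol version (in the spirit of nfs_alloc_parsed_mount_data). */
static struct mount_data *alloc_mount_data(unsigned int version)
{
        struct mount_data *d = calloc(1, sizeof(*d));

        if (!d)
                return NULL;
        d->version    = version;
        d->nfs_port   = PORT_UNSPEC;   /* "not given on the command line" */
        d->mount_port = PORT_UNSPEC;
        d->use_tcp    = 1;             /* TCP unless the options say otherwise */
        return d;
}

int main(void)
{
        struct mount_data *d = alloc_mount_data(4);

        if (d)
                printf("v%u port=%d tcp=%d\n", d->version, d->nfs_port, d->use_tcp);
        free(d);
        return 0;
}
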
@@ -776,15 +778,13 @@ static int nfs_verify_server_address(struct sockaddr *addr)
776 * Select between a default port value and a user-specified port value. 778 * Select between a default port value and a user-specified port value.
777 * If a zero value is set, then autobind will be used. 779 * If a zero value is set, then autobind will be used.
778 */ 780 */
779static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port, 781static void nfs_set_port(struct sockaddr *sap, int *port,
780 const unsigned short default_port) 782 const unsigned short default_port)
781{ 783{
782 unsigned short port = default_port; 784 if (*port == NFS_UNSPEC_PORT)
785 *port = default_port;
783 786
784 if (parsed_port != NFS_UNSPEC_PORT) 787 rpc_set_port(sap, *port);
785 port = parsed_port;
786
787 rpc_set_port(sap, port);
788} 788}
789 789
790/* 790/*
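
Unlike the old nfs_set_default_port(), the reworked helper writes the chosen port back into the parsed data, so the value actually used is remembered; the remount comparison later in this diff relies on that when it checks data->nfs_server.port against nfss->port. A sketch of that write-back behaviour with stand-in types (rpc_set_port is modelled by a plain assignment):

#include <stdio.h>

#define NFS_UNSPEC_PORT (-1)

/* Fill in the default only when the user gave no port, and record the
 * final choice in *port so later code can compare against it. */
static void set_port(int *port, unsigned short default_port,
                     unsigned short *sockaddr_port)
{
        if (*port == NFS_UNSPEC_PORT)
                *port = default_port;
        *sockaddr_port = (unsigned short)*port;   /* stands in for rpc_set_port() */
}

int main(void)
{
        int port = NFS_UNSPEC_PORT;
        unsigned short sin_port;

        set_port(&port, 2049, &sin_port);
        printf("chosen=%d sin_port=%u\n", port, sin_port);
        return 0;
}
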
@@ -1253,6 +1253,7 @@ static int nfs_parse_mount_options(char *raw,
1253 default: 1253 default:
1254 dfprintk(MOUNT, "NFS: unrecognized " 1254 dfprintk(MOUNT, "NFS: unrecognized "
1255 "transport protocol\n"); 1255 "transport protocol\n");
1256 kfree(string);
1256 return 0; 1257 return 0;
1257 } 1258 }
1258 break; 1259 break;
@@ -1475,7 +1476,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1475 args->mount_server.addrlen = args->nfs_server.addrlen; 1476 args->mount_server.addrlen = args->nfs_server.addrlen;
1476 } 1477 }
1477 request.salen = args->mount_server.addrlen; 1478 request.salen = args->mount_server.addrlen;
1478 nfs_set_default_port(request.sap, args->mount_server.port, 0); 1479 nfs_set_port(request.sap, &args->mount_server.port, 0);
1479 1480
1480 /* 1481 /*
1481 * Now ask the mount server to map our export path 1482 * Now ask the mount server to map our export path
@@ -1711,8 +1712,6 @@ static int nfs_validate_mount_data(void *options,
1711 1712
1712 if (!(data->flags & NFS_MOUNT_TCP)) 1713 if (!(data->flags & NFS_MOUNT_TCP))
1713 args->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1714 args->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1714 else
1715 args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1716 /* N.B. caller will free nfs_server.hostname in all cases */ 1715 /* N.B. caller will free nfs_server.hostname in all cases */
1717 args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); 1716 args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
1718 args->namlen = data->namlen; 1717 args->namlen = data->namlen;
@@ -1767,7 +1766,7 @@ static int nfs_validate_mount_data(void *options,
1767 goto out_v4_not_compiled; 1766 goto out_v4_not_compiled;
1768#endif 1767#endif
1769 1768
1770 nfs_set_default_port(sap, args->nfs_server.port, 0); 1769 nfs_set_port(sap, &args->nfs_server.port, 0);
1771 1770
1772 nfs_set_mount_transport_protocol(args); 1771 nfs_set_mount_transport_protocol(args);
1773 1772
@@ -1848,9 +1847,10 @@ nfs_compare_remount_data(struct nfs_server *nfss,
1848 data->acdirmin != nfss->acdirmin / HZ || 1847 data->acdirmin != nfss->acdirmin / HZ ||
1849 data->acdirmax != nfss->acdirmax / HZ || 1848 data->acdirmax != nfss->acdirmax / HZ ||
1850 data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || 1849 data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
1850 data->nfs_server.port != nfss->port ||
1851 data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || 1851 data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
1852 memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr, 1852 !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address,
1853 data->nfs_server.addrlen) != 0) 1853 (struct sockaddr *)&nfss->nfs_client->cl_addr))
1854 return -EINVAL; 1854 return -EINVAL;
1855 1855
1856 return 0; 1856 return 0;
@@ -1893,6 +1893,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
1893 data->acdirmin = nfss->acdirmin / HZ; 1893 data->acdirmin = nfss->acdirmin / HZ;
1894 data->acdirmax = nfss->acdirmax / HZ; 1894 data->acdirmax = nfss->acdirmax / HZ;
1895 data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; 1895 data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
1896 data->nfs_server.port = nfss->port;
1896 data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; 1897 data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
1897 memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, 1898 memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
1898 data->nfs_server.addrlen); 1899 data->nfs_server.addrlen);
@@ -2106,7 +2107,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2106 }; 2107 };
2107 int error = -ENOMEM; 2108 int error = -ENOMEM;
2108 2109
2109 data = nfs_alloc_parsed_mount_data(NFS_MOUNT_VER3 | NFS_MOUNT_TCP); 2110 data = nfs_alloc_parsed_mount_data(3);
2110 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); 2111 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
2111 if (data == NULL || mntfh == NULL) 2112 if (data == NULL || mntfh == NULL)
2112 goto out_free_fh; 2113 goto out_free_fh;
@@ -2331,7 +2332,7 @@ static int nfs4_validate_text_mount_data(void *options,
2331{ 2332{
2332 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; 2333 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
2333 2334
2334 nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT); 2335 nfs_set_port(sap, &args->nfs_server.port, NFS_PORT);
2335 2336
2336 nfs_validate_transport_protocol(args); 2337 nfs_validate_transport_protocol(args);
2337 2338
@@ -2376,7 +2377,6 @@ static int nfs4_validate_mount_data(void *options,
2376 if (data == NULL) 2377 if (data == NULL)
2377 goto out_no_data; 2378 goto out_no_data;
2378 2379
2379 args->version = 4;
2380 switch (data->version) { 2380 switch (data->version) {
2381 case 1: 2381 case 1:
2382 if (data->host_addrlen > sizeof(args->nfs_server.address)) 2382 if (data->host_addrlen > sizeof(args->nfs_server.address))
@@ -2660,7 +2660,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2660 struct nfs_parsed_mount_data *data; 2660 struct nfs_parsed_mount_data *data;
2661 int error = -ENOMEM; 2661 int error = -ENOMEM;
2662 2662
2663 data = nfs_alloc_parsed_mount_data(0); 2663 data = nfs_alloc_parsed_mount_data(4);
2664 if (data == NULL) 2664 if (data == NULL)
2665 goto out_free_data; 2665 goto out_free_data;
2666 2666
@@ -2690,7 +2690,6 @@ static void nfs4_kill_super(struct super_block *sb)
2690 dprintk("--> %s\n", __func__); 2690 dprintk("--> %s\n", __func__);
2691 nfs_super_return_all_delegations(sb); 2691 nfs_super_return_all_delegations(sb);
2692 kill_anon_super(sb); 2692 kill_anon_super(sb);
2693 nfs4_renewd_prepare_shutdown(server);
2694 nfs_fscache_release_super_cookie(sb); 2693 nfs_fscache_release_super_cookie(sb);
2695 nfs_free_server(server); 2694 nfs_free_server(server);
2696 dprintk("<-- %s\n", __func__); 2695 dprintk("<-- %s\n", __func__);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index edf926e1062..d0a2ce1b432 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -958,7 +958,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
958 p1 = encode_entry_baggage(cd, p1, name, namlen, ino); 958 p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
959 959
960 if (plus) 960 if (plus)
961 p = encode_entryplus_baggage(cd, p1, name, namlen); 961 p1 = encode_entryplus_baggage(cd, p1, name, namlen);
962 962
963 /* determine entry word length and lengths to go in pages */ 963 /* determine entry word length and lengths to go in pages */
964 num_entry_words = p1 - tmp; 964 num_entry_words = p1 - tmp;
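
The result of encode_entryplus_baggage() was assigned to p instead of p1, so the words it appended were never reflected in num_entry_words = p1 - tmp. A sketch of the bug class: encoders that return the advanced write position must keep updating the same pointer the length is later computed from (encoder names and payloads are invented):

#include <stdio.h>
#include <stdint.h>

/* Encoders append to the buffer and return the new write position. */
static uint32_t *encode_base(uint32_t *p, uint32_t ino)
{
        *p++ = ino;
        return p;
}

static uint32_t *encode_plus(uint32_t *p, uint32_t extra)
{
        *p++ = extra;          /* e.g. attribute / filehandle words */
        return p;
}

int main(void)
{
        uint32_t buf[8], *tmp = buf, *p1 = buf;

        p1 = encode_base(p1, 42);
        p1 = encode_plus(p1, 7);   /* the fix: keep advancing p1, not another
                                    * pointer, so the length below counts the
                                    * "plus" words too */

        printf("entry words = %ld\n", (long)(p1 - tmp));   /* 2 */
        return 0;
}
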
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 00388d2a3c9..5c01fc148ce 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -176,7 +176,7 @@ static const struct file_operations exports_operations = {
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); 176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); 177extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
178 178
179static struct file_operations pool_stats_operations = { 179static const struct file_operations pool_stats_operations = {
180 .open = nfsd_pool_stats_open, 180 .open = nfsd_pool_stats_open,
181 .read = seq_read, 181 .read = seq_read,
182 .llseek = seq_lseek, 182 .llseek = seq_lseek,
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 6a2711f4c32..84c25382f8e 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -36,6 +36,7 @@
36 36
37void nilfs_btnode_cache_init_once(struct address_space *btnc) 37void nilfs_btnode_cache_init_once(struct address_space *btnc)
38{ 38{
39 memset(btnc, 0, sizeof(*btnc));
39 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC); 40 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
40 spin_lock_init(&btnc->tree_lock); 41 spin_lock_init(&btnc->tree_lock);
41 INIT_LIST_HEAD(&btnc->private_list); 42 INIT_LIST_HEAD(&btnc->private_list);
@@ -86,6 +87,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
86 brelse(bh); 87 brelse(bh);
87 BUG(); 88 BUG();
88 } 89 }
90 memset(bh->b_data, 0, 1 << inode->i_blkbits);
89 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; 91 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
90 bh->b_blocknr = blocknr; 92 bh->b_blocknr = blocknr;
91 set_buffer_mapped(bh); 93 set_buffer_mapped(bh);
@@ -275,8 +277,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
275 "invalid oldkey %lld (newkey=%lld)", 277 "invalid oldkey %lld (newkey=%lld)",
276 (unsigned long long)oldkey, 278 (unsigned long long)oldkey,
277 (unsigned long long)newkey); 279 (unsigned long long)newkey);
278 if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage)) 280 nilfs_btnode_mark_dirty(obh);
279 BUG();
280 281
281 spin_lock_irq(&btnc->tree_lock); 282 spin_lock_irq(&btnc->tree_lock);
282 radix_tree_delete(&btnc->page_tree, oldkey); 283 radix_tree_delete(&btnc->page_tree, oldkey);
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 1c6cfb59128..3f5d5d06f53 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -871,7 +871,6 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
871 * exclusive with a new mount job. Though it doesn't cover 871 * exclusive with a new mount job. Though it doesn't cover
872 * umount, it's enough for the purpose. 872 * umount, it's enough for the purpose.
873 */ 873 */
874 mutex_lock(&nilfs->ns_mount_mutex);
875 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { 874 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
876 /* Current implementation does not have to protect 875 /* Current implementation does not have to protect
877 plain read-only mounts since they are exclusive 876 plain read-only mounts since they are exclusive
@@ -880,7 +879,6 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
880 ret = -EBUSY; 879 ret = -EBUSY;
881 } else 880 } else
882 ret = nilfs_cpfile_clear_snapshot(cpfile, cno); 881 ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
883 mutex_unlock(&nilfs->ns_mount_mutex);
884 return ret; 882 return ret;
885 case NILFS_SNAPSHOT: 883 case NILFS_SNAPSHOT:
886 return nilfs_cpfile_set_snapshot(cpfile, cno); 884 return nilfs_cpfile_set_snapshot(cpfile, cno);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 1a4fa04cf07..e097099bfc8 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -697,7 +697,7 @@ not_empty:
697 return 0; 697 return 0;
698} 698}
699 699
700struct file_operations nilfs_dir_operations = { 700const struct file_operations nilfs_dir_operations = {
701 .llseek = generic_file_llseek, 701 .llseek = generic_file_llseek,
702 .read = generic_read_dir, 702 .read = generic_read_dir,
703 .readdir = nilfs_readdir, 703 .readdir = nilfs_readdir,
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 7d7b4983dee..30292df443c 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -134,7 +134,7 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
134 * We have mostly NULL's here: the current defaults are ok for 134 * We have mostly NULL's here: the current defaults are ok for
135 * the nilfs filesystem. 135 * the nilfs filesystem.
136 */ 136 */
137struct file_operations nilfs_file_operations = { 137const struct file_operations nilfs_file_operations = {
138 .llseek = generic_file_llseek, 138 .llseek = generic_file_llseek,
139 .read = do_sync_read, 139 .read = do_sync_read,
140 .write = do_sync_write, 140 .write = do_sync_write,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2d2c501deb5..2a0a5a3ac13 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -400,6 +400,7 @@ int nilfs_read_inode_common(struct inode *inode,
400 ii->i_dir_acl = S_ISREG(inode->i_mode) ? 400 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
401 0 : le32_to_cpu(raw_inode->i_dir_acl); 401 0 : le32_to_cpu(raw_inode->i_dir_acl);
402#endif 402#endif
403 ii->i_dir_start_lookup = 0;
403 ii->i_cno = 0; 404 ii->i_cno = 0;
404 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 405 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
405 406
@@ -663,7 +664,6 @@ int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
663 int err; 664 int err;
664 665
665 spin_lock(&sbi->s_inode_lock); 666 spin_lock(&sbi->s_inode_lock);
666 /* Caller of this function MUST lock s_inode_lock */
667 if (ii->i_bh == NULL) { 667 if (ii->i_bh == NULL) {
668 spin_unlock(&sbi->s_inode_lock); 668 spin_unlock(&sbi->s_inode_lock);
669 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, 669 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino,
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 6572ea4bc4d..f6af76042d8 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -99,7 +99,8 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
99static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, 99static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
100 unsigned int cmd, void __user *argp) 100 unsigned int cmd, void __user *argp)
101{ 101{
102 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; 102 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
103 struct inode *cpfile = nilfs->ns_cpfile;
103 struct nilfs_transaction_info ti; 104 struct nilfs_transaction_info ti;
104 struct nilfs_cpmode cpmode; 105 struct nilfs_cpmode cpmode;
105 int ret; 106 int ret;
@@ -109,14 +110,17 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
109 if (copy_from_user(&cpmode, argp, sizeof(cpmode))) 110 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
110 return -EFAULT; 111 return -EFAULT;
111 112
113 mutex_lock(&nilfs->ns_mount_mutex);
112 nilfs_transaction_begin(inode->i_sb, &ti, 0); 114 nilfs_transaction_begin(inode->i_sb, &ti, 0);
113 ret = nilfs_cpfile_change_cpmode( 115 ret = nilfs_cpfile_change_cpmode(
114 cpfile, cpmode.cm_cno, cpmode.cm_mode); 116 cpfile, cpmode.cm_cno, cpmode.cm_mode);
115 if (unlikely(ret < 0)) { 117 if (unlikely(ret < 0)) {
116 nilfs_transaction_abort(inode->i_sb); 118 nilfs_transaction_abort(inode->i_sb);
119 mutex_unlock(&nilfs->ns_mount_mutex);
117 return ret; 120 return ret;
118 } 121 }
119 nilfs_transaction_commit(inode->i_sb); /* never fails */ 122 nilfs_transaction_commit(inode->i_sb); /* never fails */
123 mutex_unlock(&nilfs->ns_mount_mutex);
120 return ret; 124 return ret;
121} 125}
122 126
@@ -297,7 +301,18 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
297 (unsigned long long)vdesc->vd_vblocknr); 301 (unsigned long long)vdesc->vd_vblocknr);
298 return ret; 302 return ret;
299 } 303 }
300 bh->b_private = vdesc; 304 if (unlikely(!list_empty(&bh->b_assoc_buffers))) {
305 printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, "
306 "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n",
307 __func__, vdesc->vd_flags ? "node" : "data",
308 (unsigned long long)vdesc->vd_ino,
309 (unsigned long long)vdesc->vd_cno,
310 (unsigned long long)vdesc->vd_offset,
311 (unsigned long long)vdesc->vd_blocknr,
312 (unsigned long long)vdesc->vd_vblocknr);
313 brelse(bh);
314 return -EEXIST;
315 }
301 list_add_tail(&bh->b_assoc_buffers, buffers); 316 list_add_tail(&bh->b_assoc_buffers, buffers);
302 return 0; 317 return 0;
303} 318}
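
Conflicting submissions are now detected at queue time: a buffer whose b_assoc_buffers link is already on a list is rejected with -EEXIST, instead of stashing the descriptor in b_private and diagnosing the clash later. A self-contained sketch of that "non-empty link means already queued" check, using a minimal list implementation rather than the kernel's list.h:

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h)        { h->next = h->prev = h; }
static int  list_empty(const struct list_head *h) { return h->next == h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
}

struct buffer { struct list_head assoc; int blocknr; };

/* A buffer whose 'assoc' link is non-empty is already queued elsewhere;
 * treat a second submission as a conflict (-EEXIST). */
static int queue_buffer(struct buffer *b, struct list_head *q)
{
        if (!list_empty(&b->assoc))
                return -17;                 /* -EEXIST */
        list_add_tail(&b->assoc, q);
        return 0;
}

int main(void)
{
        struct list_head q;
        struct buffer b = { .blocknr = 9 };

        list_init(&q);
        list_init(&b.assoc);
        printf("%d %d\n", queue_buffer(&b, &q), queue_buffer(&b, &q)); /* 0 -17 */
        return 0;
}
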
@@ -335,24 +350,10 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
335 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { 350 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
336 ret = nilfs_gccache_wait_and_mark_dirty(bh); 351 ret = nilfs_gccache_wait_and_mark_dirty(bh);
337 if (unlikely(ret < 0)) { 352 if (unlikely(ret < 0)) {
338 if (ret == -EEXIST) { 353 WARN_ON(ret == -EEXIST);
339 vdesc = bh->b_private;
340 printk(KERN_CRIT
341 "%s: conflicting %s buffer: "
342 "ino=%llu, cno=%llu, offset=%llu, "
343 "blocknr=%llu, vblocknr=%llu\n",
344 __func__,
345 vdesc->vd_flags ? "node" : "data",
346 (unsigned long long)vdesc->vd_ino,
347 (unsigned long long)vdesc->vd_cno,
348 (unsigned long long)vdesc->vd_offset,
349 (unsigned long long)vdesc->vd_blocknr,
350 (unsigned long long)vdesc->vd_vblocknr);
351 }
352 goto failed; 354 goto failed;
353 } 355 }
354 list_del_init(&bh->b_assoc_buffers); 356 list_del_init(&bh->b_assoc_buffers);
355 bh->b_private = NULL;
356 brelse(bh); 357 brelse(bh);
357 } 358 }
358 return nmembs; 359 return nmembs;
@@ -360,7 +361,6 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
360 failed: 361 failed:
361 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { 362 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
362 list_del_init(&bh->b_assoc_buffers); 363 list_del_init(&bh->b_assoc_buffers);
363 bh->b_private = NULL;
364 brelse(bh); 364 brelse(bh);
365 } 365 }
366 return ret; 366 return ret;
@@ -471,7 +471,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
471 return 0; 471 return 0;
472 472
473 failed: 473 failed:
474 nilfs_remove_all_gcinode(nilfs);
475 printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n", 474 printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
476 msg, ret); 475 msg, ret);
477 return ret; 476 return ret;
@@ -560,6 +559,8 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
560 else 559 else
561 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); 560 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
562 561
562 if (ret < 0)
563 nilfs_remove_all_gcinode(nilfs);
563 clear_nilfs_gc_running(nilfs); 564 clear_nilfs_gc_running(nilfs);
564 565
565 out_free: 566 out_free:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index b18c4998f8d..f6326112d64 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -433,7 +433,7 @@ static const struct address_space_operations def_mdt_aops = {
433}; 433};
434 434
435static const struct inode_operations def_mdt_iops; 435static const struct inode_operations def_mdt_iops;
436static struct file_operations def_mdt_fops; 436static const struct file_operations def_mdt_fops;
437 437
438/* 438/*
439 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, 439 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index bad7368782d..4da6f67e9a9 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -294,9 +294,9 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *);
294/* 294/*
295 * Inodes and files operations 295 * Inodes and files operations
296 */ 296 */
297extern struct file_operations nilfs_dir_operations; 297extern const struct file_operations nilfs_dir_operations;
298extern const struct inode_operations nilfs_file_inode_operations; 298extern const struct inode_operations nilfs_file_inode_operations;
299extern struct file_operations nilfs_file_operations; 299extern const struct file_operations nilfs_file_operations;
300extern const struct address_space_operations nilfs_aops; 300extern const struct address_space_operations nilfs_aops;
301extern const struct inode_operations nilfs_dir_inode_operations; 301extern const struct inode_operations nilfs_dir_inode_operations;
302extern const struct inode_operations nilfs_special_inode_operations; 302extern const struct inode_operations nilfs_special_inode_operations;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 683df89dbae..6eff66a070d 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2468,17 +2468,22 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
2468 /* Clear requests (even when the construction failed) */ 2468 /* Clear requests (even when the construction failed) */
2469 spin_lock(&sci->sc_state_lock); 2469 spin_lock(&sci->sc_state_lock);
2470 2470
2471 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2472
2473 if (req->mode == SC_LSEG_SR) { 2471 if (req->mode == SC_LSEG_SR) {
2472 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2474 sci->sc_seq_done = req->seq_accepted; 2473 sci->sc_seq_done = req->seq_accepted;
2475 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); 2474 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err);
2476 sci->sc_flush_request = 0; 2475 sci->sc_flush_request = 0;
2477 } else if (req->mode == SC_FLUSH_FILE) 2476 } else {
2478 sci->sc_flush_request &= ~FLUSH_FILE_BIT; 2477 if (req->mode == SC_FLUSH_FILE)
2479 else if (req->mode == SC_FLUSH_DAT) 2478 sci->sc_flush_request &= ~FLUSH_FILE_BIT;
2480 sci->sc_flush_request &= ~FLUSH_DAT_BIT; 2479 else if (req->mode == SC_FLUSH_DAT)
2480 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2481 2481
2482 /* re-enable timer if checkpoint creation was not done */
2483 if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2484 time_before(jiffies, sci->sc_timer->expires))
2485 add_timer(sci->sc_timer);
2486 }
2482 spin_unlock(&sci->sc_state_lock); 2487 spin_unlock(&sci->sc_state_lock);
2483} 2488}
2484 2489
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 2224b4d07bf..44a88a9fa2c 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -124,10 +124,10 @@ int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
124 while (*s && len > 0) { 124 while (*s && len > 0) {
125 if (*s & 0x80) { 125 if (*s & 0x80) {
126 size = utf8_to_utf32(s, len, &u); 126 size = utf8_to_utf32(s, len, &u);
127 if (size < 0) { 127 if (size < 0)
128 /* Ignore character and move on */ 128 return -EINVAL;
129 size = 1; 129
130 } else if (u >= PLANE_SIZE) { 130 if (u >= PLANE_SIZE) {
131 u -= PLANE_SIZE; 131 u -= PLANE_SIZE;
132 *op++ = (wchar_t) (SURROGATE_PAIR | 132 *op++ = (wchar_t) (SURROGATE_PAIR |
133 ((u >> 10) & SURROGATE_BITS)); 133 ((u >> 10) & SURROGATE_BITS));
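
An invalid UTF-8 sequence now fails the whole conversion with -EINVAL instead of being silently skipped, while code points above the BMP are still emitted as UTF-16 surrogate pairs. A much-simplified userspace converter in the same spirit; the interface differs from the real utf8s_to_utf16s() and overlong-sequence validation is omitted to keep it short:

#include <stdio.h>
#include <stdint.h>

/* Convert NUL-terminated UTF-8 to UTF-16 code units; return the number of
 * units written, or -1 on malformed input (instead of skipping it). */
static int utf8_to_utf16(const unsigned char *s, uint16_t *out, int max)
{
        int n = 0;

        while (*s) {
                uint32_t u;
                int len;

                if (*s < 0x80)                { u = *s;        len = 1; }
                else if ((*s & 0xE0) == 0xC0) { u = *s & 0x1F; len = 2; }
                else if ((*s & 0xF0) == 0xE0) { u = *s & 0x0F; len = 3; }
                else if ((*s & 0xF8) == 0xF0) { u = *s & 0x07; len = 4; }
                else return -1;                       /* invalid lead byte   */

                for (int i = 1; i < len; i++) {
                        if ((s[i] & 0xC0) != 0x80)
                                return -1;            /* truncated/bad tail  */
                        u = (u << 6) | (s[i] & 0x3F);
                }
                s += len;

                if (u >= 0x10000) {                   /* needs a surrogate pair */
                        if (n + 2 > max)
                                return -1;
                        u -= 0x10000;
                        out[n++] = 0xD800 | (u >> 10);
                        out[n++] = 0xDC00 | (u & 0x3FF);
                } else {
                        if (n + 1 > max)
                                return -1;
                        out[n++] = (uint16_t)u;
                }
        }
        return n;
}

int main(void)
{
        uint16_t buf[8];
        int n = utf8_to_utf16((const unsigned char *)"\xF0\x9F\x98\x80", buf, 8);

        printf("%d units, first=0x%04X\n", n, buf[0]);   /* 2 units, 0xD83D */
        return 0;
}
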
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 828a889be90..7e54e52964d 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -91,6 +91,7 @@ static int dnotify_handle_event(struct fsnotify_group *group,
91 struct dnotify_struct *dn; 91 struct dnotify_struct *dn;
92 struct dnotify_struct **prev; 92 struct dnotify_struct **prev;
93 struct fown_struct *fown; 93 struct fown_struct *fown;
94 __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD;
94 95
95 to_tell = event->to_tell; 96 to_tell = event->to_tell;
96 97
@@ -106,7 +107,7 @@ static int dnotify_handle_event(struct fsnotify_group *group,
106 spin_lock(&entry->lock); 107 spin_lock(&entry->lock);
107 prev = &dnentry->dn; 108 prev = &dnentry->dn;
108 while ((dn = *prev) != NULL) { 109 while ((dn = *prev) != NULL) {
109 if ((dn->dn_mask & event->mask) == 0) { 110 if ((dn->dn_mask & test_mask) == 0) {
110 prev = &dn->dn_next; 111 prev = &dn->dn_next;
111 continue; 112 continue;
112 } 113 }
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index c8a07c65482..3165d85aada 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -324,11 +324,11 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
324 spin_lock(&group->mark_lock); 324 spin_lock(&group->mark_lock);
325 spin_lock(&inode->i_lock); 325 spin_lock(&inode->i_lock);
326 326
327 entry->group = group;
328 entry->inode = inode;
329
330 lentry = fsnotify_find_mark_entry(group, inode); 327 lentry = fsnotify_find_mark_entry(group, inode);
331 if (!lentry) { 328 if (!lentry) {
329 entry->group = group;
330 entry->inode = inode;
331
332 hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); 332 hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries);
333 list_add(&entry->g_list, &group->mark_entries); 333 list_add(&entry->g_list, &group->mark_entries);
334 334
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 3816d5750dd..b8bf53b4c10 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -143,7 +143,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
143 /* remember, after old was put on the wait_q we aren't 143 /* remember, after old was put on the wait_q we aren't
144 * allowed to look at the inode any more, only thing 144 * allowed to look at the inode any more, only thing
145 * left to check was if the file_name is the same */ 145 * left to check was if the file_name is the same */
146 if (old->name_len && 146 if (!old->name_len ||
147 !strcmp(old->file_name, new->file_name)) 147 !strcmp(old->file_name, new->file_name))
148 return true; 148 return true;
149 break; 149 break;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 09cc25d0461..c452d116b89 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -966,7 +966,7 @@ static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
966} 966}
967#endif /* CONFIG_DEBUG_FS */ 967#endif /* CONFIG_DEBUG_FS */
968 968
969static struct file_operations o2hb_debug_fops = { 969static const struct file_operations o2hb_debug_fops = {
970 .open = o2hb_debug_open, 970 .open = o2hb_debug_open,
971 .release = o2hb_debug_release, 971 .release = o2hb_debug_release,
972 .read = o2hb_debug_read, 972 .read = o2hb_debug_read,
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index cfb2be708ab..da794bc07a6 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -207,7 +207,7 @@ static int nst_fop_release(struct inode *inode, struct file *file)
207 return seq_release_private(inode, file); 207 return seq_release_private(inode, file);
208} 208}
209 209
210static struct file_operations nst_seq_fops = { 210static const struct file_operations nst_seq_fops = {
211 .open = nst_fop_open, 211 .open = nst_fop_open,
212 .read = seq_read, 212 .read = seq_read,
213 .llseek = seq_lseek, 213 .llseek = seq_lseek,
@@ -388,7 +388,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
388 return seq_release_private(inode, file); 388 return seq_release_private(inode, file);
389} 389}
390 390
391static struct file_operations sc_seq_fops = { 391static const struct file_operations sc_seq_fops = {
392 .open = sc_fop_open, 392 .open = sc_fop_open,
393 .read = seq_read, 393 .read = seq_read,
394 .llseek = seq_lseek, 394 .llseek = seq_lseek,
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index ca46002ec10..42b0bad7a61 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -478,7 +478,7 @@ bail:
478 return -ENOMEM; 478 return -ENOMEM;
479} 479}
480 480
481static struct file_operations debug_purgelist_fops = { 481static const struct file_operations debug_purgelist_fops = {
482 .open = debug_purgelist_open, 482 .open = debug_purgelist_open,
483 .release = debug_buffer_release, 483 .release = debug_buffer_release,
484 .read = debug_buffer_read, 484 .read = debug_buffer_read,
@@ -538,7 +538,7 @@ bail:
538 return -ENOMEM; 538 return -ENOMEM;
539} 539}
540 540
541static struct file_operations debug_mle_fops = { 541static const struct file_operations debug_mle_fops = {
542 .open = debug_mle_open, 542 .open = debug_mle_open,
543 .release = debug_buffer_release, 543 .release = debug_buffer_release,
544 .read = debug_buffer_read, 544 .read = debug_buffer_read,
@@ -741,7 +741,7 @@ static int debug_lockres_release(struct inode *inode, struct file *file)
741 return seq_release_private(inode, file); 741 return seq_release_private(inode, file);
742} 742}
743 743
744static struct file_operations debug_lockres_fops = { 744static const struct file_operations debug_lockres_fops = {
745 .open = debug_lockres_open, 745 .open = debug_lockres_open,
746 .release = debug_lockres_release, 746 .release = debug_lockres_release,
747 .read = seq_read, 747 .read = seq_read,
@@ -925,7 +925,7 @@ bail:
925 return -ENOMEM; 925 return -ENOMEM;
926} 926}
927 927
928static struct file_operations debug_state_fops = { 928static const struct file_operations debug_state_fops = {
929 .open = debug_state_open, 929 .open = debug_state_open,
930 .release = debug_buffer_release, 930 .release = debug_buffer_release,
931 .read = debug_buffer_read, 931 .read = debug_buffer_read,
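
Editor's note: the ocfs2 debugfs hunks above (and the omfs ones later in this diff) all apply the same pattern: a file_operations table that is never written after initialisation is declared const so it can live in read-only data. A small self-contained sketch of the pattern, using a stand-in ops struct rather than the kernel's file_operations.

#include <stdio.h>

struct ops {
        int (*open)(void);
        int (*release)(void);
};

static int my_open(void)    { return 0; }
static int my_release(void) { return 0; }

/* read-only after initialisation, so it can be const */
static const struct ops debug_ops = {
        .open    = my_open,
        .release = my_release,
};

int main(void)
{
        printf("%d\n", debug_ops.open());
        return 0;
}
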
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 89fc8ee1f5a..de059f49058 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1712,7 +1712,8 @@ int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos,
1712 struct super_block *sb = inode->i_sb; 1712 struct super_block *sb = inode->i_sb;
1713 1713
1714 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || 1714 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) ||
1715 !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) 1715 !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) ||
1716 OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1716 return 0; 1717 return 0;
1717 1718
1718 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1719 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index eae40460242..d963d863870 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -35,12 +35,7 @@
35#include <linux/kref.h> 35#include <linux/kref.h>
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/lockdep.h> 37#include <linux/lockdep.h>
38#ifndef CONFIG_OCFS2_COMPAT_JBD 38#include <linux/jbd2.h>
39# include <linux/jbd2.h>
40#else
41# include <linux/jbd.h>
42# include "ocfs2_jbd_compat.h"
43#endif
44 39
45/* For union ocfs2_dlm_lksb */ 40/* For union ocfs2_dlm_lksb */
46#include "stackglue.h" 41#include "stackglue.h"
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 60287fc56bc..3a0df7a1b81 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3743,6 +3743,9 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
3743 goto out; 3743 goto out;
3744 } 3744 }
3745 3745
3746 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
3747 goto attach_xattr;
3748
3746 ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh); 3749 ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
3747 3750
3748 size = i_size_read(inode); 3751 size = i_size_read(inode);
@@ -3769,6 +3772,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode,
3769 cpos += num_clusters; 3772 cpos += num_clusters;
3770 } 3773 }
3771 3774
3775attach_xattr:
3772 if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { 3776 if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
3773 ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh, 3777 ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
3774 &ref_tree->rf_ci, 3778 &ref_tree->rf_ci,
@@ -3858,6 +3862,49 @@ out:
3858 return ret; 3862 return ret;
3859} 3863}
3860 3864
3865static int ocfs2_duplicate_inline_data(struct inode *s_inode,
3866 struct buffer_head *s_bh,
3867 struct inode *t_inode,
3868 struct buffer_head *t_bh)
3869{
3870 int ret;
3871 handle_t *handle;
3872 struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
3873 struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
3874 struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
3875
3876 BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
3877
3878 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
3879 if (IS_ERR(handle)) {
3880 ret = PTR_ERR(handle);
3881 mlog_errno(ret);
3882 goto out;
3883 }
3884
3885 ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
3886 OCFS2_JOURNAL_ACCESS_WRITE);
3887 if (ret) {
3888 mlog_errno(ret);
3889 goto out_commit;
3890 }
3891
3892 t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
3893 memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
3894 le16_to_cpu(s_di->id2.i_data.id_count));
3895 spin_lock(&OCFS2_I(t_inode)->ip_lock);
3896 OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
3897 t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
3898 spin_unlock(&OCFS2_I(t_inode)->ip_lock);
3899
3900 ocfs2_journal_dirty(handle, t_bh);
3901
3902out_commit:
3903 ocfs2_commit_trans(osb, handle);
3904out:
3905 return ret;
3906}
3907
3861static int ocfs2_duplicate_extent_list(struct inode *s_inode, 3908static int ocfs2_duplicate_extent_list(struct inode *s_inode,
3862 struct inode *t_inode, 3909 struct inode *t_inode,
3863 struct buffer_head *t_bh, 3910 struct buffer_head *t_bh,
@@ -3997,6 +4044,14 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
3997 goto out; 4044 goto out;
3998 } 4045 }
3999 4046
4047 if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
4048 ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
4049 t_inode, t_bh);
4050 if (ret)
4051 mlog_errno(ret);
4052 goto out;
4053 }
4054
4000 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), 4055 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
4001 1, &ref_tree, &ref_root_bh); 4056 1, &ref_tree, &ref_root_bh);
4002 if (ret) { 4057 if (ret) {
@@ -4013,10 +4068,6 @@ static int ocfs2_create_reflink_node(struct inode *s_inode,
4013 goto out_unlock_refcount; 4068 goto out_unlock_refcount;
4014 } 4069 }
4015 4070
4016 ret = ocfs2_complete_reflink(s_inode, s_bh, t_inode, t_bh, preserve);
4017 if (ret)
4018 mlog_errno(ret);
4019
4020out_unlock_refcount: 4071out_unlock_refcount:
4021 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 4072 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
4022 brelse(ref_root_bh); 4073 brelse(ref_root_bh);
@@ -4068,9 +4119,17 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
4068 ret = ocfs2_reflink_xattrs(inode, old_bh, 4119 ret = ocfs2_reflink_xattrs(inode, old_bh,
4069 new_inode, new_bh, 4120 new_inode, new_bh,
4070 preserve); 4121 preserve);
4071 if (ret) 4122 if (ret) {
4072 mlog_errno(ret); 4123 mlog_errno(ret);
4124 goto inode_unlock;
4125 }
4073 } 4126 }
4127
4128 ret = ocfs2_complete_reflink(inode, old_bh,
4129 new_inode, new_bh, preserve);
4130 if (ret)
4131 mlog_errno(ret);
4132
4074inode_unlock: 4133inode_unlock:
4075 ocfs2_inode_unlock(new_inode, 1); 4134 ocfs2_inode_unlock(new_inode, 1);
4076 brelse(new_bh); 4135 brelse(new_bh);
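
Editor's note: ocfs2_duplicate_inline_data(), added above, copies the source inode's inline payload straight into the target dinode and sets OCFS2_INLINE_DATA_FL, so reflinking an inline-data file never walks the extent list. A rough userspace sketch of that copy step only; the structures are simplified stand-ins and journalling, locking and endian handling are omitted.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define INLINE_DATA_FL 0x0001          /* stand-in for OCFS2_INLINE_DATA_FL */

struct dinode {
        uint16_t id_count;             /* bytes of inline data in use */
        uint16_t dyn_features;
        uint8_t  id_data[64];
};

static void duplicate_inline_data(const struct dinode *s, struct dinode *t)
{
        t->id_count = s->id_count;
        memcpy(t->id_data, s->id_data, s->id_count);
        t->dyn_features |= INLINE_DATA_FL;
}

int main(void)
{
        struct dinode src = { .id_count = 5, .dyn_features = INLINE_DATA_FL,
                              .id_data = "hello" };
        struct dinode dst = { 0 };

        duplicate_inline_data(&src, &dst);
        printf("%u %s\n", (unsigned)dst.id_count, (char *)dst.id_data);
        return 0;
}
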
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4cc3c890a2c..14f47d2bfe0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -373,7 +373,7 @@ static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
373} 373}
374#endif /* CONFIG_DEBUG_FS */ 374#endif /* CONFIG_DEBUG_FS */
375 375
376static struct file_operations ocfs2_osb_debug_fops = { 376static const struct file_operations ocfs2_osb_debug_fops = {
377 .open = ocfs2_osb_debug_open, 377 .open = ocfs2_osb_debug_open,
378 .release = ocfs2_debug_release, 378 .release = ocfs2_debug_release,
379 .read = ocfs2_debug_read, 379 .read = ocfs2_debug_read,
@@ -773,18 +773,20 @@ static int ocfs2_sb_probe(struct super_block *sb,
773 if (tmpstat < 0) { 773 if (tmpstat < 0) {
774 status = tmpstat; 774 status = tmpstat;
775 mlog_errno(status); 775 mlog_errno(status);
776 goto bail; 776 break;
777 } 777 }
778 di = (struct ocfs2_dinode *) (*bh)->b_data; 778 di = (struct ocfs2_dinode *) (*bh)->b_data;
779 memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); 779 memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
780 spin_lock_init(&stats->b_lock); 780 spin_lock_init(&stats->b_lock);
781 status = ocfs2_verify_volume(di, *bh, blksize, stats); 781 tmpstat = ocfs2_verify_volume(di, *bh, blksize, stats);
782 if (status >= 0) 782 if (tmpstat < 0) {
783 goto bail; 783 brelse(*bh);
784 brelse(*bh); 784 *bh = NULL;
785 *bh = NULL; 785 }
786 if (status != -EAGAIN) 786 if (tmpstat != -EAGAIN) {
787 status = tmpstat;
787 break; 788 break;
789 }
788 } 790 }
789 791
790bail: 792bail:
@@ -1645,6 +1647,10 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
1645 buf->f_bavail = buf->f_bfree; 1647 buf->f_bavail = buf->f_bfree;
1646 buf->f_files = numbits; 1648 buf->f_files = numbits;
1647 buf->f_ffree = freebits; 1649 buf->f_ffree = freebits;
1650 buf->f_fsid.val[0] = crc32_le(0, osb->uuid_str, OCFS2_VOL_UUID_LEN)
1651 & 0xFFFFFFFFUL;
1652 buf->f_fsid.val[1] = crc32_le(0, osb->uuid_str + OCFS2_VOL_UUID_LEN,
1653 OCFS2_VOL_UUID_LEN) & 0xFFFFFFFFUL;
1648 1654
1649 brelse(bh); 1655 brelse(bh);
1650 1656
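
Editor's note: the statfs addition above derives a stable f_fsid by hashing the two 16-character halves of the volume UUID string with crc32_le(). A userspace approximation, using zlib's crc32() as a stand-in for the kernel helper (link with -lz); the UUID value here is made up.

#include <stdio.h>
#include <zlib.h>

#define VOL_UUID_LEN 16   /* stand-in for OCFS2_VOL_UUID_LEN */

int main(void)
{
        const unsigned char uuid_str[] = "0123456789ABCDEF0123456789ABCDEF";
        unsigned long val0 = crc32(0L, uuid_str, VOL_UUID_LEN) & 0xFFFFFFFFUL;
        unsigned long val1 = crc32(0L, uuid_str + VOL_UUID_LEN, VOL_UUID_LEN) & 0xFFFFFFFFUL;

        printf("f_fsid = %08lx:%08lx\n", val0, val1);
        return 0;
}
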
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index b6284f235d2..c61369342a2 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -53,11 +53,6 @@
53#include <linux/highmem.h> 53#include <linux/highmem.h>
54#include <linux/buffer_head.h> 54#include <linux/buffer_head.h>
55#include <linux/rbtree.h> 55#include <linux/rbtree.h>
56#ifndef CONFIG_OCFS2_COMPAT_JBD
57# include <linux/jbd2.h>
58#else
59# include <linux/jbd.h>
60#endif
61 56
62#define MLOG_MASK_PREFIX ML_UPTODATE 57#define MLOG_MASK_PREFIX ML_UPTODATE
63 58
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 3680bae335b..b42d6241903 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -498,7 +498,7 @@ const struct inode_operations omfs_dir_inops = {
498 .rmdir = omfs_rmdir, 498 .rmdir = omfs_rmdir,
499}; 499};
500 500
501struct file_operations omfs_dir_operations = { 501const struct file_operations omfs_dir_operations = {
502 .read = generic_read_dir, 502 .read = generic_read_dir,
503 .readdir = omfs_readdir, 503 .readdir = omfs_readdir,
504 .llseek = generic_file_llseek, 504 .llseek = generic_file_llseek,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 4845fbb18e6..399487c0936 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -322,7 +322,7 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
322 return generic_block_bmap(mapping, block, omfs_get_block); 322 return generic_block_bmap(mapping, block, omfs_get_block);
323} 323}
324 324
325struct file_operations omfs_file_operations = { 325const struct file_operations omfs_file_operations = {
326 .llseek = generic_file_llseek, 326 .llseek = generic_file_llseek,
327 .read = do_sync_read, 327 .read = do_sync_read,
328 .write = do_sync_write, 328 .write = do_sync_write,
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index df71039945a..ebe2fdbe535 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -44,14 +44,14 @@ extern int omfs_allocate_range(struct super_block *sb, int min_request,
44extern int omfs_clear_range(struct super_block *sb, u64 block, int count); 44extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
45 45
46/* dir.c */ 46/* dir.c */
47extern struct file_operations omfs_dir_operations; 47extern const struct file_operations omfs_dir_operations;
48extern const struct inode_operations omfs_dir_inops; 48extern const struct inode_operations omfs_dir_inops;
49extern int omfs_make_empty(struct inode *inode, struct super_block *sb); 49extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
50extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, 50extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
51 u64 fsblock); 51 u64 fsblock);
52 52
53/* file.c */ 53/* file.c */
54extern struct file_operations omfs_file_operations; 54extern const struct file_operations omfs_file_operations;
55extern const struct inode_operations omfs_file_inops; 55extern const struct inode_operations omfs_file_inops;
56extern const struct address_space_operations omfs_aops; 56extern const struct address_space_operations omfs_aops;
57extern void omfs_make_empty_table(struct buffer_head *bh, int offset); 57extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
diff --git a/fs/pipe.c b/fs/pipe.c
index 52c41511483..ae17d026aaa 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -777,36 +777,55 @@ pipe_rdwr_release(struct inode *inode, struct file *filp)
777static int 777static int
778pipe_read_open(struct inode *inode, struct file *filp) 778pipe_read_open(struct inode *inode, struct file *filp)
779{ 779{
780 /* We could have perhaps used atomic_t, but this and friends 780 int ret = -ENOENT;
781 below are the only places. So it doesn't seem worthwhile. */ 781
782 mutex_lock(&inode->i_mutex); 782 mutex_lock(&inode->i_mutex);
783 inode->i_pipe->readers++; 783
784 if (inode->i_pipe) {
785 ret = 0;
786 inode->i_pipe->readers++;
787 }
788
784 mutex_unlock(&inode->i_mutex); 789 mutex_unlock(&inode->i_mutex);
785 790
786 return 0; 791 return ret;
787} 792}
788 793
789static int 794static int
790pipe_write_open(struct inode *inode, struct file *filp) 795pipe_write_open(struct inode *inode, struct file *filp)
791{ 796{
797 int ret = -ENOENT;
798
792 mutex_lock(&inode->i_mutex); 799 mutex_lock(&inode->i_mutex);
793 inode->i_pipe->writers++; 800
801 if (inode->i_pipe) {
802 ret = 0;
803 inode->i_pipe->writers++;
804 }
805
794 mutex_unlock(&inode->i_mutex); 806 mutex_unlock(&inode->i_mutex);
795 807
796 return 0; 808 return ret;
797} 809}
798 810
799static int 811static int
800pipe_rdwr_open(struct inode *inode, struct file *filp) 812pipe_rdwr_open(struct inode *inode, struct file *filp)
801{ 813{
814 int ret = -ENOENT;
815
802 mutex_lock(&inode->i_mutex); 816 mutex_lock(&inode->i_mutex);
803 if (filp->f_mode & FMODE_READ) 817
804 inode->i_pipe->readers++; 818 if (inode->i_pipe) {
805 if (filp->f_mode & FMODE_WRITE) 819 ret = 0;
806 inode->i_pipe->writers++; 820 if (filp->f_mode & FMODE_READ)
821 inode->i_pipe->readers++;
822 if (filp->f_mode & FMODE_WRITE)
823 inode->i_pipe->writers++;
824 }
825
807 mutex_unlock(&inode->i_mutex); 826 mutex_unlock(&inode->i_mutex);
808 827
809 return 0; 828 return ret;
810} 829}
811 830
812/* 831/*
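
Editor's note: all three pipe open paths above now re-check inode->i_pipe while holding i_mutex and return -ENOENT if the pipe has already been torn down, instead of bumping a counter through a possibly stale pointer. A loose userspace analogue of the check-under-lock pattern with a pthread mutex (build with -pthread); the names are stand-ins, not the kernel's.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct pipe_state { int readers; };

struct node {
        pthread_mutex_t lock;
        struct pipe_state *pipe;        /* may be torn down and set to NULL */
};

static int pipe_read_open(struct node *n)
{
        int ret = -ENOENT;

        pthread_mutex_lock(&n->lock);
        if (n->pipe) {                  /* still there: safe to take a reference */
                ret = 0;
                n->pipe->readers++;
        }
        pthread_mutex_unlock(&n->lock);
        return ret;
}

int main(void)
{
        struct pipe_state ps = { 0 };
        struct node n = { PTHREAD_MUTEX_INITIALIZER, &ps };

        printf("%d\n", pipe_read_open(&n));     /* 0 */
        n.pipe = NULL;
        printf("%d\n", pipe_read_open(&n));     /* -2 (ENOENT) */
        return 0;
}
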
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 07f77a7945c..822c2d50651 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -571,7 +571,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
571 rsslim, 571 rsslim,
572 mm ? mm->start_code : 0, 572 mm ? mm->start_code : 0,
573 mm ? mm->end_code : 0, 573 mm ? mm->end_code : 0,
574 (permitted) ? task->stack_start : 0, 574 (permitted && mm) ? task->stack_start : 0,
575 esp, 575 esp,
576 eip, 576 eip,
577 /* The signal information here is obsolete. 577 /* The signal information here is obsolete.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 837469a9659..af643b5aefe 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2597,8 +2597,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
2597 name.len = snprintf(buf, sizeof(buf), "%d", pid); 2597 name.len = snprintf(buf, sizeof(buf), "%d", pid);
2598 dentry = d_hash_and_lookup(mnt->mnt_root, &name); 2598 dentry = d_hash_and_lookup(mnt->mnt_root, &name);
2599 if (dentry) { 2599 if (dentry) {
2600 if (!(current->flags & PF_EXITING)) 2600 shrink_dcache_parent(dentry);
2601 shrink_dcache_parent(dentry);
2602 d_drop(dentry); 2601 d_drop(dentry);
2603 dput(dentry); 2602 dput(dentry);
2604 } 2603 }
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 56013371f9f..a44a7897fd4 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -23,7 +23,6 @@
23#include <asm/io.h> 23#include <asm/io.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/ioport.h> 25#include <linux/ioport.h>
26#include <linux/mm.h>
27#include <linux/memory.h> 26#include <linux/memory.h>
28#include <asm/sections.h> 27#include <asm/sections.h>
29 28
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index c7bff4f603f..a65239cfd97 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -99,7 +99,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
99 "VmallocUsed: %8lu kB\n" 99 "VmallocUsed: %8lu kB\n"
100 "VmallocChunk: %8lu kB\n" 100 "VmallocChunk: %8lu kB\n"
101#ifdef CONFIG_MEMORY_FAILURE 101#ifdef CONFIG_MEMORY_FAILURE
102 "HardwareCorrupted: %8lu kB\n" 102 "HardwareCorrupted: %5lu kB\n"
103#endif 103#endif
104 , 104 ,
105 K(i.totalram), 105 K(i.totalram),
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2281c2cbfe2..5033ce0d254 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -94,6 +94,7 @@ static const struct file_operations proc_kpagecount_operations = {
94#define KPF_COMPOUND_TAIL 16 94#define KPF_COMPOUND_TAIL 16
95#define KPF_HUGE 17 95#define KPF_HUGE 17
96#define KPF_UNEVICTABLE 18 96#define KPF_UNEVICTABLE 18
97#define KPF_HWPOISON 19
97#define KPF_NOPAGE 20 98#define KPF_NOPAGE 20
98 99
99#define KPF_KSM 21 100#define KPF_KSM 21
@@ -180,6 +181,10 @@ static u64 get_uflags(struct page *page)
180 u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); 181 u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
181 u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); 182 u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked);
182 183
184#ifdef CONFIG_MEMORY_FAILURE
185 u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
186#endif
187
183#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR 188#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
184 u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); 189 u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
185#endif 190#endif
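
Editor's note: the /proc/kpageflags change above exports a new KPF_HWPOISON bit (19) by copying the PG_hwpoison page flag when CONFIG_MEMORY_FAILURE is set. The helper it relies on simply relocates one bit; a tiny sketch modelled on kpf_copy_bit(), with a made-up source bit position.

#include <stdint.h>
#include <stdio.h>

static inline uint64_t kpf_copy_bit(uint64_t k, int kpf, int pg)
{
        return ((k >> pg) & 1) << kpf;
}

int main(void)
{
        uint64_t kflags = 1ull << 7;                 /* pretend PG_hwpoison is bit 7 */
        uint64_t u = kpf_copy_bit(kflags, 19, 7);    /* KPF_HWPOISON == 19 */

        printf("%llx\n", (unsigned long long)u);     /* 80000 */
        return 0;
}
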
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
index b3208adf8e7..71e2b4d50a0 100644
--- a/fs/romfs/storage.c
+++ b/fs/romfs/storage.c
@@ -253,11 +253,11 @@ ssize_t romfs_dev_strnlen(struct super_block *sb,
253 253
254#ifdef CONFIG_ROMFS_ON_MTD 254#ifdef CONFIG_ROMFS_ON_MTD
255 if (sb->s_mtd) 255 if (sb->s_mtd)
256 return romfs_mtd_strnlen(sb, pos, limit); 256 return romfs_mtd_strnlen(sb, pos, maxlen);
257#endif 257#endif
258#ifdef CONFIG_ROMFS_ON_BLOCK 258#ifdef CONFIG_ROMFS_ON_BLOCK
259 if (sb->s_bdev) 259 if (sb->s_bdev)
260 return romfs_blk_strnlen(sb, pos, limit); 260 return romfs_blk_strnlen(sb, pos, maxlen);
261#endif 261#endif
262 return -EIO; 262 return -EIO;
263} 263}
diff --git a/fs/select.c b/fs/select.c
index a201fc37022..fd38ce2e32e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/sched.h>
18#include <linux/syscalls.h> 19#include <linux/syscalls.h>
19#include <linux/module.h> 20#include <linux/module.h>
20#include <linux/slab.h> 21#include <linux/slab.h>
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 0050fc40e8c..e0201837d24 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -21,6 +21,7 @@
21#include <linux/completion.h> 21#include <linux/completion.h>
22#include <linux/mutex.h> 22#include <linux/mutex.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/security.h>
24#include "sysfs.h" 25#include "sysfs.h"
25 26
26DEFINE_MUTEX(sysfs_mutex); 27DEFINE_MUTEX(sysfs_mutex);
@@ -285,6 +286,9 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
285 sysfs_put(sd->s_symlink.target_sd); 286 sysfs_put(sd->s_symlink.target_sd);
286 if (sysfs_type(sd) & SYSFS_COPY_NAME) 287 if (sysfs_type(sd) & SYSFS_COPY_NAME)
287 kfree(sd->s_name); 288 kfree(sd->s_name);
289 if (sd->s_iattr && sd->s_iattr->ia_secdata)
290 security_release_secctx(sd->s_iattr->ia_secdata,
291 sd->s_iattr->ia_secdata_len);
288 kfree(sd->s_iattr); 292 kfree(sd->s_iattr);
289 sysfs_free_ino(sd->s_ino); 293 sysfs_free_ino(sd->s_ino);
290 kmem_cache_free(sysfs_dir_cachep, sd); 294 kmem_cache_free(sysfs_dir_cachep, sd);
@@ -894,7 +898,8 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
894 898
895 mutex_lock(&sysfs_rename_mutex); 899 mutex_lock(&sysfs_rename_mutex);
896 BUG_ON(!sd->s_parent); 900 BUG_ON(!sd->s_parent);
897 new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; 901 new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ?
902 new_parent_kobj->sd : &sysfs_root;
898 903
899 error = 0; 904 error = 0;
900 if (sd->s_parent == new_parent_sd) 905 if (sd->s_parent == new_parent_sd)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 561a9c050ce..f5ea4680f15 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -268,7 +268,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
268 struct sysfs_open_dirent *od, *new_od = NULL; 268 struct sysfs_open_dirent *od, *new_od = NULL;
269 269
270 retry: 270 retry:
271 spin_lock(&sysfs_open_dirent_lock); 271 spin_lock_irq(&sysfs_open_dirent_lock);
272 272
273 if (!sd->s_attr.open && new_od) { 273 if (!sd->s_attr.open && new_od) {
274 sd->s_attr.open = new_od; 274 sd->s_attr.open = new_od;
@@ -281,7 +281,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
281 list_add_tail(&buffer->list, &od->buffers); 281 list_add_tail(&buffer->list, &od->buffers);
282 } 282 }
283 283
284 spin_unlock(&sysfs_open_dirent_lock); 284 spin_unlock_irq(&sysfs_open_dirent_lock);
285 285
286 if (od) { 286 if (od) {
287 kfree(new_od); 287 kfree(new_od);
@@ -315,8 +315,9 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
315 struct sysfs_buffer *buffer) 315 struct sysfs_buffer *buffer)
316{ 316{
317 struct sysfs_open_dirent *od = sd->s_attr.open; 317 struct sysfs_open_dirent *od = sd->s_attr.open;
318 unsigned long flags;
318 319
319 spin_lock(&sysfs_open_dirent_lock); 320 spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
320 321
321 list_del(&buffer->list); 322 list_del(&buffer->list);
322 if (atomic_dec_and_test(&od->refcnt)) 323 if (atomic_dec_and_test(&od->refcnt))
@@ -324,7 +325,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
324 else 325 else
325 od = NULL; 326 od = NULL;
326 327
327 spin_unlock(&sysfs_open_dirent_lock); 328 spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
328 329
329 kfree(od); 330 kfree(od);
330} 331}
@@ -456,8 +457,9 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
456void sysfs_notify_dirent(struct sysfs_dirent *sd) 457void sysfs_notify_dirent(struct sysfs_dirent *sd)
457{ 458{
458 struct sysfs_open_dirent *od; 459 struct sysfs_open_dirent *od;
460 unsigned long flags;
459 461
460 spin_lock(&sysfs_open_dirent_lock); 462 spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
461 463
462 od = sd->s_attr.open; 464 od = sd->s_attr.open;
463 if (od) { 465 if (od) {
@@ -465,7 +467,7 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd)
465 wake_up_interruptible(&od->poll); 467 wake_up_interruptible(&od->poll);
466 } 468 }
467 469
468 spin_unlock(&sysfs_open_dirent_lock); 470 spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
469} 471}
470EXPORT_SYMBOL_GPL(sysfs_notify_dirent); 472EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
471 473
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 381854461b2..c2e30eea74d 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -186,19 +186,37 @@ xfs_destroy_ioend(
186} 186}
187 187
188/* 188/*
189 * If the end of the current ioend is beyond the current EOF,
190 * return the new EOF value, otherwise zero.
191 */
192STATIC xfs_fsize_t
193xfs_ioend_new_eof(
194 xfs_ioend_t *ioend)
195{
196 xfs_inode_t *ip = XFS_I(ioend->io_inode);
197 xfs_fsize_t isize;
198 xfs_fsize_t bsize;
199
200 bsize = ioend->io_offset + ioend->io_size;
201 isize = MAX(ip->i_size, ip->i_new_size);
202 isize = MIN(isize, bsize);
203 return isize > ip->i_d.di_size ? isize : 0;
204}
205
206/*
189 * Update on-disk file size now that data has been written to disk. 207 * Update on-disk file size now that data has been written to disk.
190 * The current in-memory file size is i_size. If a write is beyond 208 * The current in-memory file size is i_size. If a write is beyond
191 * eof i_new_size will be the intended file size until i_size is 209 * eof i_new_size will be the intended file size until i_size is
192 * updated. If this write does not extend all the way to the valid 210 * updated. If this write does not extend all the way to the valid
193 * file size then restrict this update to the end of the write. 211 * file size then restrict this update to the end of the write.
194 */ 212 */
213
195STATIC void 214STATIC void
196xfs_setfilesize( 215xfs_setfilesize(
197 xfs_ioend_t *ioend) 216 xfs_ioend_t *ioend)
198{ 217{
199 xfs_inode_t *ip = XFS_I(ioend->io_inode); 218 xfs_inode_t *ip = XFS_I(ioend->io_inode);
200 xfs_fsize_t isize; 219 xfs_fsize_t isize;
201 xfs_fsize_t bsize;
202 220
203 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 221 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
204 ASSERT(ioend->io_type != IOMAP_READ); 222 ASSERT(ioend->io_type != IOMAP_READ);
@@ -206,16 +224,10 @@ xfs_setfilesize(
206 if (unlikely(ioend->io_error)) 224 if (unlikely(ioend->io_error))
207 return; 225 return;
208 226
209 bsize = ioend->io_offset + ioend->io_size;
210
211 xfs_ilock(ip, XFS_ILOCK_EXCL); 227 xfs_ilock(ip, XFS_ILOCK_EXCL);
212 228 isize = xfs_ioend_new_eof(ioend);
213 isize = MAX(ip->i_size, ip->i_new_size); 229 if (isize) {
214 isize = MIN(isize, bsize);
215
216 if (ip->i_d.di_size < isize) {
217 ip->i_d.di_size = isize; 230 ip->i_d.di_size = isize;
218 ip->i_update_core = 1;
219 xfs_mark_inode_dirty_sync(ip); 231 xfs_mark_inode_dirty_sync(ip);
220 } 232 }
221 233
@@ -404,10 +416,16 @@ xfs_submit_ioend_bio(
404 struct bio *bio) 416 struct bio *bio)
405{ 417{
406 atomic_inc(&ioend->io_remaining); 418 atomic_inc(&ioend->io_remaining);
407
408 bio->bi_private = ioend; 419 bio->bi_private = ioend;
409 bio->bi_end_io = xfs_end_bio; 420 bio->bi_end_io = xfs_end_bio;
410 421
422 /*
423 * If the I/O is beyond EOF we mark the inode dirty immediately
424 * but don't update the inode size until I/O completion.
425 */
426 if (xfs_ioend_new_eof(ioend))
427 xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode));
428
411 submit_bio(WRITE, bio); 429 submit_bio(WRITE, bio);
412 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); 430 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
413 bio_put(bio); 431 bio_put(bio);
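
Editor's note: the new xfs_ioend_new_eof() helper above computes the candidate on-disk size as the end of the just-completed I/O, clamped to the largest in-memory size, and reports it only when it actually extends the current on-disk size. A standalone sketch of that arithmetic with stand-in field names.

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

static int64_t ioend_new_eof(int64_t i_size, int64_t i_new_size,
                             int64_t di_size, int64_t io_offset, int64_t io_size)
{
        int64_t bsize = io_offset + io_size;            /* end of this I/O */
        int64_t isize = MIN(MAX(i_size, i_new_size), bsize);

        return isize > di_size ? isize : 0;             /* 0: nothing to update */
}

int main(void)
{
        /* 4k write at offset 60k, in-memory size 64k, on-disk size still 32k:
         * the on-disk size should move to 64k */
        printf("%lld\n", (long long)ioend_new_eof(65536, 0, 32768, 61440, 4096));
        /* write entirely below the on-disk size: nothing to update */
        printf("%lld\n", (long long)ioend_new_eof(65536, 0, 65536, 0, 4096));
        return 0;
}
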
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 629370974e5..eff61e2732a 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -176,14 +176,7 @@ xfs_file_fsync(
176 struct dentry *dentry, 176 struct dentry *dentry,
177 int datasync) 177 int datasync)
178{ 178{
179 struct inode *inode = dentry->d_inode; 179 struct xfs_inode *ip = XFS_I(dentry->d_inode);
180 struct xfs_inode *ip = XFS_I(inode);
181 int error;
182
183 /* capture size updates in I/O completion before writing the inode. */
184 error = filemap_fdatawait(inode->i_mapping);
185 if (error)
186 return error;
187 180
188 xfs_iflags_clear(ip, XFS_ITRUNCATED); 181 xfs_iflags_clear(ip, XFS_ITRUNCATED);
189 return -xfs_fsync(ip); 182 return -xfs_fsync(ip);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index da0159d99f8..cd42ef78f6b 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -57,19 +57,22 @@
57#include <linux/fiemap.h> 57#include <linux/fiemap.h>
58 58
59/* 59/*
60 * Bring the atime in the XFS inode uptodate. 60 * Bring the timestamps in the XFS inode uptodate.
61 * Used before logging the inode to disk or when the Linux inode goes away. 61 *
62 * Used before writing the inode to disk.
62 */ 63 */
63void 64void
64xfs_synchronize_atime( 65xfs_synchronize_times(
65 xfs_inode_t *ip) 66 xfs_inode_t *ip)
66{ 67{
67 struct inode *inode = VFS_I(ip); 68 struct inode *inode = VFS_I(ip);
68 69
69 if (!(inode->i_state & I_CLEAR)) { 70 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
70 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; 71 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
71 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; 72 ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
72 } 73 ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
74 ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
75 ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
73} 76}
74 77
75/* 78/*
@@ -106,32 +109,20 @@ xfs_ichgtime(
106 if ((flags & XFS_ICHGTIME_MOD) && 109 if ((flags & XFS_ICHGTIME_MOD) &&
107 !timespec_equal(&inode->i_mtime, &tv)) { 110 !timespec_equal(&inode->i_mtime, &tv)) {
108 inode->i_mtime = tv; 111 inode->i_mtime = tv;
109 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
110 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
111 sync_it = 1; 112 sync_it = 1;
112 } 113 }
113 if ((flags & XFS_ICHGTIME_CHG) && 114 if ((flags & XFS_ICHGTIME_CHG) &&
114 !timespec_equal(&inode->i_ctime, &tv)) { 115 !timespec_equal(&inode->i_ctime, &tv)) {
115 inode->i_ctime = tv; 116 inode->i_ctime = tv;
116 ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
117 ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
118 sync_it = 1; 117 sync_it = 1;
119 } 118 }
120 119
121 /* 120 /*
122 * We update the i_update_core field _after_ changing 121 * Update complete - now make sure everyone knows that the inode
123 * the timestamps in order to coordinate properly with 122 * is dirty.
124 * xfs_iflush() so that we don't lose timestamp updates.
125 * This keeps us from having to hold the inode lock
126 * while doing this. We use the SYNCHRONIZE macro to
127 * ensure that the compiler does not reorder the update
128 * of i_update_core above the timestamp updates above.
129 */ 123 */
130 if (sync_it) { 124 if (sync_it)
131 SYNCHRONIZE();
132 ip->i_update_core = 1;
133 xfs_mark_inode_dirty_sync(ip); 125 xfs_mark_inode_dirty_sync(ip);
134 }
135} 126}
136 127
137/* 128/*
@@ -506,10 +497,8 @@ xfs_vn_getattr(
506 stat->gid = ip->i_d.di_gid; 497 stat->gid = ip->i_d.di_gid;
507 stat->ino = ip->i_ino; 498 stat->ino = ip->i_ino;
508 stat->atime = inode->i_atime; 499 stat->atime = inode->i_atime;
509 stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec; 500 stat->mtime = inode->i_mtime;
510 stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; 501 stat->ctime = inode->i_ctime;
511 stat->ctime.tv_sec = ip->i_d.di_ctime.t_sec;
512 stat->ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
513 stat->blocks = 502 stat->blocks =
514 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); 503 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
515 504
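
Editor's note: xfs_synchronize_atime() becomes xfs_synchronize_times() above: before the inode core is written, all three VFS timestamps are copied back into the XFS on-disk fields, since xfs_ichgtime() no longer mirrors mtime/ctime at update time. A simplified sketch of that copy, with stand-in structures in place of the kernel's inode and icdinode.

#include <stdint.h>
#include <stdio.h>

struct ts        { int64_t tv_sec; int64_t tv_nsec; };
struct vfs_inode { struct ts i_atime, i_mtime, i_ctime; };
struct icdinode  { struct { int32_t t_sec, t_nsec; } di_atime, di_mtime, di_ctime; };

static void synchronize_times(const struct vfs_inode *inode, struct icdinode *d)
{
        d->di_atime.t_sec  = (int32_t)inode->i_atime.tv_sec;
        d->di_atime.t_nsec = (int32_t)inode->i_atime.tv_nsec;
        d->di_mtime.t_sec  = (int32_t)inode->i_mtime.tv_sec;
        d->di_mtime.t_nsec = (int32_t)inode->i_mtime.tv_nsec;
        d->di_ctime.t_sec  = (int32_t)inode->i_ctime.tv_sec;
        d->di_ctime.t_nsec = (int32_t)inode->i_ctime.tv_nsec;
}

int main(void)
{
        struct vfs_inode in = { { 100, 1 }, { 200, 2 }, { 300, 3 } };
        struct icdinode  dc = { { 0, 0 }, { 0, 0 }, { 0, 0 } };

        synchronize_times(&in, &dc);
        printf("%d %d %d\n", dc.di_atime.t_sec, dc.di_mtime.t_sec, dc.di_ctime.t_sec);
        return 0;
}
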
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 49e4a6aea73..072050f8d34 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -667,7 +667,7 @@ start:
667 xip->i_new_size = new_size; 667 xip->i_new_size = new_size;
668 668
669 if (likely(!(ioflags & IO_INVIS))) 669 if (likely(!(ioflags & IO_INVIS)))
670 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 670 file_update_time(file);
671 671
672 /* 672 /*
673 * If the offset is beyond the size of the file, we have a couple 673 * If the offset is beyond the size of the file, we have a couple
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 9e41f91aa26..3d4a0c84d63 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -80,7 +80,7 @@ xfs_fs_set_xstate(
80 80
81 if (sb->s_flags & MS_RDONLY) 81 if (sb->s_flags & MS_RDONLY)
82 return -EROFS; 82 return -EROFS;
83 if (!XFS_IS_QUOTA_RUNNING(mp)) 83 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
84 return -ENOSYS; 84 return -ENOSYS;
85 if (!capable(CAP_SYS_ADMIN)) 85 if (!capable(CAP_SYS_ADMIN))
86 return -EPERM; 86 return -EPERM;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bdd41c8c342..18a4b8e11df 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -977,6 +977,28 @@ xfs_fs_inode_init_once(
977} 977}
978 978
979/* 979/*
980 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
981 * we catch unlogged VFS level updates to the inode. Care must be taken
982 * here - the transaction code calls mark_inode_dirty_sync() to mark the
983 * VFS inode dirty in a transaction and clears the i_update_core field;
984 * it must clear the field after calling mark_inode_dirty_sync() to
985 * correctly indicate that the dirty state has been propagated into the
986 * inode log item.
987 *
988 * We need the barrier() to maintain correct ordering between unlogged
989 * updates and the transaction commit code that clears the i_update_core
990 * field. This requires all updates to be completed before marking the
991 * inode dirty.
992 */
993STATIC void
994xfs_fs_dirty_inode(
995 struct inode *inode)
996{
997 barrier();
998 XFS_I(inode)->i_update_core = 1;
999}
1000
1001/*
980 * Attempt to flush the inode, this will actually fail 1002 * Attempt to flush the inode, this will actually fail
981 * if the inode is pinned, but we dirty the inode again 1003 * if the inode is pinned, but we dirty the inode again
982 * at the point when it is unpinned after a log write, 1004 * at the point when it is unpinned after a log write,
@@ -1126,7 +1148,7 @@ xfs_fs_put_super(
1126} 1148}
1127 1149
1128STATIC int 1150STATIC int
1129xfs_fs_sync_super( 1151xfs_fs_sync_fs(
1130 struct super_block *sb, 1152 struct super_block *sb,
1131 int wait) 1153 int wait)
1132{ 1154{
@@ -1134,23 +1156,23 @@ xfs_fs_sync_super(
1134 int error; 1156 int error;
1135 1157
1136 /* 1158 /*
1137 * Treat a sync operation like a freeze. This is to work 1159 * Not much we can do for the first async pass. Writing out the
1138 * around a race in sync_inodes() which works in two phases 1160 * superblock would be counter-productive as we are going to redirty
1139 * - an asynchronous flush, which can write out an inode 1161 * when writing out other data and metadata (and writing out a single
1140 * without waiting for file size updates to complete, and a 1162 * block is quite fast anyway).
1141 * synchronous flush, which wont do anything because the 1163 *
1142 * async flush removed the inode's dirty flag. Also 1164 * Try to asynchronously kick off quota syncing at least.
1143 * sync_inodes() will not see any files that just have
1144 * outstanding transactions to be flushed because we don't
1145 * dirty the Linux inode until after the transaction I/O
1146 * completes.
1147 */ 1165 */
1148 if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) 1166 if (!wait) {
1149 error = xfs_quiesce_data(mp); 1167 xfs_qm_sync(mp, SYNC_TRYLOCK);
1150 else 1168 return 0;
1151 error = xfs_sync_fsdata(mp, 0); 1169 }
1170
1171 error = xfs_quiesce_data(mp);
1172 if (error)
1173 return -error;
1152 1174
1153 if (unlikely(laptop_mode)) { 1175 if (laptop_mode) {
1154 int prev_sync_seq = mp->m_sync_seq; 1176 int prev_sync_seq = mp->m_sync_seq;
1155 1177
1156 /* 1178 /*
@@ -1169,7 +1191,7 @@ xfs_fs_sync_super(
1169 mp->m_sync_seq != prev_sync_seq); 1191 mp->m_sync_seq != prev_sync_seq);
1170 } 1192 }
1171 1193
1172 return -error; 1194 return 0;
1173} 1195}
1174 1196
1175STATIC int 1197STATIC int
@@ -1539,10 +1561,11 @@ xfs_fs_get_sb(
1539static const struct super_operations xfs_super_operations = { 1561static const struct super_operations xfs_super_operations = {
1540 .alloc_inode = xfs_fs_alloc_inode, 1562 .alloc_inode = xfs_fs_alloc_inode,
1541 .destroy_inode = xfs_fs_destroy_inode, 1563 .destroy_inode = xfs_fs_destroy_inode,
1564 .dirty_inode = xfs_fs_dirty_inode,
1542 .write_inode = xfs_fs_write_inode, 1565 .write_inode = xfs_fs_write_inode,
1543 .clear_inode = xfs_fs_clear_inode, 1566 .clear_inode = xfs_fs_clear_inode,
1544 .put_super = xfs_fs_put_super, 1567 .put_super = xfs_fs_put_super,
1545 .sync_fs = xfs_fs_sync_super, 1568 .sync_fs = xfs_fs_sync_fs,
1546 .freeze_fs = xfs_fs_freeze, 1569 .freeze_fs = xfs_fs_freeze,
1547 .statfs = xfs_fs_statfs, 1570 .statfs = xfs_fs_statfs,
1548 .remount_fs = xfs_fs_remount, 1571 .remount_fs = xfs_fs_remount,
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 320be6aea49..961df0a22c7 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -309,11 +309,15 @@ xfs_sync_attr(
309STATIC int 309STATIC int
310xfs_commit_dummy_trans( 310xfs_commit_dummy_trans(
311 struct xfs_mount *mp, 311 struct xfs_mount *mp,
312 uint log_flags) 312 uint flags)
313{ 313{
314 struct xfs_inode *ip = mp->m_rootip; 314 struct xfs_inode *ip = mp->m_rootip;
315 struct xfs_trans *tp; 315 struct xfs_trans *tp;
316 int error; 316 int error;
317 int log_flags = XFS_LOG_FORCE;
318
319 if (flags & SYNC_WAIT)
320 log_flags |= XFS_LOG_SYNC;
317 321
318 /* 322 /*
319 * Put a dummy transaction in the log to tell recovery 323 * Put a dummy transaction in the log to tell recovery
@@ -331,13 +335,12 @@ xfs_commit_dummy_trans(
331 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 335 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
332 xfs_trans_ihold(tp, ip); 336 xfs_trans_ihold(tp, ip);
333 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 337 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
334 /* XXX(hch): ignoring the error here.. */
335 error = xfs_trans_commit(tp, 0); 338 error = xfs_trans_commit(tp, 0);
336
337 xfs_iunlock(ip, XFS_ILOCK_EXCL); 339 xfs_iunlock(ip, XFS_ILOCK_EXCL);
338 340
341 /* the log force ensures this transaction is pushed to disk */
339 xfs_log_force(mp, 0, log_flags); 342 xfs_log_force(mp, 0, log_flags);
340 return 0; 343 return error;
341} 344}
342 345
343int 346int
@@ -385,7 +388,20 @@ xfs_sync_fsdata(
385 else 388 else
386 XFS_BUF_ASYNC(bp); 389 XFS_BUF_ASYNC(bp);
387 390
388 return xfs_bwrite(mp, bp); 391 error = xfs_bwrite(mp, bp);
392 if (error)
393 return error;
394
395 /*
396 * If this is a data integrity sync make sure all pending buffers
397 * are flushed out for the log coverage check below.
398 */
399 if (flags & SYNC_WAIT)
400 xfs_flush_buftarg(mp->m_ddev_targp, 1);
401
402 if (xfs_log_need_covered(mp))
403 error = xfs_commit_dummy_trans(mp, flags);
404 return error;
389 405
390 out_brelse: 406 out_brelse:
391 xfs_buf_relse(bp); 407 xfs_buf_relse(bp);
@@ -419,14 +435,16 @@ xfs_quiesce_data(
419 /* push non-blocking */ 435 /* push non-blocking */
420 xfs_sync_data(mp, 0); 436 xfs_sync_data(mp, 0);
421 xfs_qm_sync(mp, SYNC_TRYLOCK); 437 xfs_qm_sync(mp, SYNC_TRYLOCK);
422 xfs_filestream_flush(mp);
423 438
424 /* push and block */ 439 /* push and block till complete */
425 xfs_sync_data(mp, SYNC_WAIT); 440 xfs_sync_data(mp, SYNC_WAIT);
426 xfs_qm_sync(mp, SYNC_WAIT); 441 xfs_qm_sync(mp, SYNC_WAIT);
427 442
443 /* drop inode references pinned by filestreams */
444 xfs_filestream_flush(mp);
445
428 /* write superblock and hoover up shutdown errors */ 446 /* write superblock and hoover up shutdown errors */
429 error = xfs_sync_fsdata(mp, 0); 447 error = xfs_sync_fsdata(mp, SYNC_WAIT);
430 448
431 /* flush data-only devices */ 449 /* flush data-only devices */
432 if (mp->m_rtdev_targp) 450 if (mp->m_rtdev_targp)
@@ -570,8 +588,6 @@ xfs_sync_worker(
570 /* dgc: errors ignored here */ 588 /* dgc: errors ignored here */
571 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 589 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
572 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 590 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
573 if (xfs_log_need_covered(mp))
574 error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
575 } 591 }
576 mp->m_sync_seq++; 592 mp->m_sync_seq++;
577 wake_up(&mp->m_wait_single_sync_task); 593 wake_up(&mp->m_wait_single_sync_task);
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 4e4276b956e..5d1a3b98a6e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -876,7 +876,6 @@ xfs_dqrele_inode(
876 ip->i_gdquot = NULL; 876 ip->i_gdquot = NULL;
877 } 877 }
878 xfs_iput(ip, XFS_ILOCK_EXCL); 878 xfs_iput(ip, XFS_ILOCK_EXCL);
879 IRELE(ip);
880 879
881 return 0; 880 return 0;
882} 881}
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 7465f9ee125..ab89a7e94a0 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -206,10 +206,10 @@ xfs_swap_extents(
206 * process that the file was not changed out from 206 * process that the file was not changed out from
207 * under it. 207 * under it.
208 */ 208 */
209 if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) || 209 if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
210 (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) || 210 (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
211 (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || 211 (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
212 (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { 212 (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
213 error = XFS_ERROR(EBUSY); 213 error = XFS_ERROR(EBUSY);
214 goto out_unlock; 214 goto out_unlock;
215 } 215 }
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index fa913e45944..41ad537c49e 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -854,6 +854,7 @@ xfs_dir2_leaf_getdents(
854 */ 854 */
855 ra_want = howmany(bufsize + mp->m_dirblksize, 855 ra_want = howmany(bufsize + mp->m_dirblksize,
856 mp->m_sb.sb_blocksize) - 1; 856 mp->m_sb.sb_blocksize) - 1;
857 ASSERT(ra_want >= 0);
857 858
858 /* 859 /*
859 * If we don't have as many as we want, and we haven't 860 * If we don't have as many as we want, and we haven't
@@ -1088,7 +1089,8 @@ xfs_dir2_leaf_getdents(
1088 */ 1089 */
1089 ptr += length; 1090 ptr += length;
1090 curoff += length; 1091 curoff += length;
1091 bufsize -= length; 1092 /* bufsize may have just been a guess; don't go negative */
1093 bufsize = bufsize > length ? bufsize - length : 0;
1092 } 1094 }
1093 1095
1094 /* 1096 /*
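
Editor's note: the getdents fix above guards the running bufsize count. The caller's bufsize was only a guess, so subtracting each entry's length must saturate at zero rather than wrapping the unsigned counter below zero. A trivial sketch of the guard.

#include <stddef.h>
#include <stdio.h>

static size_t consume(size_t bufsize, size_t length)
{
        return bufsize > length ? bufsize - length : 0;
}

int main(void)
{
        printf("%zu\n", consume(100, 30));   /* 70 */
        printf("%zu\n", consume(10, 30));    /* 0, not a huge wrapped value */
        return 0;
}
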
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index ab64f3efb43..0785797db82 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -880,6 +880,7 @@ nextag:
880 * Not in range - save last search 880 * Not in range - save last search
881 * location and allocate a new inode 881 * location and allocate a new inode
882 */ 882 */
883 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
883 pag->pagl_leftrec = trec.ir_startino; 884 pag->pagl_leftrec = trec.ir_startino;
884 pag->pagl_rightrec = rec.ir_startino; 885 pag->pagl_rightrec = rec.ir_startino;
885 pag->pagl_pagino = pagino; 886 pag->pagl_pagino = pagino;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c1dc7ef5a1d..b92a4fa2a0a 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3068,9 +3068,9 @@ xfs_iflush_int(
3068 SYNCHRONIZE(); 3068 SYNCHRONIZE();
3069 3069
3070 /* 3070 /*
3071 * Make sure to get the latest atime from the Linux inode. 3071 * Make sure to get the latest timestamps from the Linux inode.
3072 */ 3072 */
3073 xfs_synchronize_atime(ip); 3073 xfs_synchronize_times(ip);
3074 3074
3075 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, 3075 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
3076 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 3076 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0b38b9a869e..41555de1d1d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -504,7 +504,7 @@ void xfs_ichgtime(xfs_inode_t *, int);
504void xfs_lock_inodes(xfs_inode_t **, int, uint); 504void xfs_lock_inodes(xfs_inode_t **, int, uint);
505void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 505void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
506 506
507void xfs_synchronize_atime(xfs_inode_t *); 507void xfs_synchronize_times(xfs_inode_t *);
508void xfs_mark_inode_dirty_sync(xfs_inode_t *); 508void xfs_mark_inode_dirty_sync(xfs_inode_t *);
509 509
510#if defined(XFS_INODE_TRACE) 510#if defined(XFS_INODE_TRACE)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 47d5b663c37..9794b876d6f 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -232,6 +232,15 @@ xfs_inode_item_format(
232 nvecs = 1; 232 nvecs = 1;
233 233
234 /* 234 /*
235 * Make sure the linux inode is dirty. We do this before
236 * clearing i_update_core as the VFS will call back into
237 * XFS here and set i_update_core, so we need to dirty the
238 * inode first so that the ordering of i_update_core and
239 * unlogged modifications still works as described below.
240 */
241 xfs_mark_inode_dirty_sync(ip);
242
243 /*
235 * Clear i_update_core if the timestamps (or any other 244 * Clear i_update_core if the timestamps (or any other
236 * non-transactional modification) need flushing/logging 245 * non-transactional modification) need flushing/logging
237 * and we're about to log them with the rest of the core. 246 * and we're about to log them with the rest of the core.
@@ -263,14 +272,9 @@ xfs_inode_item_format(
263 } 272 }
264 273
265 /* 274 /*
266 * Make sure to get the latest atime from the Linux inode. 275 * Make sure to get the latest timestamps from the Linux inode.
267 */ 276 */
268 xfs_synchronize_atime(ip); 277 xfs_synchronize_times(ip);
269
270 /*
271 * make sure the linux inode is dirty
272 */
273 xfs_mark_inode_dirty_sync(ip);
274 278
275 vecp->i_addr = (xfs_caddr_t)&ip->i_d; 279 vecp->i_addr = (xfs_caddr_t)&ip->i_d;
276 vecp->i_len = sizeof(struct xfs_icdinode); 280 vecp->i_len = sizeof(struct xfs_icdinode);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index b68f9107e26..62efab2f383 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -59,6 +59,7 @@ xfs_bulkstat_one_iget(
59{ 59{
60 xfs_icdinode_t *dic; /* dinode core info pointer */ 60 xfs_icdinode_t *dic; /* dinode core info pointer */
61 xfs_inode_t *ip; /* incore inode pointer */ 61 xfs_inode_t *ip; /* incore inode pointer */
62 struct inode *inode;
62 int error; 63 int error;
63 64
64 error = xfs_iget(mp, NULL, ino, 65 error = xfs_iget(mp, NULL, ino,
@@ -72,6 +73,7 @@ xfs_bulkstat_one_iget(
72 ASSERT(ip->i_imap.im_blkno != 0); 73 ASSERT(ip->i_imap.im_blkno != 0);
73 74
74 dic = &ip->i_d; 75 dic = &ip->i_d;
76 inode = VFS_I(ip);
75 77
76 /* xfs_iget returns the following without needing 78 /* xfs_iget returns the following without needing
77 * further change. 79 * further change.
@@ -83,16 +85,19 @@ xfs_bulkstat_one_iget(
83 buf->bs_uid = dic->di_uid; 85 buf->bs_uid = dic->di_uid;
84 buf->bs_gid = dic->di_gid; 86 buf->bs_gid = dic->di_gid;
85 buf->bs_size = dic->di_size; 87 buf->bs_size = dic->di_size;
88
86 /* 89 /*
87 * We are reading the atime from the Linux inode because the 90 * We need to read the timestamps from the Linux inode because
88 * dinode might not be uptodate. 91 * the VFS keeps writing directly into the inode structure instead
92 * of telling us about the updates.
89 */ 93 */
90 buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec; 94 buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
91 buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec; 95 buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
92 buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; 96 buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
93 buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; 97 buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
94 buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; 98 buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
95 buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec; 99 buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
100
96 buf->bs_xflags = xfs_ip2xflags(ip); 101 buf->bs_xflags = xfs_ip2xflags(ip);
97 buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; 102 buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
98 buf->bs_extents = dic->di_nextents; 103 buf->bs_extents = dic->di_nextents;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1099395d7d6..fb17f8226b0 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1980,7 +1980,7 @@ xlog_recover_do_reg_buffer(
1980 "XFS: NULL dquot in %s.", __func__); 1980 "XFS: NULL dquot in %s.", __func__);
1981 goto next; 1981 goto next;
1982 } 1982 }
1983 if (item->ri_buf[i].i_len < sizeof(xfs_dqblk_t)) { 1983 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1984 cmn_err(CE_ALERT, 1984 cmn_err(CE_ALERT,
1985 "XFS: dquot too small (%d) in %s.", 1985 "XFS: dquot too small (%d) in %s.",
1986 item->ri_buf[i].i_len, __func__); 1986 item->ri_buf[i].i_len, __func__);
@@ -2635,7 +2635,7 @@ xlog_recover_do_dquot_trans(
2635 "XFS: NULL dquot in %s.", __func__); 2635 "XFS: NULL dquot in %s.", __func__);
2636 return XFS_ERROR(EIO); 2636 return XFS_ERROR(EIO);
2637 } 2637 }
2638 if (item->ri_buf[1].i_len < sizeof(xfs_dqblk_t)) { 2638 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2639 cmn_err(CE_ALERT, 2639 cmn_err(CE_ALERT,
2640 "XFS: dquot too small (%d) in %s.", 2640 "XFS: dquot too small (%d) in %s.",
2641 item->ri_buf[1].i_len, __func__); 2641 item->ri_buf[1].i_len, __func__);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index f31271c30de..2ffc570679b 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -467,6 +467,7 @@ xfs_trans_ail_update(
467{ 467{
468 xfs_log_item_t *dlip = NULL; 468 xfs_log_item_t *dlip = NULL;
469 xfs_log_item_t *mlip; /* ptr to minimum lip */ 469 xfs_log_item_t *mlip; /* ptr to minimum lip */
470 xfs_lsn_t tail_lsn;
470 471
471 mlip = xfs_ail_min(ailp); 472 mlip = xfs_ail_min(ailp);
472 473
@@ -483,8 +484,16 @@ xfs_trans_ail_update(
483 484
484 if (mlip == dlip) { 485 if (mlip == dlip) {
485 mlip = xfs_ail_min(ailp); 486 mlip = xfs_ail_min(ailp);
487 /*
488 * It is not safe to access mlip after the AIL lock is
489 * dropped, so we must get a copy of li_lsn before we do
490 * so. This is especially important on 32-bit platforms
491 * where accessing and updating 64-bit values like li_lsn
492 * is not atomic.
493 */
494 tail_lsn = mlip->li_lsn;
486 spin_unlock(&ailp->xa_lock); 495 spin_unlock(&ailp->xa_lock);
487 xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn); 496 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
488 } else { 497 } else {
489 spin_unlock(&ailp->xa_lock); 498 spin_unlock(&ailp->xa_lock);
490 } 499 }
@@ -514,6 +523,7 @@ xfs_trans_ail_delete(
514{ 523{
515 xfs_log_item_t *dlip; 524 xfs_log_item_t *dlip;
516 xfs_log_item_t *mlip; 525 xfs_log_item_t *mlip;
526 xfs_lsn_t tail_lsn;
517 527
518 if (lip->li_flags & XFS_LI_IN_AIL) { 528 if (lip->li_flags & XFS_LI_IN_AIL) {
519 mlip = xfs_ail_min(ailp); 529 mlip = xfs_ail_min(ailp);
@@ -527,9 +537,16 @@ xfs_trans_ail_delete(
527 537
528 if (mlip == dlip) { 538 if (mlip == dlip) {
529 mlip = xfs_ail_min(ailp); 539 mlip = xfs_ail_min(ailp);
540 /*
541 * It is not safe to access mlip after the AIL lock
542 * is dropped, so we must get a copy of li_lsn
543 * before we do so. This is especially important
544 * on 32-bit platforms where accessing and updating
545 * 64-bit values like li_lsn is not atomic.
546 */
547 tail_lsn = mlip ? mlip->li_lsn : 0;
530 spin_unlock(&ailp->xa_lock); 548 spin_unlock(&ailp->xa_lock);
531 xfs_log_move_tail(ailp->xa_mount, 549 xfs_log_move_tail(ailp->xa_mount, tail_lsn);
532 (mlip ? mlip->li_lsn : 0));
533 } else { 550 } else {
534 spin_unlock(&ailp->xa_lock); 551 spin_unlock(&ailp->xa_lock);
535 } 552 }
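
Editor's note: both AIL paths above now snapshot mlip->li_lsn into a local tail_lsn while xa_lock is still held, because on 32-bit platforms reading a 64-bit LSN after the lock is dropped can race with an update and tear. A userspace sketch of the copy-before-unlock pattern (build with -pthread); the names are stand-ins.

#include <stdint.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ail_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t min_lsn = 0x0000000500000001ULL;     /* protected by ail_lock */

static void move_tail(uint64_t lsn)
{
        printf("moving tail to %llx\n", (unsigned long long)lsn);
}

static void ail_update(void)
{
        uint64_t tail_lsn;

        pthread_mutex_lock(&ail_lock);
        tail_lsn = min_lsn;             /* snapshot while still protected */
        pthread_mutex_unlock(&ail_lock);

        move_tail(tail_lsn);            /* safe: uses the stable copy */
}

int main(void)
{
        ail_update();
        return 0;
}
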
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index a434f287962..b572f7e840e 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2476,12 +2476,6 @@ xfs_reclaim(
2476 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); 2476 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
2477 2477
2478 /* 2478 /*
2479 * Make sure the atime in the XFS inode is correct before freeing the
2480 * Linux inode.
2481 */
2482 xfs_synchronize_atime(ip);
2483
2484 /*
2485 * If we have nothing to flush with this inode then complete the 2479 * If we have nothing to flush with this inode then complete the
2486 * teardown now, otherwise break the link between the xfs inode and the 2480 * teardown now, otherwise break the link between the xfs inode and the
2487 * linux inode and clean up the xfs inode later. This avoids flushing 2481 * linux inode and clean up the xfs inode later. This avoids flushing