Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_dir.c | 93
-rw-r--r--  fs/9p/vfs_inode.c | 5
-rw-r--r--  fs/Kconfig | 4
-rw-r--r--  fs/afs/cache.h | 12
-rw-r--r--  fs/afs/internal.h | 2
-rw-r--r--  fs/anon_inodes.c | 2
-rw-r--r--  fs/bio.c | 77
-rw-r--r--  fs/block_dev.c | 2
-rw-r--r--  fs/btrfs/acl.c | 6
-rw-r--r--  fs/btrfs/async-thread.c | 81
-rw-r--r--  fs/btrfs/async-thread.h | 10
-rw-r--r--  fs/btrfs/btrfs_inode.h | 16
-rw-r--r--  fs/btrfs/ctree.h | 38
-rw-r--r--  fs/btrfs/disk-io.c | 60
-rw-r--r--  fs/btrfs/extent-tree.c | 724
-rw-r--r--  fs/btrfs/extent_io.c | 134
-rw-r--r--  fs/btrfs/extent_io.h | 31
-rw-r--r--  fs/btrfs/extent_map.c | 2
-rw-r--r--  fs/btrfs/file.c | 81
-rw-r--r--  fs/btrfs/free-space-cache.c | 2
-rw-r--r--  fs/btrfs/inode.c | 442
-rw-r--r--  fs/btrfs/ioctl.c | 69
-rw-r--r--  fs/btrfs/ordered-data.c | 99
-rw-r--r--  fs/btrfs/ordered-data.h | 4
-rw-r--r--  fs/btrfs/relocation.c | 4
-rw-r--r--  fs/btrfs/root-tree.c | 2
-rw-r--r--  fs/btrfs/super.c | 9
-rw-r--r--  fs/btrfs/transaction.c | 72
-rw-r--r--  fs/btrfs/transaction.h | 5
-rw-r--r--  fs/btrfs/tree-log.c | 56
-rw-r--r--  fs/btrfs/tree-log.h | 3
-rw-r--r--  fs/btrfs/volumes.c | 4
-rw-r--r--  fs/btrfs/xattr.c | 2
-rw-r--r--  fs/buffer.c | 10
-rw-r--r--  fs/cifs/Kconfig | 1
-rw-r--r--  fs/cifs/cifsfs.c | 93
-rw-r--r--  fs/cifs/cifsglob.h | 21
-rw-r--r--  fs/cifs/cifsproto.h | 12
-rw-r--r--  fs/cifs/cifssmb.c | 1
-rw-r--r--  fs/cifs/connect.c | 12
-rw-r--r--  fs/cifs/dir.c | 64
-rw-r--r--  fs/cifs/file.c | 137
-rw-r--r--  fs/cifs/inode.c | 7
-rw-r--r--  fs/cifs/misc.c | 48
-rw-r--r--  fs/cifs/readdir.c | 11
-rw-r--r--  fs/cifs/transport.c | 50
-rw-r--r--  fs/coda/psdev.c | 1
-rw-r--r--  fs/compat.c | 2
-rw-r--r--  fs/compat_ioctl.c | 4
-rw-r--r--  fs/dlm/lowcomms.c | 36
-rw-r--r--  fs/ecryptfs/Kconfig | 5
-rw-r--r--  fs/ecryptfs/crypto.c | 39
-rw-r--r--  fs/ecryptfs/inode.c | 2
-rw-r--r--  fs/ecryptfs/keystore.c | 39
-rw-r--r--  fs/ecryptfs/kthread.c | 24
-rw-r--r--  fs/ecryptfs/main.c | 10
-rw-r--r--  fs/ecryptfs/mmap.c | 4
-rw-r--r--  fs/ecryptfs/read_write.c | 32
-rw-r--r--  fs/ecryptfs/super.c | 2
-rw-r--r--  fs/exec.c | 8
-rw-r--r--  fs/ext3/fsync.c | 36
-rw-r--r--  fs/ext3/inode.c | 36
-rw-r--r--  fs/ext3/super.c | 15
-rw-r--r--  fs/ext4/Kconfig | 14
-rw-r--r--  fs/ext4/ext4.h | 56
-rw-r--r--  fs/ext4/ext4_extents.h | 7
-rw-r--r--  fs/ext4/ext4_jbd2.h | 6
-rw-r--r--  fs/ext4/extents.c | 458
-rw-r--r--  fs/ext4/file.c | 2
-rw-r--r--  fs/ext4/fsync.c | 5
-rw-r--r--  fs/ext4/inode.c | 586
-rw-r--r--  fs/ext4/mballoc.c | 305
-rw-r--r--  fs/ext4/mballoc.h | 35
-rw-r--r--  fs/ext4/migrate.c | 2
-rw-r--r--  fs/ext4/move_extent.c | 20
-rw-r--r--  fs/ext4/namei.c | 19
-rw-r--r--  fs/ext4/super.c | 150
-rw-r--r--  fs/fat/fat.h | 2
-rw-r--r--  fs/fat/inode.c | 18
-rw-r--r--  fs/fat/misc.c | 8
-rw-r--r--  fs/fat/namei_vfat.c | 15
-rw-r--r--  fs/fcntl.c | 4
-rw-r--r--  fs/file.c | 1
-rw-r--r--  fs/fs-writeback.c | 165
-rw-r--r--  fs/fuse/dir.c | 4
-rw-r--r--  fs/fuse/file.c | 7
-rw-r--r--  fs/gfs2/file.c | 2
-rw-r--r--  fs/hfs/btree.c | 5
-rw-r--r--  fs/hfsplus/wrapper.c | 4
-rw-r--r--  fs/ioctl.c | 2
-rw-r--r--  fs/jbd/journal.c | 3
-rw-r--r--  fs/jbd2/checkpoint.c | 7
-rw-r--r--  fs/jbd2/commit.c | 59
-rw-r--r--  fs/jbd2/journal.c | 200
-rw-r--r--  fs/ncpfs/mmap.c | 2
-rw-r--r--  fs/nfs/client.c | 2
-rw-r--r--  fs/nfs/dir.c | 2
-rw-r--r--  fs/nfs/direct.c | 1
-rw-r--r--  fs/nfs/file.c | 4
-rw-r--r--  fs/nfs/nfs4namespace.c | 12
-rw-r--r--  fs/nfs/nfs4proc.c | 15
-rw-r--r--  fs/nfs/nfs4renewd.c | 6
-rw-r--r--  fs/nfs/nfs4xdr.c | 1
-rw-r--r--  fs/nfs/super.c | 35
-rw-r--r--  fs/nfsd/nfs3xdr.c | 2
-rw-r--r--  fs/nfsd/nfsctl.c | 2
-rw-r--r--  fs/nilfs2/btnode.c | 5
-rw-r--r--  fs/nilfs2/cpfile.c | 2
-rw-r--r--  fs/nilfs2/dir.c | 2
-rw-r--r--  fs/nilfs2/file.c | 4
-rw-r--r--  fs/nilfs2/inode.c | 2
-rw-r--r--  fs/nilfs2/ioctl.c | 39
-rw-r--r--  fs/nilfs2/mdt.c | 2
-rw-r--r--  fs/nilfs2/nilfs.h | 4
-rw-r--r--  fs/nilfs2/segment.c | 17
-rw-r--r--  fs/nls/nls_base.c | 8
-rw-r--r--  fs/notify/dnotify/dnotify.c | 3
-rw-r--r--  fs/notify/inode_mark.c | 6
-rw-r--r--  fs/notify/notification.c | 2
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r--  fs/ocfs2/cluster/netdebug.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 8
-rw-r--r--  fs/ocfs2/mmap.c | 2
-rw-r--r--  fs/ocfs2/super.c | 2
-rw-r--r--  fs/omfs/dir.c | 2
-rw-r--r--  fs/omfs/file.c | 2
-rw-r--r--  fs/omfs/omfs.h | 4
-rw-r--r--  fs/pipe.c | 41
-rw-r--r--  fs/proc/array.c | 9
-rw-r--r--  fs/proc/base.c | 3
-rw-r--r--  fs/proc/kcore.c | 1
-rw-r--r--  fs/proc/meminfo.c | 2
-rw-r--r--  fs/proc/page.c | 5
-rw-r--r--  fs/romfs/storage.c | 4
-rw-r--r--  fs/select.c | 1
-rw-r--r--  fs/sysfs/bin.c | 4
-rw-r--r--  fs/sysfs/dir.c | 7
-rw-r--r--  fs/sysfs/file.c | 14
-rw-r--r--  fs/ubifs/file.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 38
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 13
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c | 41
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_quotaops.c | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 59
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 36
-rw-r--r--  fs/xfs/quota/xfs_qm_syscalls.c | 1
-rw-r--r--  fs/xfs/xfs_dfrag.c | 8
-rw-r--r--  fs/xfs/xfs_dir2_leaf.c | 4
-rw-r--r--  fs/xfs/xfs_ialloc.c | 1
-rw-r--r--  fs/xfs/xfs_inode.c | 4
-rw-r--r--  fs/xfs/xfs_inode.h | 2
-rw-r--r--  fs/xfs/xfs_inode_item.c | 18
-rw-r--r--  fs/xfs/xfs_itable.c | 21
-rw-r--r--  fs/xfs/xfs_log_recover.c | 4
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 23
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 6
157 files changed, 3630 insertions(+), 2038 deletions(-)
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 873cd31baa47..15cce53bf61e 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -40,6 +40,24 @@
 #include "fid.h"
 
 /**
+ * struct p9_rdir - readdir accounting
+ * @mutex: mutex protecting readdir
+ * @head: start offset of current dirread buffer
+ * @tail: end offset of current dirread buffer
+ * @buf: dirread buffer
+ *
+ * private structure for keeping track of readdir
+ * allocated on demand
+ */
+
+struct p9_rdir {
+	struct mutex mutex;
+	int head;
+	int tail;
+	uint8_t *buf;
+};
+
+/**
  * dt_type - return file type
  * @mistat: mistat structure
  *
@@ -70,56 +88,79 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	int over;
 	struct p9_wstat st;
-	int err;
+	int err = 0;
 	struct p9_fid *fid;
 	int buflen;
-	char *statbuf;
-	int n, i = 0;
+	int reclen = 0;
+	struct p9_rdir *rdir;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
 	fid = filp->private_data;
 
 	buflen = fid->clnt->msize - P9_IOHDRSZ;
-	statbuf = kmalloc(buflen, GFP_KERNEL);
-	if (!statbuf)
-		return -ENOMEM;
-
-	while (1) {
-		err = v9fs_file_readn(filp, statbuf, NULL, buflen,
-					fid->rdir_fpos);
-		if (err <= 0)
-			break;
-
-		n = err;
-		while (i < n) {
-			err = p9stat_read(statbuf + i, buflen-i, &st,
-				fid->clnt->dotu);
+
+	/* allocate rdir on demand */
+	if (!fid->rdir) {
+		rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
+
+		if (rdir == NULL) {
+			err = -ENOMEM;
+			goto exit;
+		}
+		spin_lock(&filp->f_dentry->d_lock);
+		if (!fid->rdir) {
+			rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir);
+			mutex_init(&rdir->mutex);
+			rdir->head = rdir->tail = 0;
+			fid->rdir = (void *) rdir;
+			rdir = NULL;
+		}
+		spin_unlock(&filp->f_dentry->d_lock);
+		kfree(rdir);
+	}
+	rdir = (struct p9_rdir *) fid->rdir;
+
+	err = mutex_lock_interruptible(&rdir->mutex);
+	while (err == 0) {
+		if (rdir->tail == rdir->head) {
+			err = v9fs_file_readn(filp, rdir->buf, NULL,
+					      buflen, filp->f_pos);
+			if (err <= 0)
+				goto unlock_and_exit;
+
+			rdir->head = 0;
+			rdir->tail = err;
+		}
+
+		while (rdir->head < rdir->tail) {
+			err = p9stat_read(rdir->buf + rdir->head,
+					  buflen - rdir->head, &st,
+					  fid->clnt->dotu);
 			if (err) {
 				P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
 				err = -EIO;
 				p9stat_free(&st);
-				goto free_and_exit;
+				goto unlock_and_exit;
 			}
-
-			i += st.size+2;
-			fid->rdir_fpos += st.size+2;
+			reclen = st.size+2;
 
 			over = filldir(dirent, st.name, strlen(st.name),
 				filp->f_pos, v9fs_qid2ino(&st.qid), dt_type(&st));
 
-			filp->f_pos += st.size+2;
-
 			p9stat_free(&st);
 
 			if (over) {
 				err = 0;
-				goto free_and_exit;
+				goto unlock_and_exit;
 			}
+			rdir->head += reclen;
+			filp->f_pos += reclen;
 		}
 	}
 
-free_and_exit:
-	kfree(statbuf);
+unlock_and_exit:
+	mutex_unlock(&rdir->mutex);
+exit:
 	return err;
 }
 
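The readdir change above installs a lazily allocated buffer with a recheck under d_lock, so two racing readers cannot both install one. A minimal standalone sketch of that allocate-then-install pattern, using hypothetical obj/cache types rather than the 9p structures:

#include <linux/slab.h>
#include <linux/spinlock.h>

/* Hypothetical types; the real code hangs a struct p9_rdir off fid->rdir
 * and serializes installation on the dentry's d_lock. */
struct cache {
	char buf[64];
};

struct obj {
	spinlock_t lock;
	struct cache *cache;
};

static struct cache *obj_get_cache(struct obj *o)
{
	struct cache *c;

	if (!o->cache) {
		c = kmalloc(sizeof(*c), GFP_KERNEL);
		if (!c)
			return NULL;
		spin_lock(&o->lock);
		if (!o->cache) {	/* recheck under the lock */
			o->cache = c;
			c = NULL;	/* installed; nothing left to free */
		}
		spin_unlock(&o->lock);
		kfree(c);		/* race loser frees its copy; kfree(NULL) is a no-op */
	}
	return o->cache;
}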
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 5947628aefef..18f74ec4dce9 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -994,8 +994,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	P9_DPRINTK(P9_DEBUG_VFS,
 		   "%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer);
 
-	retval = buflen;
-
+	retval = strnlen(buffer, buflen);
 done:
 	kfree(st);
 	return retval;
@@ -1062,7 +1061,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 			__putname(link);
 			link = ERR_PTR(len);
 		} else
-			link[len] = 0;
+			link[min(len, PATH_MAX-1)] = 0;
 	}
 	nd_set_link(nd, link);
 
diff --git a/fs/Kconfig b/fs/Kconfig
index d4bf8caad8d0..64d44efad7a5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -135,8 +135,8 @@ config TMPFS_POSIX_ACL
 
 config HUGETLBFS
 	bool "HugeTLB file system support"
-	depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \
-		   (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN
+	depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
+		   SYS_SUPPORTS_HUGETLBFS || BROKEN
 	help
 	  hugetlbfs is a filesystem backing for HugeTLB pages, based on
 	  ramfs. For architectures that support it, say Y here and read
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
deleted file mode 100644
index 5c4f6b499e90..000000000000
--- a/fs/afs/cache.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* AFS local cache management interface
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/fscache.h>
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 106be66dafd2..6ece2a13bf71 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -18,10 +18,10 @@
 #include <linux/key.h>
 #include <linux/workqueue.h>
 #include <linux/sched.h>
+#include <linux/fscache.h>
 
 #include "afs.h"
 #include "afs_vl.h"
-#include "cache.h"
 
 #define AFS_CELL_MAX_ADDRS 15
 
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index d11c51fc2a3f..2ca7a7cafdbf 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -8,8 +8,10 @@
  *
  */
 
+#include <linux/cred.h>
 #include <linux/file.h>
 #include <linux/poll.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
diff --git a/fs/bio.c b/fs/bio.c
index 76738005c8e8..12da5db8682c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -249,6 +249,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
 
 	mempool_free(p, bs->bio_pool);
 }
+EXPORT_SYMBOL(bio_free);
 
 void bio_init(struct bio *bio)
 {
@@ -257,6 +258,7 @@ void bio_init(struct bio *bio)
 	bio->bi_comp_cpu = -1;
 	atomic_set(&bio->bi_cnt, 1);
 }
+EXPORT_SYMBOL(bio_init);
 
 /**
  * bio_alloc_bioset - allocate a bio for I/O
@@ -311,6 +313,7 @@ err_free:
 	mempool_free(p, bs->bio_pool);
 	return NULL;
 }
+EXPORT_SYMBOL(bio_alloc_bioset);
 
 static void bio_fs_destructor(struct bio *bio)
 {
@@ -322,8 +325,16 @@ static void bio_fs_destructor(struct bio *bio)
  * @gfp_mask: allocation mask to use
  * @nr_iovecs: number of iovecs
  *
- * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask
- * contains __GFP_WAIT, the allocation is guaranteed to succeed.
+ * bio_alloc will allocate a bio and associated bio_vec array that can hold
+ * at least @nr_iovecs entries. Allocations will be done from the
+ * fs_bio_set. Also see @bio_alloc_bioset and @bio_kmalloc.
+ *
+ * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
+ * a bio. This is due to the mempool guarantees. To make this work, callers
+ * must never allocate more than 1 bio at a time from this pool. Callers
+ * that need to allocate more than 1 bio must always submit the previously
+ * allocated bio for IO before attempting to allocate a new one. Failure to
+ * do so can cause livelocks under memory pressure.
  *
  * RETURNS:
  * Pointer to new bio on success, NULL on failure.
@@ -337,6 +348,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 
 	return bio;
 }
+EXPORT_SYMBOL(bio_alloc);
 
 static void bio_kmalloc_destructor(struct bio *bio)
 {
@@ -346,21 +358,13 @@ static void bio_kmalloc_destructor(struct bio *bio)
 }
 
 /**
- * bio_alloc - allocate a bio for I/O
+ * bio_kmalloc - allocate a bio for I/O using kmalloc()
 * @gfp_mask: the GFP_ mask given to the slab allocator
 * @nr_iovecs: number of iovecs to pre-allocate
 *
 * Description:
- * bio_alloc will allocate a bio and associated bio_vec array that can hold
- * at least @nr_iovecs entries. Allocations will be done from the
- * fs_bio_set. Also see @bio_alloc_bioset.
- *
- * If %__GFP_WAIT is set, then bio_alloc will always be able to allocate
- * a bio. This is due to the mempool guarantees. To make this work, callers
- * must never allocate more than 1 bio at a time from this pool. Callers
- * that need to allocate more than 1 bio must always submit the previously
- * allocated bio for IO before attempting to allocate a new one. Failure to
- * do so can cause livelocks under memory pressure.
+ * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask contains
+ * %__GFP_WAIT, the allocation is guaranteed to succeed.
 *
 **/
 struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
@@ -380,6 +384,7 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
 
 	return bio;
 }
+EXPORT_SYMBOL(bio_kmalloc);
 
 void zero_fill_bio(struct bio *bio)
 {
@@ -402,7 +407,7 @@ EXPORT_SYMBOL(zero_fill_bio);
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
- *   bio_alloc or bio_get. The last put of a bio will free it.
+ *   bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
 **/
 void bio_put(struct bio *bio)
 {
@@ -416,6 +421,7 @@ void bio_put(struct bio *bio)
 		bio->bi_destructor(bio);
 	}
 }
+EXPORT_SYMBOL(bio_put);
 
 inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
 {
@@ -424,6 +430,7 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
 
 	return bio->bi_phys_segments;
 }
+EXPORT_SYMBOL(bio_phys_segments);
 
 /**
 * 	__bio_clone	-	clone a bio
@@ -451,6 +458,7 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
 	bio->bi_size = bio_src->bi_size;
 	bio->bi_idx = bio_src->bi_idx;
 }
+EXPORT_SYMBOL(__bio_clone);
 
 /**
 *	bio_clone	-	clone a bio
@@ -482,6 +490,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 
 	return b;
 }
+EXPORT_SYMBOL(bio_clone);
 
 /**
 *	bio_get_nr_vecs		- return approx number of vecs
@@ -505,6 +514,7 @@ int bio_get_nr_vecs(struct block_device *bdev)
 
 	return nr_pages;
 }
+EXPORT_SYMBOL(bio_get_nr_vecs);
 
 static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 			  *page, unsigned int len, unsigned int offset,
@@ -635,6 +645,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
 	return __bio_add_page(q, bio, page, len, offset,
 			      queue_max_hw_sectors(q));
 }
+EXPORT_SYMBOL(bio_add_pc_page);
 
 /**
 *	bio_add_page	-	attempt to add page to bio
@@ -655,6 +666,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
 }
+EXPORT_SYMBOL(bio_add_page);
 
 struct bio_map_data {
 	struct bio_vec *iovecs;
@@ -776,6 +788,7 @@ int bio_uncopy_user(struct bio *bio)
 	bio_put(bio);
 	return ret;
 }
+EXPORT_SYMBOL(bio_uncopy_user);
 
 /**
 *	bio_copy_user_iov	-	copy user data to bio
@@ -920,6 +933,7 @@ struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
 
 	return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
 }
+EXPORT_SYMBOL(bio_copy_user);
 
 static struct bio *__bio_map_user_iov(struct request_queue *q,
 				      struct block_device *bdev,
@@ -1050,6 +1064,7 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
 
 	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
 }
+EXPORT_SYMBOL(bio_map_user);
 
 /**
 *	bio_map_user_iov - map user sg_iovec table into bio
@@ -1117,13 +1132,13 @@ void bio_unmap_user(struct bio *bio)
 	__bio_unmap_user(bio);
 	bio_put(bio);
 }
+EXPORT_SYMBOL(bio_unmap_user);
 
 static void bio_map_kern_endio(struct bio *bio, int err)
 {
 	bio_put(bio);
 }
 
-
 static struct bio *__bio_map_kern(struct request_queue *q, void *data,
 				  unsigned int len, gfp_t gfp_mask)
 {
@@ -1189,6 +1204,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
 	bio_put(bio);
 	return ERR_PTR(-EINVAL);
 }
+EXPORT_SYMBOL(bio_map_kern);
 
 static void bio_copy_kern_endio(struct bio *bio, int err)
 {
@@ -1250,6 +1266,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 
 	return bio;
 }
+EXPORT_SYMBOL(bio_copy_kern);
 
 /*
 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
@@ -1400,6 +1417,7 @@ void bio_endio(struct bio *bio, int error)
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio, error);
 }
+EXPORT_SYMBOL(bio_endio);
 
 void bio_pair_release(struct bio_pair *bp)
 {
@@ -1410,6 +1428,7 @@ void bio_pair_release(struct bio_pair *bp)
 		mempool_free(bp, bp->bio2.bi_private);
 	}
 }
+EXPORT_SYMBOL(bio_pair_release);
 
 static void bio_pair_end_1(struct bio *bi, int err)
 {
@@ -1477,6 +1496,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 
 	return bp;
 }
+EXPORT_SYMBOL(bio_split);
 
 /**
 * bio_sector_offset - Find hardware sector offset in bio
@@ -1547,6 +1567,7 @@ void bioset_free(struct bio_set *bs)
 
 	kfree(bs);
 }
+EXPORT_SYMBOL(bioset_free);
 
 /**
 * bioset_create  - Create a bio_set
@@ -1592,6 +1613,7 @@ bad:
 	bioset_free(bs);
 	return NULL;
 }
+EXPORT_SYMBOL(bioset_create);
 
 static void __init biovec_init_slabs(void)
 {
@@ -1636,29 +1658,4 @@ static int __init init_bio(void)
 
 	return 0;
 }
-
 subsys_initcall(init_bio);
-
-EXPORT_SYMBOL(bio_alloc);
-EXPORT_SYMBOL(bio_kmalloc);
-EXPORT_SYMBOL(bio_put);
-EXPORT_SYMBOL(bio_free);
-EXPORT_SYMBOL(bio_endio);
-EXPORT_SYMBOL(bio_init);
-EXPORT_SYMBOL(__bio_clone);
-EXPORT_SYMBOL(bio_clone);
-EXPORT_SYMBOL(bio_phys_segments);
-EXPORT_SYMBOL(bio_add_page);
-EXPORT_SYMBOL(bio_add_pc_page);
-EXPORT_SYMBOL(bio_get_nr_vecs);
-EXPORT_SYMBOL(bio_map_user);
-EXPORT_SYMBOL(bio_unmap_user);
-EXPORT_SYMBOL(bio_map_kern);
-EXPORT_SYMBOL(bio_copy_kern);
-EXPORT_SYMBOL(bio_pair_release);
-EXPORT_SYMBOL(bio_split);
-EXPORT_SYMBOL(bio_copy_user);
-EXPORT_SYMBOL(bio_uncopy_user);
-EXPORT_SYMBOL(bioset_create);
-EXPORT_SYMBOL(bioset_free);
-EXPORT_SYMBOL(bio_alloc_bioset);
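The reworked bio_alloc() comment above encodes the mempool rule: with __GFP_WAIT the allocation cannot fail, but only if each caller holds at most one fs_bio_set bio at a time. A hedged sketch of a conforming caller (the page array, sector math, and completion handler are assumptions, not code from this patch):

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Illustrative writer: each bio is submitted before the next bio_alloc(),
 * so we never sit on more than one mempool-backed bio and cannot livelock
 * the pool under memory pressure. */
static void write_pages(struct block_device *bdev, struct page **pages,
			int nr, sector_t sector, bio_end_io_t *end_io)
{
	int i;

	for (i = 0; i < nr; i++) {
		struct bio *bio = bio_alloc(GFP_NOIO, 1); /* __GFP_WAIT: never NULL */

		bio->bi_bdev = bdev;
		bio->bi_sector = sector + i * (PAGE_SIZE >> 9);
		bio->bi_end_io = end_io;
		bio_add_page(bio, pages[i], PAGE_SIZE, 0);
		submit_bio(WRITE, bio); /* release our hold before the next alloc */
	}
}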
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9cf4b926f8e4..8bed0557d88c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1248,8 +1248,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
 		}
 	} else {
-		put_disk(disk);
 		module_put(disk->fops->owner);
+		put_disk(disk);
 		disk = NULL;
 		if (bdev->bd_contains == bdev) {
 			if (bdev->bd_disk->fops->open) {
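The two-line swap above is a use-after-free fix: put_disk() may drop the last reference and free the gendisk, after which disk->fops cannot be dereferenced to find the module owner. A sketch of the resulting release order (hypothetical helper, not from the patch):

#include <linux/genhd.h>
#include <linux/module.h>

static void release_disk(struct gendisk *disk)
{
	/* drop the module ref while disk (and disk->fops) is still valid */
	module_put(disk->fops->owner);
	/* may free *disk; nothing after this may touch it */
	put_disk(disk);
}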
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f128427b995b..361604244271 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
 #include "btrfs_inode.h"
 #include "xattr.h"
 
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
 
 static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 {
@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
 	.set	= btrfs_xattr_acl_access_set,
 };
 
-#else /* CONFIG_FS_POSIX_ACL */
+#else /* CONFIG_BTRFS_FS_POSIX_ACL */
 
 int btrfs_acl_chmod(struct inode *inode)
 {
@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
 	return 0;
 }
 
-#endif /* CONFIG_FS_POSIX_ACL */
+#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 282ca085c2fb..c0861e781cdb 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,51 @@ struct btrfs_worker_thread {
 };
 
 /*
+ * btrfs_start_workers uses kthread_run, which can block waiting for memory
+ * for a very long time. It will actually throttle on page writeback,
+ * and so it may not make progress until after our btrfs worker threads
+ * process all of the pending work structs in their queue.
+ *
+ * This means we can't use btrfs_start_workers from inside a btrfs worker
+ * thread that is used as part of cleaning dirty memory, which pretty much
+ * involves all of the worker threads.
+ *
+ * Instead we have a helper queue which never has more than one thread,
+ * where we schedule thread start operations. This worker_start struct
+ * is used to contain the work and hold a pointer to the queue that needs
+ * another worker.
+ */
+struct worker_start {
+	struct btrfs_work work;
+	struct btrfs_workers *queue;
+};
+
+static void start_new_worker_func(struct btrfs_work *work)
+{
+	struct worker_start *start;
+	start = container_of(work, struct worker_start, work);
+	btrfs_start_workers(start->queue, 1);
+	kfree(start);
+}
+
+static int start_new_worker(struct btrfs_workers *queue)
+{
+	struct worker_start *start;
+	int ret;
+
+	start = kzalloc(sizeof(*start), GFP_NOFS);
+	if (!start)
+		return -ENOMEM;
+
+	start->work.func = start_new_worker_func;
+	start->queue = queue;
+	ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
+	if (ret)
+		kfree(start);
+	return ret;
+}
+
+/*
 * helper function to move a thread onto the idle list after it
 * has finished some requests.
 */
@@ -118,11 +163,13 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 		goto out;
 
 	workers->atomic_start_pending = 0;
-	if (workers->num_workers >= workers->max_workers)
+	if (workers->num_workers + workers->num_workers_starting >=
+	    workers->max_workers)
 		goto out;
 
+	workers->num_workers_starting += 1;
 	spin_unlock_irqrestore(&workers->lock, flags);
-	btrfs_start_workers(workers, 1);
+	start_new_worker(workers);
 	return;
 
 out:
@@ -390,9 +437,11 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 /*
 * simple init on struct btrfs_workers
 */
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+			struct btrfs_workers *async_helper)
 {
 	workers->num_workers = 0;
+	workers->num_workers_starting = 0;
 	INIT_LIST_HEAD(&workers->worker_list);
 	INIT_LIST_HEAD(&workers->idle_list);
 	INIT_LIST_HEAD(&workers->order_list);
@@ -404,14 +453,15 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
 	workers->name = name;
 	workers->ordered = 0;
 	workers->atomic_start_pending = 0;
-	workers->atomic_worker_start = 0;
+	workers->atomic_worker_start = async_helper;
 }
 
 /*
 * starts new worker threads. This does not enforce the max worker
 * count in case you need to temporarily go past it.
 */
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers,
+				 int num_workers)
 {
 	struct btrfs_worker_thread *worker;
 	int ret = 0;
@@ -444,6 +494,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 		list_add_tail(&worker->worker_list, &workers->idle_list);
 		worker->idle = 1;
 		workers->num_workers++;
+		workers->num_workers_starting--;
+		WARN_ON(workers->num_workers_starting < 0);
 		spin_unlock_irq(&workers->lock);
 	}
 	return 0;
@@ -452,6 +504,14 @@ fail:
 	return ret;
 }
 
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+	spin_lock_irq(&workers->lock);
+	workers->num_workers_starting += num_workers;
+	spin_unlock_irq(&workers->lock);
+	return __btrfs_start_workers(workers, num_workers);
+}
+
 /*
 * run through the list and find a worker thread that doesn't have a lot
 * to do right now. This can return null if we aren't yet at the thread
@@ -461,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 {
 	struct btrfs_worker_thread *worker;
 	struct list_head *next;
-	int enforce_min = workers->num_workers < workers->max_workers;
+	int enforce_min;
+
+	enforce_min = (workers->num_workers + workers->num_workers_starting) <
+		workers->max_workers;
 
 	/*
 	 * if we find an idle thread, don't move it to the end of the
@@ -509,15 +572,17 @@ again:
 	worker = next_worker(workers);
 
 	if (!worker) {
-		if (workers->num_workers >= workers->max_workers) {
+		if (workers->num_workers + workers->num_workers_starting >=
+		    workers->max_workers) {
 			goto fallback;
 		} else if (workers->atomic_worker_start) {
 			workers->atomic_start_pending = 1;
 			goto fallback;
 		} else {
+			workers->num_workers_starting++;
 			spin_unlock_irqrestore(&workers->lock, flags);
 			/* we're below the limit, start another worker */
-			btrfs_start_workers(workers, 1);
+			__btrfs_start_workers(workers, 1);
 			goto again;
 		}
 	}
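The helper-queue scheme above exists so pools that queue work from contexts that must not block in kthread_run (the endio pools are fed from I/O completion) can still grow: they push a worker_start onto a single, always-sleepable queue instead. A hedged sketch of how the pools get wired together, mirroring the open_ctree() changes later in this series:

#include "async-thread.h"

static struct btrfs_workers generic_worker;
static struct btrfs_workers endio_workers;

/* Hypothetical setup helper: "genwork" may sleep in kthread_run; the
 * endio pool defers its thread creation to it via atomic_worker_start. */
static void setup_worker_pools(int pool_size)
{
	btrfs_init_workers(&generic_worker, "genwork", 1, NULL);
	btrfs_init_workers(&endio_workers, "endio", pool_size,
			   &generic_worker);
	btrfs_start_workers(&generic_worker, 1);
	btrfs_start_workers(&endio_workers, 1);
}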
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index fc089b95ec14..5077746cf85e 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ struct btrfs_workers {
 	/* current number of running workers */
 	int num_workers;
 
+	int num_workers_starting;
+
 	/* max number of workers allowed. changed by btrfs_start_workers */
 	int max_workers;
 
@@ -78,9 +80,10 @@ struct btrfs_workers {
 
 	/*
 	 * are we allowed to sleep while starting workers or are we required
-	 * to start them at a later time?
+	 * to start them at a later time? If we can't sleep, this indicates
+	 * which queue we need to use to schedule thread creation.
 	 */
-	int atomic_worker_start;
+	struct btrfs_workers *atomic_worker_start;
 
 	/* list with all the work threads. The workers on the idle thread
 	 * may be actively servicing jobs, but they haven't yet hit the
@@ -109,7 +112,8 @@ struct btrfs_workers {
 int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
 int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+			struct btrfs_workers *async_starter);
 int btrfs_requeue_work(struct btrfs_work *work);
 void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 82ee56bba299..f6783a42f010 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -86,6 +86,12 @@ struct btrfs_inode {
 	 * transid of the trans_handle that last modified this inode
 	 */
 	u64 last_trans;
+
+	/*
+	 * log transid when this inode was last modified
+	 */
+	u64 last_sub_trans;
+
 	/*
 	 * transid that last logged this inode
 	 */
@@ -128,6 +134,16 @@ struct btrfs_inode {
 	u64 last_unlink_trans;
 
 	/*
+	 * Counters to keep track of the number of extent items we may use due
+	 * to delalloc and such. outstanding_extents is the number of extent
+	 * items we think we'll end up using, and reserved_extents is the number
+	 * of extent items we've reserved metadata for.
+	 */
+	spinlock_t accounting_lock;
+	int reserved_extents;
+	int outstanding_extents;
+
+	/*
 	 * ordered_data_close is set by truncate when a file that used
 	 * to have good data has been truncated to zero. When it is set
 	 * the btrfs file release call will add this inode to the
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 80599b4e42bd..444b3e9b92a4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -675,21 +675,28 @@ struct btrfs_space_info {
 				   current allocations */
 	u64 bytes_readonly;	/* total bytes that are read only */
 	u64 bytes_super;	/* total bytes reserved for the super blocks */
-
-	/* delalloc accounting */
-	u64 bytes_delalloc;	/* number of bytes reserved for allocation,
-				   this space is not necessarily reserved yet
-				   by the allocator */
+	u64 bytes_root;		/* the number of bytes needed to commit a
+				   transaction */
 	u64 bytes_may_use;	/* number of bytes that may be used for
-				   delalloc */
+				   delalloc/allocations */
+	u64 bytes_delalloc;	/* number of bytes currently reserved for
+				   delayed allocation */
 
 	int full;		/* indicates that we cannot allocate any more
 				   chunks for this space */
 	int force_alloc;	/* set if we need to force a chunk alloc for
 				   this space */
+	int force_delalloc;	/* make people start doing filemap_flush until
+				   we're under a threshold */
 
 	struct list_head list;
 
+	/* for controlling how we free up space for allocations */
+	wait_queue_head_t allocate_wait;
+	wait_queue_head_t flush_wait;
+	int allocating_chunk;
+	int flushing;
+
 	/* for block groups in our same type */
 	struct list_head block_groups;
 	spinlock_t lock;
@@ -903,6 +910,7 @@ struct btrfs_fs_info {
 	 * A third pool does submit_bio to avoid deadlocking with the other
 	 * two
 	 */
+	struct btrfs_workers generic_worker;
 	struct btrfs_workers workers;
 	struct btrfs_workers delalloc_workers;
 	struct btrfs_workers endio_workers;
@@ -910,6 +918,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
+	struct btrfs_workers enospc_workers;
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
 	 * the cow mechanism and make them safe to write. It happens
@@ -1000,7 +1009,10 @@ struct btrfs_root {
 	atomic_t log_writers;
 	atomic_t log_commit[2];
 	unsigned long log_transid;
+	unsigned long last_log_commit;
 	unsigned long log_batch;
+	pid_t log_start_pid;
+	bool log_multiple_pids;
 
 	u64 objectid;
 	u64 last_trans;
@@ -1141,6 +1153,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_FLUSHONCOMMIT	(1 << 7)
 #define BTRFS_MOUNT_SSD_SPREAD		(1 << 8)
 #define BTRFS_MOUNT_NOSSD		(1 << 9)
+#define BTRFS_MOUNT_DISCARD		(1 << 10)
 
 #define btrfs_clear_opt(o, opt)	((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)	((o) |= BTRFS_MOUNT_##opt)
@@ -2022,7 +2035,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 
-int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+					struct inode *inode, int num_items);
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+					struct inode *inode, int num_items);
 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 				u64 bytes);
 void btrfs_free_reserved_data_space(struct btrfs_root *root,
@@ -2314,7 +2332,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
 int btrfs_invalidate_inodes(struct btrfs_root *root);
-extern struct dentry_operations btrfs_dentry_operations;
+extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -2326,7 +2344,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			    int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
-extern struct file_operations btrfs_file_operations;
+extern const struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 locked_end,
@@ -2357,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
 /* acl.c */
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
 int btrfs_check_acl(struct inode *inode, int mask);
 #else
 #define btrfs_check_acl NULL
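The prototypes added above replace the old "keep 5% free" heuristic with explicit credit accounting. A hypothetical caller sketch (not from this patch) showing the intended reserve/unreserve pairing around a metadata update:

static int update_three_items(struct btrfs_root *root)
{
	int ret;

	/* reserve worst-case metadata space for 3 item insertions */
	ret = btrfs_reserve_metadata_space(root, 3);
	if (ret)
		return ret;

	/* ... join a transaction and insert/update up to 3 items ... */

	/* give the unused credits back */
	btrfs_unreserve_metadata_space(root, 3);
	return 0;
}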
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 644e796fd643..02b6afbd7450 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -822,14 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 
 int btrfs_write_tree_block(struct extent_buffer *buf)
 {
-	return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
-				      buf->start + buf->len - 1, WB_SYNC_ALL);
+	return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+					buf->start + buf->len - 1);
 }
 
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
-	return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
-						  buf->start, buf->start + buf->len - 1);
+	return filemap_fdatawait_range(buf->first_page->mapping,
+				       buf->start, buf->start + buf->len - 1);
 }
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -917,6 +917,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	atomic_set(&root->log_writers, 0);
 	root->log_batch = 0;
 	root->log_transid = 0;
+	root->last_log_commit = 0;
 	extent_io_tree_init(&root->dirty_log_pages,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 
@@ -1087,6 +1088,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	WARN_ON(root->log_root);
 	root->log_root = log_root;
 	root->log_transid = 0;
+	root->last_log_commit = 0;
 	return 0;
 }
 
@@ -1630,7 +1632,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
-	fs_info->metadata_ratio = 8;
+	fs_info->metadata_ratio = 0;
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
@@ -1746,21 +1748,25 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		err = -EINVAL;
 		goto fail_iput;
 	}
-printk("thread pool is %d\n", fs_info->thread_pool_size);
-	/*
-	 * we need to start all the end_io workers up front because the
-	 * queue work function gets called at interrupt time, and so it
-	 * cannot dynamically grow.
-	 */
+
+	btrfs_init_workers(&fs_info->generic_worker,
+			   "genwork", 1, NULL);
+
 	btrfs_init_workers(&fs_info->workers, "worker",
-			   fs_info->thread_pool_size);
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 
 	btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
-			   fs_info->thread_pool_size);
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 
 	btrfs_init_workers(&fs_info->submit_workers, "submit",
 			   min_t(u64, fs_devices->num_devices,
-			   fs_info->thread_pool_size));
+			   fs_info->thread_pool_size),
+			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->enospc_workers, "enospc",
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 
 	/* a higher idle thresh on the submit workers makes it much more
 	 * likely that bios will be send down in a sane order to the
@@ -1774,15 +1780,20 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
 	fs_info->delalloc_workers.idle_thresh = 2;
 	fs_info->delalloc_workers.ordered = 1;
 
-	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
+	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
+			   &fs_info->generic_worker);
 	btrfs_init_workers(&fs_info->endio_workers, "endio",
-			   fs_info->thread_pool_size);
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 	btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
-			   fs_info->thread_pool_size);
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 	btrfs_init_workers(&fs_info->endio_meta_write_workers,
-			   "endio-meta-write", fs_info->thread_pool_size);
+			   "endio-meta-write", fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
-			   fs_info->thread_pool_size);
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 
 	/*
 	 * endios are largely parallel and should have a very
@@ -1794,12 +1805,8 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
 	fs_info->endio_write_workers.idle_thresh = 2;
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
 
-	fs_info->endio_workers.atomic_worker_start = 1;
-	fs_info->endio_meta_workers.atomic_worker_start = 1;
-	fs_info->endio_write_workers.atomic_worker_start = 1;
-	fs_info->endio_meta_write_workers.atomic_worker_start = 1;
-
 	btrfs_start_workers(&fs_info->workers, 1);
+	btrfs_start_workers(&fs_info->generic_worker, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
 	btrfs_start_workers(&fs_info->delalloc_workers, 1);
 	btrfs_start_workers(&fs_info->fixup_workers, 1);
@@ -1807,6 +1814,7 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
 	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
 	btrfs_start_workers(&fs_info->endio_write_workers, 1);
+	btrfs_start_workers(&fs_info->enospc_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2012,6 +2020,7 @@ fail_chunk_root:
 	free_extent_buffer(chunk_root->node);
 	free_extent_buffer(chunk_root->commit_root);
 fail_sb_buffer:
+	btrfs_stop_workers(&fs_info->generic_worker);
 	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
@@ -2020,6 +2029,7 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
+	btrfs_stop_workers(&fs_info->enospc_workers);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
@@ -2437,6 +2447,7 @@ int close_ctree(struct btrfs_root *root)
 
 	iput(fs_info->btree_inode);
 
+	btrfs_stop_workers(&fs_info->generic_worker);
 	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
@@ -2445,6 +2456,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
+	btrfs_stop_workers(&fs_info->enospc_workers);
 
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 993f93ff7ba6..94627c4cc193 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -68,6 +68,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
68 struct extent_buffer **must_clean); 68 struct extent_buffer **must_clean);
69static int find_next_key(struct btrfs_path *path, int level, 69static int find_next_key(struct btrfs_path *path, int level,
70 struct btrfs_key *key); 70 struct btrfs_key *key);
71static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
72 int dump_block_groups);
71 73
72static noinline int 74static noinline int
73block_group_cache_done(struct btrfs_block_group_cache *cache) 75block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -1566,23 +1568,23 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1566 return ret; 1568 return ret;
1567} 1569}
1568 1570
1569#ifdef BIO_RW_DISCARD
1570static void btrfs_issue_discard(struct block_device *bdev, 1571static void btrfs_issue_discard(struct block_device *bdev,
1571 u64 start, u64 len) 1572 u64 start, u64 len)
1572{ 1573{
1573 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1574 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1574 DISCARD_FL_BARRIER); 1575 DISCARD_FL_BARRIER);
1575} 1576}
1576#endif
1577 1577
1578static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1578static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1579 u64 num_bytes) 1579 u64 num_bytes)
1580{ 1580{
1581#ifdef BIO_RW_DISCARD
1582 int ret; 1581 int ret;
1583 u64 map_length = num_bytes; 1582 u64 map_length = num_bytes;
1584 struct btrfs_multi_bio *multi = NULL; 1583 struct btrfs_multi_bio *multi = NULL;
1585 1584
1585 if (!btrfs_test_opt(root, DISCARD))
1586 return 0;
1587
1586 /* Tell the block device(s) that the sectors can be discarded */ 1588 /* Tell the block device(s) that the sectors can be discarded */
1587 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, 1589 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
1588 bytenr, &map_length, &multi, 0); 1590 bytenr, &map_length, &multi, 0);
@@ -1602,9 +1604,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1602 } 1604 }
1603 1605
1604 return ret; 1606 return ret;
1605#else
1606 return 0;
1607#endif
1608} 1607}
1609 1608
1610int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1609int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@@ -2765,67 +2764,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2765 alloc_target); 2764 alloc_target);
2766} 2765}
2767 2766
2767static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2768{
2769 u64 num_bytes;
2770 int level;
2771
2772 level = BTRFS_MAX_LEVEL - 2;
2773 /*
2774 * NOTE: these calculations are absolutely the worst possible case.
2775 * This assumes that _every_ item we insert will require a new leaf, and
2776 * that the tree has grown to its maximum level size.
2777 */
2778
2779 /*
 2780 * for every item we insert we could insert both an extent item and an
 2781 * extent ref item. Then for every item we insert, we will need to cow
2782 * both the original leaf, plus the leaf to the left and right of it.
2783 *
 2784 * The exception is the extent root, where we just want the number of
 2785 * items * 2, since all we need is the extent item plus its ref.
2786 */
2787 if (root == root->fs_info->extent_root)
2788 num_bytes = num_items * 2;
2789 else
2790 num_bytes = (num_items + (2 * num_items)) * 3;
2791
2792 /*
2793 * num_bytes is total number of leaves we could need times the leaf
2794 * size, and then for every leaf we could end up cow'ing 2 nodes per
2795 * level, down to the leaf level.
2796 */
2797 num_bytes = (num_bytes * root->leafsize) +
2798 (num_bytes * (level * 2)) * root->nodesize;
2799
2800 return num_bytes;
2801}
2802
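A rough worked example of the formula above, assuming BTRFS_MAX_LEVEL is 8 (so level is 6) and 4K leaf/node sizes; both constants are assumptions for illustration, not spelled out in this hunk:

	/*
	 * one item inserted into a non-extent-root tree:
	 *   num_bytes = (1 + 2 * 1) * 3               =  9 leaves
	 *   bytes     = 9 * 4096 + (9 * (6 * 2)) * 4096
	 *             = 36864 + 442368                = 479232 (~468K)
	 */

so even a single-item insertion reserves close to half a megabyte of worst-case headroom.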
2768/* 2803/*
 2769 * for now this just makes sure we have at least 5% of our metadata space free 2804 * Unreserve metadata space for delalloc. If we have fewer reserved credits than
2770 * for use. 2805 * we have extents, this function does nothing.
2771 */ 2806 */
2772int btrfs_check_metadata_free_space(struct btrfs_root *root) 2807int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2808 struct inode *inode, int num_items)
2773{ 2809{
2774 struct btrfs_fs_info *info = root->fs_info; 2810 struct btrfs_fs_info *info = root->fs_info;
2775 struct btrfs_space_info *meta_sinfo; 2811 struct btrfs_space_info *meta_sinfo;
2776 u64 alloc_target, thresh; 2812 u64 num_bytes;
2777 int committed = 0, ret; 2813 u64 alloc_target;
2814 bool bug = false;
2778 2815
2779 /* get the space info for where the metadata will live */ 2816 /* get the space info for where the metadata will live */
2780 alloc_target = btrfs_get_alloc_profile(root, 0); 2817 alloc_target = btrfs_get_alloc_profile(root, 0);
2781 meta_sinfo = __find_space_info(info, alloc_target); 2818 meta_sinfo = __find_space_info(info, alloc_target);
2782 if (!meta_sinfo)
2783 goto alloc;
2784 2819
2785again: 2820 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2821 num_items);
2822
2786 spin_lock(&meta_sinfo->lock); 2823 spin_lock(&meta_sinfo->lock);
2787 if (!meta_sinfo->full) 2824 spin_lock(&BTRFS_I(inode)->accounting_lock);
2788 thresh = meta_sinfo->total_bytes * 80; 2825 if (BTRFS_I(inode)->reserved_extents <=
2789 else 2826 BTRFS_I(inode)->outstanding_extents) {
2790 thresh = meta_sinfo->total_bytes * 95; 2827 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2828 spin_unlock(&meta_sinfo->lock);
2829 return 0;
2830 }
2831 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2832
2833 BTRFS_I(inode)->reserved_extents--;
2834 BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2835
2836 if (meta_sinfo->bytes_delalloc < num_bytes) {
2837 bug = true;
2838 meta_sinfo->bytes_delalloc = 0;
2839 } else {
2840 meta_sinfo->bytes_delalloc -= num_bytes;
2841 }
2842 spin_unlock(&meta_sinfo->lock);
2843
2844 BUG_ON(bug);
2791 2845
2846 return 0;
2847}
2848
2849static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2850{
2851 u64 thresh;
2852
2853 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2854 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
2855 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2856 meta_sinfo->bytes_may_use;
2857
2858 thresh = meta_sinfo->total_bytes - thresh;
2859 thresh *= 80;
2792 do_div(thresh, 100); 2860 do_div(thresh, 100);
2861 if (thresh <= meta_sinfo->bytes_delalloc)
2862 meta_sinfo->force_delalloc = 1;
2863 else
2864 meta_sinfo->force_delalloc = 0;
2865}
2793 2866
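Put concretely, with made-up numbers: given 8GB of metadata space of which 3GB is already accounted to used/reserved/pinned/readonly/super/root/may_use, the headroom is 5GB and the threshold 4GB; once bytes_delalloc crosses 4GB, force_delalloc is set and the delalloc reserve path below responds by kicking filemap_flush() on the inode before taking on new reservations.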
2794 if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2867struct async_flush {
2795 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2868 struct btrfs_root *root;
2796 meta_sinfo->bytes_super > thresh) { 2869 struct btrfs_space_info *info;
2797 struct btrfs_trans_handle *trans; 2870 struct btrfs_work work;
2798 if (!meta_sinfo->full) { 2871};
2799 meta_sinfo->force_alloc = 1; 2872
2873static noinline void flush_delalloc_async(struct btrfs_work *work)
2874{
2875 struct async_flush *async;
2876 struct btrfs_root *root;
2877 struct btrfs_space_info *info;
2878
2879 async = container_of(work, struct async_flush, work);
2880 root = async->root;
2881 info = async->info;
2882
2883 btrfs_start_delalloc_inodes(root);
2884 wake_up(&info->flush_wait);
2885 btrfs_wait_ordered_extents(root, 0);
2886
2887 spin_lock(&info->lock);
2888 info->flushing = 0;
2889 spin_unlock(&info->lock);
2890 wake_up(&info->flush_wait);
2891
2892 kfree(async);
2893}
2894
2895static void wait_on_flush(struct btrfs_space_info *info)
2896{
2897 DEFINE_WAIT(wait);
2898 u64 used;
2899
2900 while (1) {
2901 prepare_to_wait(&info->flush_wait, &wait,
2902 TASK_UNINTERRUPTIBLE);
2903 spin_lock(&info->lock);
2904 if (!info->flushing) {
2905 spin_unlock(&info->lock);
2906 break;
2907 }
2908
2909 used = info->bytes_used + info->bytes_reserved +
2910 info->bytes_pinned + info->bytes_readonly +
2911 info->bytes_super + info->bytes_root +
2912 info->bytes_may_use + info->bytes_delalloc;
2913 if (used < info->total_bytes) {
2914 spin_unlock(&info->lock);
2915 break;
2916 }
2917 spin_unlock(&info->lock);
2918 schedule();
2919 }
2920 finish_wait(&info->flush_wait, &wait);
2921}
2922
2923static void flush_delalloc(struct btrfs_root *root,
2924 struct btrfs_space_info *info)
2925{
2926 struct async_flush *async;
2927 bool wait = false;
2928
2929 spin_lock(&info->lock);
2930
2931 if (!info->flushing) {
2932 info->flushing = 1;
2933 init_waitqueue_head(&info->flush_wait);
2934 } else {
2935 wait = true;
2936 }
2937
2938 spin_unlock(&info->lock);
2939
2940 if (wait) {
2941 wait_on_flush(info);
2942 return;
2943 }
2944
2945 async = kzalloc(sizeof(*async), GFP_NOFS);
2946 if (!async)
2947 goto flush;
2948
2949 async->root = root;
2950 async->info = info;
2951 async->work.func = flush_delalloc_async;
2952
2953 btrfs_queue_worker(&root->fs_info->enospc_workers,
2954 &async->work);
2955 wait_on_flush(info);
2956 return;
2957
2958flush:
2959 btrfs_start_delalloc_inodes(root);
2960 btrfs_wait_ordered_extents(root, 0);
2961
2962 spin_lock(&info->lock);
2963 info->flushing = 0;
2964 spin_unlock(&info->lock);
2965 wake_up(&info->flush_wait);
2966}
2967
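The flushing flag and flush_wait queue above form a single-flusher gate: the first caller starts (or, if the allocation fails, performs) the flush, while everyone arriving in the meantime only waits for it to finish or for space to appear. A minimal user-space analog of that gate, using a mutex and condition variable in place of the spinlock and waitqueue (an illustrative sketch, not btrfs code):

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t flush_done = PTHREAD_COND_INITIALIZER;
	static bool flushing;

	static void do_flush(void)
	{
		/* stand-in for starting delalloc writeout and waiting
		 * on ordered extents */
	}

	void flush_once(void)
	{
		pthread_mutex_lock(&lock);
		if (flushing) {
			/* somebody else is flushing; just wait for them */
			while (flushing)
				pthread_cond_wait(&flush_done, &lock);
			pthread_mutex_unlock(&lock);
			return;
		}
		flushing = true;
		pthread_mutex_unlock(&lock);

		do_flush();

		pthread_mutex_lock(&lock);
		flushing = false;
		pthread_cond_broadcast(&flush_done);
		pthread_mutex_unlock(&lock);
	}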
2968static int maybe_allocate_chunk(struct btrfs_root *root,
2969 struct btrfs_space_info *info)
2970{
2971 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2972 struct btrfs_trans_handle *trans;
2973 bool wait = false;
2974 int ret = 0;
2975 u64 min_metadata;
2976 u64 free_space;
2977
2978 free_space = btrfs_super_total_bytes(disk_super);
2979 /*
 2980 * we allow the metadata to grow to a max of 10GB or 5% of the space
 2981 * in the volume, whichever is smaller.
2982 */
2983 min_metadata = min((u64)10 * 1024 * 1024 * 1024,
2984 div64_u64(free_space * 5, 100));
2985 if (info->total_bytes >= min_metadata) {
2986 spin_unlock(&info->lock);
2987 return 0;
2988 }
2989
2990 if (info->full) {
2991 spin_unlock(&info->lock);
2992 return 0;
2993 }
2994
2995 if (!info->allocating_chunk) {
2996 info->force_alloc = 1;
2997 info->allocating_chunk = 1;
2998 init_waitqueue_head(&info->allocate_wait);
2999 } else {
3000 wait = true;
3001 }
3002
3003 spin_unlock(&info->lock);
3004
3005 if (wait) {
3006 wait_event(info->allocate_wait,
3007 !info->allocating_chunk);
3008 return 1;
3009 }
3010
3011 trans = btrfs_start_transaction(root, 1);
3012 if (!trans) {
3013 ret = -ENOMEM;
3014 goto out;
3015 }
3016
3017 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3018 4096 + 2 * 1024 * 1024,
3019 info->flags, 0);
3020 btrfs_end_transaction(trans, root);
3021 if (ret)
3022 goto out;
3023out:
3024 spin_lock(&info->lock);
3025 info->allocating_chunk = 0;
3026 spin_unlock(&info->lock);
3027 wake_up(&info->allocate_wait);
3028
3029 if (ret)
3030 return 0;
3031 return 1;
3032}
3033
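To make min_metadata concrete: on a 100GB volume, min(10GB, 5% of 100GB) is 5GB, so metadata may grow to 5GB through this path; on a 2TB volume the 5% term is roughly 100GB and the 10GB constant wins. Once info->total_bytes reaches the cap, maybe_allocate_chunk() returns 0 and the reservation paths fall back to flushing delalloc instead of allocating.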
3034/*
3035 * Reserve metadata space for delalloc.
3036 */
3037int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3038 struct inode *inode, int num_items)
3039{
3040 struct btrfs_fs_info *info = root->fs_info;
3041 struct btrfs_space_info *meta_sinfo;
3042 u64 num_bytes;
3043 u64 used;
3044 u64 alloc_target;
3045 int flushed = 0;
3046 int force_delalloc;
3047
3048 /* get the space info for where the metadata will live */
3049 alloc_target = btrfs_get_alloc_profile(root, 0);
3050 meta_sinfo = __find_space_info(info, alloc_target);
3051
3052 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3053 num_items);
3054again:
3055 spin_lock(&meta_sinfo->lock);
3056
3057 force_delalloc = meta_sinfo->force_delalloc;
3058
3059 if (unlikely(!meta_sinfo->bytes_root))
3060 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3061
3062 if (!flushed)
3063 meta_sinfo->bytes_delalloc += num_bytes;
3064
3065 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3066 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3067 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3068 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3069
3070 if (used > meta_sinfo->total_bytes) {
3071 flushed++;
3072
3073 if (flushed == 1) {
3074 if (maybe_allocate_chunk(root, meta_sinfo))
3075 goto again;
3076 flushed++;
3077 } else {
2800 spin_unlock(&meta_sinfo->lock); 3078 spin_unlock(&meta_sinfo->lock);
2801alloc: 3079 }
2802 trans = btrfs_start_transaction(root, 1);
2803 if (!trans)
2804 return -ENOMEM;
2805 3080
2806 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3081 if (flushed == 2) {
2807 2 * 1024 * 1024, alloc_target, 0); 3082 filemap_flush(inode->i_mapping);
2808 btrfs_end_transaction(trans, root); 3083 goto again;
2809 if (!meta_sinfo) { 3084 } else if (flushed == 3) {
2810 meta_sinfo = __find_space_info(info, 3085 flush_delalloc(root, meta_sinfo);
2811 alloc_target);
2812 }
2813 goto again; 3086 goto again;
2814 } 3087 }
3088 spin_lock(&meta_sinfo->lock);
3089 meta_sinfo->bytes_delalloc -= num_bytes;
2815 spin_unlock(&meta_sinfo->lock); 3090 spin_unlock(&meta_sinfo->lock);
3091 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3092 BTRFS_I(inode)->outstanding_extents,
3093 BTRFS_I(inode)->reserved_extents);
3094 dump_space_info(meta_sinfo, 0, 0);
3095 return -ENOSPC;
3096 }
2816 3097
2817 if (!committed) { 3098 BTRFS_I(inode)->reserved_extents++;
2818 committed = 1; 3099 check_force_delalloc(meta_sinfo);
2819 trans = btrfs_join_transaction(root, 1); 3100 spin_unlock(&meta_sinfo->lock);
2820 if (!trans) 3101
2821 return -ENOMEM; 3102 if (!flushed && force_delalloc)
2822 ret = btrfs_commit_transaction(trans, root); 3103 filemap_flush(inode->i_mapping);
2823 if (ret) 3104
2824 return ret; 3105 return 0;
3106}
3107
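A sketch of how a write path is expected to drive this pair, modeled loosely on the btrfs_file_write() change later in this patch (do_the_buffered_write is a placeholder and error handling is trimmed, so treat it as illustrative only):

	err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
	if (err)
		return err;	/* -ENOSPC survived chunk alloc and flushing */

	err = do_the_buffered_write(inode);	/* dirties delalloc pages */

	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
	return err;

The unreserve side is a no-op once reserved_extents has dropped to the outstanding extent count, which is why callers can pair the two calls unconditionally.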
3108/*
 3109 * unreserve num_items items' worth of metadata space. This needs to
3110 * be paired with btrfs_reserve_metadata_space.
3111 *
3112 * NOTE: if you have the option, run this _AFTER_ you do a
3113 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
 3114 * operations which will result in more used metadata, so we want to make sure we
3115 * can do that without issue.
3116 */
3117int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3118{
3119 struct btrfs_fs_info *info = root->fs_info;
3120 struct btrfs_space_info *meta_sinfo;
3121 u64 num_bytes;
3122 u64 alloc_target;
3123 bool bug = false;
3124
3125 /* get the space info for where the metadata will live */
3126 alloc_target = btrfs_get_alloc_profile(root, 0);
3127 meta_sinfo = __find_space_info(info, alloc_target);
3128
3129 num_bytes = calculate_bytes_needed(root, num_items);
3130
3131 spin_lock(&meta_sinfo->lock);
3132 if (meta_sinfo->bytes_may_use < num_bytes) {
3133 bug = true;
3134 meta_sinfo->bytes_may_use = 0;
3135 } else {
3136 meta_sinfo->bytes_may_use -= num_bytes;
3137 }
3138 spin_unlock(&meta_sinfo->lock);
3139
3140 BUG_ON(bug);
3141
3142 return 0;
3143}
3144
3145/*
 3146 * Reserve some metadata space for use. We'll calculate the worst case number
3147 * of bytes that would be needed to modify num_items number of items. If we
3148 * have space, fantastic, if not, you get -ENOSPC. Please call
3149 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3150 * items you reserved, since whatever metadata you needed should have already
3151 * been allocated.
3152 *
 3153 * This will try to allocate a chunk or flush delalloc to make more space if we
 3154 * don't have enough metadata space. The only time we don't do this is if we're
 3155 * reserving space inside of a transaction, then we will just return -ENOSPC and
 3156 * it is the caller's responsibility to handle it properly.
3157 */
3158int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3159{
3160 struct btrfs_fs_info *info = root->fs_info;
3161 struct btrfs_space_info *meta_sinfo;
3162 u64 num_bytes;
3163 u64 used;
3164 u64 alloc_target;
3165 int retries = 0;
3166
3167 /* get the space info for where the metadata will live */
3168 alloc_target = btrfs_get_alloc_profile(root, 0);
3169 meta_sinfo = __find_space_info(info, alloc_target);
3170
3171 num_bytes = calculate_bytes_needed(root, num_items);
3172again:
3173 spin_lock(&meta_sinfo->lock);
3174
3175 if (unlikely(!meta_sinfo->bytes_root))
3176 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3177
3178 if (!retries)
3179 meta_sinfo->bytes_may_use += num_bytes;
3180
3181 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3182 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3183 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3184 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3185
3186 if (used > meta_sinfo->total_bytes) {
3187 retries++;
3188 if (retries == 1) {
3189 if (maybe_allocate_chunk(root, meta_sinfo))
3190 goto again;
3191 retries++;
3192 } else {
3193 spin_unlock(&meta_sinfo->lock);
3194 }
3195
3196 if (retries == 2) {
3197 flush_delalloc(root, meta_sinfo);
2825 goto again; 3198 goto again;
2826 } 3199 }
3200 spin_lock(&meta_sinfo->lock);
3201 meta_sinfo->bytes_may_use -= num_bytes;
3202 spin_unlock(&meta_sinfo->lock);
3203
3204 dump_space_info(meta_sinfo, 0, 0);
2827 return -ENOSPC; 3205 return -ENOSPC;
2828 } 3206 }
3207
3208 check_force_delalloc(meta_sinfo);
2829 spin_unlock(&meta_sinfo->lock); 3209 spin_unlock(&meta_sinfo->lock);
2830 3210
2831 return 0; 3211 return 0;
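A sketch of the pairing the NOTE above asks for, with the unreserve deliberately placed after btrfs_end_transaction() so the delayed-ref work is still covered by the reservation (modify_three_items is a placeholder; illustrative only):

	ret = btrfs_reserve_metadata_space(root, 3);
	if (ret)
		return ret;

	trans = btrfs_start_transaction(root, 1);
	ret = modify_three_items(trans, root);	/* placeholder */
	btrfs_end_transaction(trans, root);

	btrfs_unreserve_metadata_space(root, 3);
	return ret;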
@@ -2888,7 +3268,7 @@ alloc:
2888 spin_unlock(&data_sinfo->lock); 3268 spin_unlock(&data_sinfo->lock);
2889 3269
2890 /* commit the current transaction and try again */ 3270 /* commit the current transaction and try again */
2891 if (!committed) { 3271 if (!committed && !root->fs_info->open_ioctl_trans) {
2892 committed = 1; 3272 committed = 1;
2893 trans = btrfs_join_transaction(root, 1); 3273 trans = btrfs_join_transaction(root, 1);
2894 if (!trans) 3274 if (!trans)
@@ -2916,7 +3296,7 @@ alloc:
2916 BTRFS_I(inode)->reserved_bytes += bytes; 3296 BTRFS_I(inode)->reserved_bytes += bytes;
2917 spin_unlock(&data_sinfo->lock); 3297 spin_unlock(&data_sinfo->lock);
2918 3298
2919 return btrfs_check_metadata_free_space(root); 3299 return 0;
2920} 3300}
2921 3301
2922/* 3302/*
@@ -3015,17 +3395,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3015 BUG_ON(!space_info); 3395 BUG_ON(!space_info);
3016 3396
3017 spin_lock(&space_info->lock); 3397 spin_lock(&space_info->lock);
3018 if (space_info->force_alloc) { 3398 if (space_info->force_alloc)
3019 force = 1; 3399 force = 1;
3020 space_info->force_alloc = 0;
3021 }
3022 if (space_info->full) { 3400 if (space_info->full) {
3023 spin_unlock(&space_info->lock); 3401 spin_unlock(&space_info->lock);
3024 goto out; 3402 goto out;
3025 } 3403 }
3026 3404
3027 thresh = space_info->total_bytes - space_info->bytes_readonly; 3405 thresh = space_info->total_bytes - space_info->bytes_readonly;
3028 thresh = div_factor(thresh, 6); 3406 thresh = div_factor(thresh, 8);
3029 if (!force && 3407 if (!force &&
3030 (space_info->bytes_used + space_info->bytes_pinned + 3408 (space_info->bytes_used + space_info->bytes_pinned +
3031 space_info->bytes_reserved + alloc_bytes) < thresh) { 3409 space_info->bytes_reserved + alloc_bytes) < thresh) {
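div_factor(thresh, 8) works out to thresh * 8 / 10, so when force is not set a new chunk is only carved out once used + pinned + reserved space plus the pending allocation crosses 80% of the non-readonly total for this space_info.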
@@ -3039,7 +3417,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3039 * we keep a reasonable number of metadata chunks allocated in the 3417 * we keep a reasonable number of metadata chunks allocated in the
3040 * FS as well. 3418 * FS as well.
3041 */ 3419 */
3042 if (flags & BTRFS_BLOCK_GROUP_DATA) { 3420 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3043 fs_info->data_chunk_allocations++; 3421 fs_info->data_chunk_allocations++;
3044 if (!(fs_info->data_chunk_allocations % 3422 if (!(fs_info->data_chunk_allocations %
3045 fs_info->metadata_ratio)) 3423 fs_info->metadata_ratio))
@@ -3047,8 +3425,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3047 } 3425 }
3048 3426
3049 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3427 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3428 spin_lock(&space_info->lock);
3050 if (ret) 3429 if (ret)
3051 space_info->full = 1; 3430 space_info->full = 1;
3431 space_info->force_alloc = 0;
3432 spin_unlock(&space_info->lock);
3052out: 3433out:
3053 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3434 mutex_unlock(&extent_root->fs_info->chunk_mutex);
3054 return ret; 3435 return ret;
@@ -3306,6 +3687,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
3306 if (is_data) 3687 if (is_data)
3307 goto pinit; 3688 goto pinit;
3308 3689
3690 /*
3691 * discard is sloooow, and so triggering discards on
3692 * individual btree blocks isn't a good plan. Just
3693 * pin everything in discard mode.
3694 */
3695 if (btrfs_test_opt(root, DISCARD))
3696 goto pinit;
3697
3309 buf = btrfs_find_tree_block(root, bytenr, num_bytes); 3698 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3310 if (!buf) 3699 if (!buf)
3311 goto pinit; 3700 goto pinit;
@@ -3713,7 +4102,7 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
3713} 4102}
3714 4103
3715enum btrfs_loop_type { 4104enum btrfs_loop_type {
3716 LOOP_CACHED_ONLY = 0, 4105 LOOP_FIND_IDEAL = 0,
3717 LOOP_CACHING_NOWAIT = 1, 4106 LOOP_CACHING_NOWAIT = 1,
3718 LOOP_CACHING_WAIT = 2, 4107 LOOP_CACHING_WAIT = 2,
3719 LOOP_ALLOC_CHUNK = 3, 4108 LOOP_ALLOC_CHUNK = 3,
@@ -3742,11 +4131,15 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3742 struct btrfs_block_group_cache *block_group = NULL; 4131 struct btrfs_block_group_cache *block_group = NULL;
3743 int empty_cluster = 2 * 1024 * 1024; 4132 int empty_cluster = 2 * 1024 * 1024;
3744 int allowed_chunk_alloc = 0; 4133 int allowed_chunk_alloc = 0;
4134 int done_chunk_alloc = 0;
3745 struct btrfs_space_info *space_info; 4135 struct btrfs_space_info *space_info;
3746 int last_ptr_loop = 0; 4136 int last_ptr_loop = 0;
3747 int loop = 0; 4137 int loop = 0;
3748 bool found_uncached_bg = false; 4138 bool found_uncached_bg = false;
3749 bool failed_cluster_refill = false; 4139 bool failed_cluster_refill = false;
4140 bool failed_alloc = false;
4141 u64 ideal_cache_percent = 0;
4142 u64 ideal_cache_offset = 0;
3750 4143
3751 WARN_ON(num_bytes < root->sectorsize); 4144 WARN_ON(num_bytes < root->sectorsize);
3752 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 4145 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3782,14 +4175,19 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3782 empty_cluster = 0; 4175 empty_cluster = 0;
3783 4176
3784 if (search_start == hint_byte) { 4177 if (search_start == hint_byte) {
4178ideal_cache:
3785 block_group = btrfs_lookup_block_group(root->fs_info, 4179 block_group = btrfs_lookup_block_group(root->fs_info,
3786 search_start); 4180 search_start);
3787 /* 4181 /*
3788 * we don't want to use the block group if it doesn't match our 4182 * we don't want to use the block group if it doesn't match our
 3789 * allocation bits, or if it's not cached. 4183 * allocation bits, or if it's not cached.
4184 *
4185 * However if we are re-searching with an ideal block group
4186 * picked out then we don't care that the block group is cached.
3790 */ 4187 */
3791 if (block_group && block_group_bits(block_group, data) && 4188 if (block_group && block_group_bits(block_group, data) &&
3792 block_group_cache_done(block_group)) { 4189 (block_group->cached != BTRFS_CACHE_NO ||
4190 search_start == ideal_cache_offset)) {
3793 down_read(&space_info->groups_sem); 4191 down_read(&space_info->groups_sem);
3794 if (list_empty(&block_group->list) || 4192 if (list_empty(&block_group->list) ||
3795 block_group->ro) { 4193 block_group->ro) {
@@ -3801,13 +4199,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3801 */ 4199 */
3802 btrfs_put_block_group(block_group); 4200 btrfs_put_block_group(block_group);
3803 up_read(&space_info->groups_sem); 4201 up_read(&space_info->groups_sem);
3804 } else 4202 } else {
3805 goto have_block_group; 4203 goto have_block_group;
4204 }
3806 } else if (block_group) { 4205 } else if (block_group) {
3807 btrfs_put_block_group(block_group); 4206 btrfs_put_block_group(block_group);
3808 } 4207 }
3809 } 4208 }
3810
3811search: 4209search:
3812 down_read(&space_info->groups_sem); 4210 down_read(&space_info->groups_sem);
3813 list_for_each_entry(block_group, &space_info->block_groups, list) { 4211 list_for_each_entry(block_group, &space_info->block_groups, list) {
@@ -3819,28 +4217,45 @@ search:
3819 4217
3820have_block_group: 4218have_block_group:
3821 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4219 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4220 u64 free_percent;
4221
4222 free_percent = btrfs_block_group_used(&block_group->item);
4223 free_percent *= 100;
4224 free_percent = div64_u64(free_percent,
4225 block_group->key.offset);
4226 free_percent = 100 - free_percent;
4227 if (free_percent > ideal_cache_percent &&
4228 likely(!block_group->ro)) {
4229 ideal_cache_offset = block_group->key.objectid;
4230 ideal_cache_percent = free_percent;
4231 }
4232
3822 /* 4233 /*
3823 * we want to start caching kthreads, but not too many 4234 * We only want to start kthread caching if we are at
3824 * right off the bat so we don't overwhelm the system, 4235 * the point where we will wait for caching to make
3825 * so only start them if there are less than 2 and we're 4236 * progress, or if our ideal search is over and we've
3826 * in the initial allocation phase. 4237 * found somebody to start caching.
3827 */ 4238 */
3828 if (loop > LOOP_CACHING_NOWAIT || 4239 if (loop > LOOP_CACHING_NOWAIT ||
3829 atomic_read(&space_info->caching_threads) < 2) { 4240 (loop > LOOP_FIND_IDEAL &&
4241 atomic_read(&space_info->caching_threads) < 2)) {
3830 ret = cache_block_group(block_group); 4242 ret = cache_block_group(block_group);
3831 BUG_ON(ret); 4243 BUG_ON(ret);
3832 } 4244 }
3833 }
3834
3835 cached = block_group_cache_done(block_group);
3836 if (unlikely(!cached)) {
3837 found_uncached_bg = true; 4245 found_uncached_bg = true;
3838 4246
3839 /* if we only want cached bgs, loop */ 4247 /*
 3840 if (loop == LOOP_CACHED_ONLY) 4248 * If we are still in the initial LOOP_FIND_IDEAL pass,
 4249 * try the next block group.
4250 */
4251 if (loop == LOOP_FIND_IDEAL)
3841 goto loop; 4252 goto loop;
3842 } 4253 }
3843 4254
4255 cached = block_group_cache_done(block_group);
4256 if (unlikely(!cached))
4257 found_uncached_bg = true;
4258
3844 if (unlikely(block_group->ro)) 4259 if (unlikely(block_group->ro))
3845 goto loop; 4260 goto loop;
3846 4261
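The free_percent computation above is just 100 minus the used percentage of the block group: a group with 90% of its bytes used scores 10, a 10%-used group scores 90. The search remembers the highest-scoring writable candidate in ideal_cache_offset as the fallback place to start caching if no already-cached group can satisfy the allocation.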
@@ -3951,14 +4366,23 @@ refill_cluster:
3951 4366
3952 offset = btrfs_find_space_for_alloc(block_group, search_start, 4367 offset = btrfs_find_space_for_alloc(block_group, search_start,
3953 num_bytes, empty_size); 4368 num_bytes, empty_size);
3954 if (!offset && (cached || (!cached && 4369 /*
3955 loop == LOOP_CACHING_NOWAIT))) { 4370 * If we didn't find a chunk, and we haven't failed on this
3956 goto loop; 4371 * block group before, and this block group is in the middle of
3957 } else if (!offset && (!cached && 4372 * caching and we are ok with waiting, then go ahead and wait
3958 loop > LOOP_CACHING_NOWAIT)) { 4373 * for progress to be made, and set failed_alloc to true.
4374 *
4375 * If failed_alloc is true then we've already waited on this
4376 * block group once and should move on to the next block group.
4377 */
4378 if (!offset && !failed_alloc && !cached &&
4379 loop > LOOP_CACHING_NOWAIT) {
3959 wait_block_group_cache_progress(block_group, 4380 wait_block_group_cache_progress(block_group,
3960 num_bytes + empty_size); 4381 num_bytes + empty_size);
4382 failed_alloc = true;
3961 goto have_block_group; 4383 goto have_block_group;
4384 } else if (!offset) {
4385 goto loop;
3962 } 4386 }
3963checks: 4387checks:
3964 search_start = stripe_align(root, offset); 4388 search_start = stripe_align(root, offset);
@@ -4006,13 +4430,16 @@ checks:
4006 break; 4430 break;
4007loop: 4431loop:
4008 failed_cluster_refill = false; 4432 failed_cluster_refill = false;
4433 failed_alloc = false;
4009 btrfs_put_block_group(block_group); 4434 btrfs_put_block_group(block_group);
4010 } 4435 }
4011 up_read(&space_info->groups_sem); 4436 up_read(&space_info->groups_sem);
4012 4437
 4013 /* LOOP_CACHED_ONLY, only search fully cached block groups 4438 /* LOOP_FIND_IDEAL, only search caching/cached bgs, and don't wait
 4014 * LOOP_CACHING_NOWAIT, search partially cached block groups, but 4439 * for them to make caching progress. Also
4015 * dont wait foR them to finish caching 4440 * determine the best possible bg to cache
4441 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
4442 * caching kthreads as we move along
4016 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching 4443 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
4017 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again 4444 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
4018 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 4445 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
@@ -4021,12 +4448,47 @@ loop:
4021 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4448 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4022 (found_uncached_bg || empty_size || empty_cluster || 4449 (found_uncached_bg || empty_size || empty_cluster ||
4023 allowed_chunk_alloc)) { 4450 allowed_chunk_alloc)) {
4024 if (found_uncached_bg) { 4451 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4025 found_uncached_bg = false; 4452 found_uncached_bg = false;
4026 if (loop < LOOP_CACHING_WAIT) { 4453 loop++;
4027 loop++; 4454 if (!ideal_cache_percent &&
4455 atomic_read(&space_info->caching_threads))
4028 goto search; 4456 goto search;
4029 } 4457
4458 /*
 4459 * One of the following two things has happened so far
4460 *
4461 * 1) We found an ideal block group for caching that
4462 * is mostly full and will cache quickly, so we might
4463 * as well wait for it.
4464 *
4465 * 2) We searched for cached only and we didn't find
4466 * anything, and we didn't start any caching kthreads
4467 * either, so chances are we will loop through and
4468 * start a couple caching kthreads, and then come back
4469 * around and just wait for them. This will be slower
4470 * because we will have 2 caching kthreads reading at
4471 * the same time when we could have just started one
4472 * and waited for it to get far enough to give us an
4473 * allocation, so go ahead and go to the wait caching
4474 * loop.
4475 */
4476 loop = LOOP_CACHING_WAIT;
4477 search_start = ideal_cache_offset;
4478 ideal_cache_percent = 0;
4479 goto ideal_cache;
4480 } else if (loop == LOOP_FIND_IDEAL) {
4481 /*
 4482 * Didn't find an uncached bg, wait on anything we find
4483 * next.
4484 */
4485 loop = LOOP_CACHING_WAIT;
4486 goto search;
4487 }
4488
4489 if (loop < LOOP_CACHING_WAIT) {
4490 loop++;
4491 goto search;
4030 } 4492 }
4031 4493
4032 if (loop == LOOP_ALLOC_CHUNK) { 4494 if (loop == LOOP_ALLOC_CHUNK) {
@@ -4038,7 +4500,8 @@ loop:
4038 ret = do_chunk_alloc(trans, root, num_bytes + 4500 ret = do_chunk_alloc(trans, root, num_bytes +
4039 2 * 1024 * 1024, data, 1); 4501 2 * 1024 * 1024, data, 1);
4040 allowed_chunk_alloc = 0; 4502 allowed_chunk_alloc = 0;
4041 } else { 4503 done_chunk_alloc = 1;
4504 } else if (!done_chunk_alloc) {
4042 space_info->force_alloc = 1; 4505 space_info->force_alloc = 1;
4043 } 4506 }
4044 4507
@@ -4063,21 +4526,32 @@ loop:
4063 return ret; 4526 return ret;
4064} 4527}
4065 4528
4066static void dump_space_info(struct btrfs_space_info *info, u64 bytes) 4529static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4530 int dump_block_groups)
4067{ 4531{
4068 struct btrfs_block_group_cache *cache; 4532 struct btrfs_block_group_cache *cache;
4069 4533
4534 spin_lock(&info->lock);
4070 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4535 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4071 (unsigned long long)(info->total_bytes - info->bytes_used - 4536 (unsigned long long)(info->total_bytes - info->bytes_used -
4072 info->bytes_pinned - info->bytes_reserved), 4537 info->bytes_pinned - info->bytes_reserved -
4538 info->bytes_super),
4073 (info->full) ? "" : "not "); 4539 (info->full) ? "" : "not ");
4074 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4540 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
4075 " may_use=%llu, used=%llu\n", 4541 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
4542 "\n",
4076 (unsigned long long)info->total_bytes, 4543 (unsigned long long)info->total_bytes,
4077 (unsigned long long)info->bytes_pinned, 4544 (unsigned long long)info->bytes_pinned,
4078 (unsigned long long)info->bytes_delalloc, 4545 (unsigned long long)info->bytes_delalloc,
4079 (unsigned long long)info->bytes_may_use, 4546 (unsigned long long)info->bytes_may_use,
4080 (unsigned long long)info->bytes_used); 4547 (unsigned long long)info->bytes_used,
4548 (unsigned long long)info->bytes_root,
4549 (unsigned long long)info->bytes_super,
4550 (unsigned long long)info->bytes_reserved);
4551 spin_unlock(&info->lock);
4552
4553 if (!dump_block_groups)
4554 return;
4081 4555
4082 down_read(&info->groups_sem); 4556 down_read(&info->groups_sem);
4083 list_for_each_entry(cache, &info->block_groups, list) { 4557 list_for_each_entry(cache, &info->block_groups, list) {
@@ -4145,7 +4619,7 @@ again:
4145 printk(KERN_ERR "btrfs allocation failed flags %llu, " 4619 printk(KERN_ERR "btrfs allocation failed flags %llu, "
4146 "wanted %llu\n", (unsigned long long)data, 4620 "wanted %llu\n", (unsigned long long)data,
4147 (unsigned long long)num_bytes); 4621 (unsigned long long)num_bytes);
4148 dump_space_info(sinfo, num_bytes); 4622 dump_space_info(sinfo, num_bytes, 1);
4149 } 4623 }
4150 4624
4151 return ret; 4625 return ret;
@@ -4506,6 +4980,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4506 u64 bytenr; 4980 u64 bytenr;
4507 u64 generation; 4981 u64 generation;
4508 u64 refs; 4982 u64 refs;
4983 u64 flags;
4509 u64 last = 0; 4984 u64 last = 0;
4510 u32 nritems; 4985 u32 nritems;
4511 u32 blocksize; 4986 u32 blocksize;
@@ -4543,15 +5018,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4543 generation <= root->root_key.offset) 5018 generation <= root->root_key.offset)
4544 continue; 5019 continue;
4545 5020
5021 /* We don't lock the tree block, it's OK to be racy here */
5022 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5023 &refs, &flags);
5024 BUG_ON(ret);
5025 BUG_ON(refs == 0);
5026
4546 if (wc->stage == DROP_REFERENCE) { 5027 if (wc->stage == DROP_REFERENCE) {
4547 ret = btrfs_lookup_extent_info(trans, root,
4548 bytenr, blocksize,
4549 &refs, NULL);
4550 BUG_ON(ret);
4551 BUG_ON(refs == 0);
4552 if (refs == 1) 5028 if (refs == 1)
4553 goto reada; 5029 goto reada;
4554 5030
5031 if (wc->level == 1 &&
5032 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5033 continue;
4555 if (!wc->update_ref || 5034 if (!wc->update_ref ||
4556 generation <= root->root_key.offset) 5035 generation <= root->root_key.offset)
4557 continue; 5036 continue;
@@ -4560,6 +5039,10 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4560 &wc->update_progress); 5039 &wc->update_progress);
4561 if (ret < 0) 5040 if (ret < 0)
4562 continue; 5041 continue;
5042 } else {
5043 if (wc->level == 1 &&
5044 (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5045 continue;
4563 } 5046 }
4564reada: 5047reada:
4565 ret = readahead_tree_block(root, bytenr, blocksize, 5048 ret = readahead_tree_block(root, bytenr, blocksize,
@@ -4583,7 +5066,7 @@ reada:
4583static noinline int walk_down_proc(struct btrfs_trans_handle *trans, 5066static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4584 struct btrfs_root *root, 5067 struct btrfs_root *root,
4585 struct btrfs_path *path, 5068 struct btrfs_path *path,
4586 struct walk_control *wc) 5069 struct walk_control *wc, int lookup_info)
4587{ 5070{
4588 int level = wc->level; 5071 int level = wc->level;
4589 struct extent_buffer *eb = path->nodes[level]; 5072 struct extent_buffer *eb = path->nodes[level];
@@ -4598,8 +5081,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4598 * when reference count of tree block is 1, it won't increase 5081 * when reference count of tree block is 1, it won't increase
4599 * again. once full backref flag is set, we never clear it. 5082 * again. once full backref flag is set, we never clear it.
4600 */ 5083 */
4601 if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || 5084 if (lookup_info &&
4602 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { 5085 ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5086 (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
4603 BUG_ON(!path->locks[level]); 5087 BUG_ON(!path->locks[level]);
4604 ret = btrfs_lookup_extent_info(trans, root, 5088 ret = btrfs_lookup_extent_info(trans, root,
4605 eb->start, eb->len, 5089 eb->start, eb->len,
@@ -4660,7 +5144,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4660static noinline int do_walk_down(struct btrfs_trans_handle *trans, 5144static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4661 struct btrfs_root *root, 5145 struct btrfs_root *root,
4662 struct btrfs_path *path, 5146 struct btrfs_path *path,
4663 struct walk_control *wc) 5147 struct walk_control *wc, int *lookup_info)
4664{ 5148{
4665 u64 bytenr; 5149 u64 bytenr;
4666 u64 generation; 5150 u64 generation;
@@ -4680,8 +5164,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4680 * for the subtree 5164 * for the subtree
4681 */ 5165 */
4682 if (wc->stage == UPDATE_BACKREF && 5166 if (wc->stage == UPDATE_BACKREF &&
4683 generation <= root->root_key.offset) 5167 generation <= root->root_key.offset) {
5168 *lookup_info = 1;
4684 return 1; 5169 return 1;
5170 }
4685 5171
4686 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 5172 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
4687 blocksize = btrfs_level_size(root, level - 1); 5173 blocksize = btrfs_level_size(root, level - 1);
@@ -4694,14 +5180,19 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4694 btrfs_tree_lock(next); 5180 btrfs_tree_lock(next);
4695 btrfs_set_lock_blocking(next); 5181 btrfs_set_lock_blocking(next);
4696 5182
4697 if (wc->stage == DROP_REFERENCE) { 5183 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
4698 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, 5184 &wc->refs[level - 1],
4699 &wc->refs[level - 1], 5185 &wc->flags[level - 1]);
4700 &wc->flags[level - 1]); 5186 BUG_ON(ret);
4701 BUG_ON(ret); 5187 BUG_ON(wc->refs[level - 1] == 0);
4702 BUG_ON(wc->refs[level - 1] == 0); 5188 *lookup_info = 0;
4703 5189
5190 if (wc->stage == DROP_REFERENCE) {
4704 if (wc->refs[level - 1] > 1) { 5191 if (wc->refs[level - 1] > 1) {
5192 if (level == 1 &&
5193 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5194 goto skip;
5195
4705 if (!wc->update_ref || 5196 if (!wc->update_ref ||
4706 generation <= root->root_key.offset) 5197 generation <= root->root_key.offset)
4707 goto skip; 5198 goto skip;
@@ -4715,12 +5206,17 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4715 wc->stage = UPDATE_BACKREF; 5206 wc->stage = UPDATE_BACKREF;
4716 wc->shared_level = level - 1; 5207 wc->shared_level = level - 1;
4717 } 5208 }
5209 } else {
5210 if (level == 1 &&
5211 (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5212 goto skip;
4718 } 5213 }
4719 5214
4720 if (!btrfs_buffer_uptodate(next, generation)) { 5215 if (!btrfs_buffer_uptodate(next, generation)) {
4721 btrfs_tree_unlock(next); 5216 btrfs_tree_unlock(next);
4722 free_extent_buffer(next); 5217 free_extent_buffer(next);
4723 next = NULL; 5218 next = NULL;
5219 *lookup_info = 1;
4724 } 5220 }
4725 5221
4726 if (!next) { 5222 if (!next) {
@@ -4743,21 +5239,22 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4743skip: 5239skip:
4744 wc->refs[level - 1] = 0; 5240 wc->refs[level - 1] = 0;
4745 wc->flags[level - 1] = 0; 5241 wc->flags[level - 1] = 0;
5242 if (wc->stage == DROP_REFERENCE) {
5243 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
5244 parent = path->nodes[level]->start;
5245 } else {
5246 BUG_ON(root->root_key.objectid !=
5247 btrfs_header_owner(path->nodes[level]));
5248 parent = 0;
5249 }
4746 5250
4747 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 5251 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
4748 parent = path->nodes[level]->start; 5252 root->root_key.objectid, level - 1, 0);
4749 } else { 5253 BUG_ON(ret);
4750 BUG_ON(root->root_key.objectid !=
4751 btrfs_header_owner(path->nodes[level]));
4752 parent = 0;
4753 } 5254 }
4754
4755 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
4756 root->root_key.objectid, level - 1, 0);
4757 BUG_ON(ret);
4758
4759 btrfs_tree_unlock(next); 5255 btrfs_tree_unlock(next);
4760 free_extent_buffer(next); 5256 free_extent_buffer(next);
5257 *lookup_info = 1;
4761 return 1; 5258 return 1;
4762} 5259}
4763 5260
@@ -4871,6 +5368,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4871 struct walk_control *wc) 5368 struct walk_control *wc)
4872{ 5369{
4873 int level = wc->level; 5370 int level = wc->level;
5371 int lookup_info = 1;
4874 int ret; 5372 int ret;
4875 5373
4876 while (level >= 0) { 5374 while (level >= 0) {
@@ -4878,14 +5376,14 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4878 btrfs_header_nritems(path->nodes[level])) 5376 btrfs_header_nritems(path->nodes[level]))
4879 break; 5377 break;
4880 5378
4881 ret = walk_down_proc(trans, root, path, wc); 5379 ret = walk_down_proc(trans, root, path, wc, lookup_info);
4882 if (ret > 0) 5380 if (ret > 0)
4883 break; 5381 break;
4884 5382
4885 if (level == 0) 5383 if (level == 0)
4886 break; 5384 break;
4887 5385
4888 ret = do_walk_down(trans, root, path, wc); 5386 ret = do_walk_down(trans, root, path, wc, &lookup_info);
4889 if (ret > 0) { 5387 if (ret > 0) {
4890 path->slots[level]++; 5388 path->slots[level]++;
4891 continue; 5389 continue;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0cb88f8146ea..96577e8bf9fd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
280 return NULL; 280 return NULL;
281} 281}
282 282
283static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
284 struct extent_state *other)
285{
286 if (tree->ops && tree->ops->merge_extent_hook)
287 tree->ops->merge_extent_hook(tree->mapping->host, new,
288 other);
289}
290
283/* 291/*
284 * utility function to look for merge candidates inside a given range. 292 * utility function to look for merge candidates inside a given range.
285 * Any extents with matching state are merged together into a single 293 * Any extents with matching state are merged together into a single
@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
303 other = rb_entry(other_node, struct extent_state, rb_node); 311 other = rb_entry(other_node, struct extent_state, rb_node);
304 if (other->end == state->start - 1 && 312 if (other->end == state->start - 1 &&
305 other->state == state->state) { 313 other->state == state->state) {
314 merge_cb(tree, state, other);
306 state->start = other->start; 315 state->start = other->start;
307 other->tree = NULL; 316 other->tree = NULL;
308 rb_erase(&other->rb_node, &tree->state); 317 rb_erase(&other->rb_node, &tree->state);
@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
314 other = rb_entry(other_node, struct extent_state, rb_node); 323 other = rb_entry(other_node, struct extent_state, rb_node);
315 if (other->start == state->end + 1 && 324 if (other->start == state->end + 1 &&
316 other->state == state->state) { 325 other->state == state->state) {
326 merge_cb(tree, state, other);
317 other->start = state->start; 327 other->start = state->start;
318 state->tree = NULL; 328 state->tree = NULL;
319 rb_erase(&state->rb_node, &tree->state); 329 rb_erase(&state->rb_node, &tree->state);
320 free_extent_state(state); 330 free_extent_state(state);
331 state = NULL;
321 } 332 }
322 } 333 }
334
323 return 0; 335 return 0;
324} 336}
325 337
326static void set_state_cb(struct extent_io_tree *tree, 338static int set_state_cb(struct extent_io_tree *tree,
327 struct extent_state *state, 339 struct extent_state *state,
328 unsigned long bits) 340 unsigned long bits)
329{ 341{
330 if (tree->ops && tree->ops->set_bit_hook) { 342 if (tree->ops && tree->ops->set_bit_hook) {
331 tree->ops->set_bit_hook(tree->mapping->host, state->start, 343 return tree->ops->set_bit_hook(tree->mapping->host,
332 state->end, state->state, bits); 344 state->start, state->end,
345 state->state, bits);
333 } 346 }
347
348 return 0;
334} 349}
335 350
336static void clear_state_cb(struct extent_io_tree *tree, 351static void clear_state_cb(struct extent_io_tree *tree,
337 struct extent_state *state, 352 struct extent_state *state,
338 unsigned long bits) 353 unsigned long bits)
339{ 354{
340 if (tree->ops && tree->ops->clear_bit_hook) { 355 if (tree->ops && tree->ops->clear_bit_hook)
341 tree->ops->clear_bit_hook(tree->mapping->host, state->start, 356 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
342 state->end, state->state, bits);
343 }
344} 357}
345 358
346/* 359/*
@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
358 int bits) 371 int bits)
359{ 372{
360 struct rb_node *node; 373 struct rb_node *node;
374 int ret;
361 375
362 if (end < start) { 376 if (end < start) {
363 printk(KERN_ERR "btrfs end < start %llu %llu\n", 377 printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -365,11 +379,14 @@ static int insert_state(struct extent_io_tree *tree,
365 (unsigned long long)start); 379 (unsigned long long)start);
366 WARN_ON(1); 380 WARN_ON(1);
367 } 381 }
368 if (bits & EXTENT_DIRTY)
369 tree->dirty_bytes += end - start + 1;
370 state->start = start; 382 state->start = start;
371 state->end = end; 383 state->end = end;
372 set_state_cb(tree, state, bits); 384 ret = set_state_cb(tree, state, bits);
385 if (ret)
386 return ret;
387
388 if (bits & EXTENT_DIRTY)
389 tree->dirty_bytes += end - start + 1;
373 state->state |= bits; 390 state->state |= bits;
374 node = tree_insert(&tree->state, end, &state->rb_node); 391 node = tree_insert(&tree->state, end, &state->rb_node);
375 if (node) { 392 if (node) {
@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
387 return 0; 404 return 0;
388} 405}
389 406
407static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
408 u64 split)
409{
410 if (tree->ops && tree->ops->split_extent_hook)
411 return tree->ops->split_extent_hook(tree->mapping->host,
412 orig, split);
413 return 0;
414}
415
390/* 416/*
391 * split a given extent state struct in two, inserting the preallocated 417 * split a given extent state struct in two, inserting the preallocated
392 * struct 'prealloc' as the newly created second half. 'split' indicates an 418 * struct 'prealloc' as the newly created second half. 'split' indicates an
@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
405 struct extent_state *prealloc, u64 split) 431 struct extent_state *prealloc, u64 split)
406{ 432{
407 struct rb_node *node; 433 struct rb_node *node;
434
435 split_cb(tree, orig, split);
436
408 prealloc->start = orig->start; 437 prealloc->start = orig->start;
409 prealloc->end = split - 1; 438 prealloc->end = split - 1;
410 prealloc->state = orig->state; 439 prealloc->state = orig->state;
@@ -431,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
431 struct extent_state *state, int bits, int wake, 460 struct extent_state *state, int bits, int wake,
432 int delete) 461 int delete)
433{ 462{
434 int ret = state->state & bits; 463 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
464 int ret = state->state & bits_to_clear;
435 465
436 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 466 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
437 u64 range = state->end - state->start + 1; 467 u64 range = state->end - state->start + 1;
@@ -439,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
439 tree->dirty_bytes -= range; 469 tree->dirty_bytes -= range;
440 } 470 }
441 clear_state_cb(tree, state, bits); 471 clear_state_cb(tree, state, bits);
442 state->state &= ~bits; 472 state->state &= ~bits_to_clear;
443 if (wake) 473 if (wake)
444 wake_up(&state->wq); 474 wake_up(&state->wq);
445 if (delete || state->state == 0) { 475 if (delete || state->state == 0) {
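EXTENT_DO_ACCOUNTING appears to act here as a command to the clear hook rather than as ordinary state: the unmasked bits still reach clear_state_cb(), so the filesystem's clear_bit_hook can see the flag and release its delalloc accounting, while bits_to_clear keeps the flag out of both the returned bit set and the state->state update.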
@@ -542,8 +572,8 @@ hit_next:
542 if (err) 572 if (err)
543 goto out; 573 goto out;
544 if (state->end <= end) { 574 if (state->end <= end) {
545 set |= clear_state_bit(tree, state, bits, 575 set |= clear_state_bit(tree, state, bits, wake,
546 wake, delete); 576 delete);
547 if (last_end == (u64)-1) 577 if (last_end == (u64)-1)
548 goto out; 578 goto out;
549 start = last_end + 1; 579 start = last_end + 1;
@@ -561,12 +591,11 @@ hit_next:
561 prealloc = alloc_extent_state(GFP_ATOMIC); 591 prealloc = alloc_extent_state(GFP_ATOMIC);
562 err = split_state(tree, state, prealloc, end + 1); 592 err = split_state(tree, state, prealloc, end + 1);
563 BUG_ON(err == -EEXIST); 593 BUG_ON(err == -EEXIST);
564
565 if (wake) 594 if (wake)
566 wake_up(&state->wq); 595 wake_up(&state->wq);
567 596
568 set |= clear_state_bit(tree, prealloc, bits, 597 set |= clear_state_bit(tree, prealloc, bits, wake, delete);
569 wake, delete); 598
570 prealloc = NULL; 599 prealloc = NULL;
571 goto out; 600 goto out;
572 } 601 }
@@ -667,16 +696,23 @@ out:
667 return 0; 696 return 0;
668} 697}
669 698
670static void set_state_bits(struct extent_io_tree *tree, 699static int set_state_bits(struct extent_io_tree *tree,
671 struct extent_state *state, 700 struct extent_state *state,
672 int bits) 701 int bits)
673{ 702{
703 int ret;
704
705 ret = set_state_cb(tree, state, bits);
706 if (ret)
707 return ret;
708
674 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 709 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
675 u64 range = state->end - state->start + 1; 710 u64 range = state->end - state->start + 1;
676 tree->dirty_bytes += range; 711 tree->dirty_bytes += range;
677 } 712 }
678 set_state_cb(tree, state, bits);
679 state->state |= bits; 713 state->state |= bits;
714
715 return 0;
680} 716}
681 717
682static void cache_state(struct extent_state *state, 718static void cache_state(struct extent_state *state,
@@ -758,7 +794,10 @@ hit_next:
758 goto out; 794 goto out;
759 } 795 }
760 796
761 set_state_bits(tree, state, bits); 797 err = set_state_bits(tree, state, bits);
798 if (err)
799 goto out;
800
762 cache_state(state, cached_state); 801 cache_state(state, cached_state);
763 merge_state(tree, state); 802 merge_state(tree, state);
764 if (last_end == (u64)-1) 803 if (last_end == (u64)-1)
@@ -805,7 +844,9 @@ hit_next:
805 if (err) 844 if (err)
806 goto out; 845 goto out;
807 if (state->end <= end) { 846 if (state->end <= end) {
808 set_state_bits(tree, state, bits); 847 err = set_state_bits(tree, state, bits);
848 if (err)
849 goto out;
809 cache_state(state, cached_state); 850 cache_state(state, cached_state);
810 merge_state(tree, state); 851 merge_state(tree, state);
811 if (last_end == (u64)-1) 852 if (last_end == (u64)-1)
@@ -829,11 +870,13 @@ hit_next:
829 this_end = last_start - 1; 870 this_end = last_start - 1;
830 err = insert_state(tree, prealloc, start, this_end, 871 err = insert_state(tree, prealloc, start, this_end,
831 bits); 872 bits);
832 cache_state(prealloc, cached_state);
833 prealloc = NULL;
834 BUG_ON(err == -EEXIST); 873 BUG_ON(err == -EEXIST);
835 if (err) 874 if (err) {
875 prealloc = NULL;
836 goto out; 876 goto out;
877 }
878 cache_state(prealloc, cached_state);
879 prealloc = NULL;
837 start = this_end + 1; 880 start = this_end + 1;
838 goto search_again; 881 goto search_again;
839 } 882 }
@@ -852,7 +895,11 @@ hit_next:
852 err = split_state(tree, state, prealloc, end + 1); 895 err = split_state(tree, state, prealloc, end + 1);
853 BUG_ON(err == -EEXIST); 896 BUG_ON(err == -EEXIST);
854 897
855 set_state_bits(tree, prealloc, bits); 898 err = set_state_bits(tree, prealloc, bits);
899 if (err) {
900 prealloc = NULL;
901 goto out;
902 }
856 cache_state(prealloc, cached_state); 903 cache_state(prealloc, cached_state);
857 merge_state(tree, prealloc); 904 merge_state(tree, prealloc);
858 prealloc = NULL; 905 prealloc = NULL;
@@ -910,7 +957,8 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
910 gfp_t mask) 957 gfp_t mask)
911{ 958{
912 return clear_extent_bit(tree, start, end, 959 return clear_extent_bit(tree, start, end,
913 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, 960 EXTENT_DIRTY | EXTENT_DELALLOC |
961 EXTENT_DO_ACCOUNTING, 0, 0,
914 NULL, mask); 962 NULL, mask);
915} 963}
916 964
@@ -1355,12 +1403,7 @@ out_failed:
1355int extent_clear_unlock_delalloc(struct inode *inode, 1403int extent_clear_unlock_delalloc(struct inode *inode,
1356 struct extent_io_tree *tree, 1404 struct extent_io_tree *tree,
1357 u64 start, u64 end, struct page *locked_page, 1405 u64 start, u64 end, struct page *locked_page,
1358 int unlock_pages, 1406 unsigned long op)
1359 int clear_unlock,
1360 int clear_delalloc, int clear_dirty,
1361 int set_writeback,
1362 int end_writeback,
1363 int set_private2)
1364{ 1407{
1365 int ret; 1408 int ret;
1366 struct page *pages[16]; 1409 struct page *pages[16];
@@ -1370,17 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1370 int i; 1413 int i;
1371 int clear_bits = 0; 1414 int clear_bits = 0;
1372 1415
1373 if (clear_unlock) 1416 if (op & EXTENT_CLEAR_UNLOCK)
1374 clear_bits |= EXTENT_LOCKED; 1417 clear_bits |= EXTENT_LOCKED;
1375 if (clear_dirty) 1418 if (op & EXTENT_CLEAR_DIRTY)
1376 clear_bits |= EXTENT_DIRTY; 1419 clear_bits |= EXTENT_DIRTY;
1377 1420
1378 if (clear_delalloc) 1421 if (op & EXTENT_CLEAR_DELALLOC)
1379 clear_bits |= EXTENT_DELALLOC; 1422 clear_bits |= EXTENT_DELALLOC;
1380 1423
1424 if (op & EXTENT_CLEAR_ACCOUNTING)
1425 clear_bits |= EXTENT_DO_ACCOUNTING;
1426
1381 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1427 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1382 if (!(unlock_pages || clear_dirty || set_writeback || end_writeback || 1428 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1383 set_private2)) 1429 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1430 EXTENT_SET_PRIVATE2)))
1384 return 0; 1431 return 0;
1385 1432
1386 while (nr_pages > 0) { 1433 while (nr_pages > 0) {
@@ -1389,20 +1436,20 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1389 nr_pages, ARRAY_SIZE(pages)), pages); 1436 nr_pages, ARRAY_SIZE(pages)), pages);
1390 for (i = 0; i < ret; i++) { 1437 for (i = 0; i < ret; i++) {
1391 1438
1392 if (set_private2) 1439 if (op & EXTENT_SET_PRIVATE2)
1393 SetPagePrivate2(pages[i]); 1440 SetPagePrivate2(pages[i]);
1394 1441
1395 if (pages[i] == locked_page) { 1442 if (pages[i] == locked_page) {
1396 page_cache_release(pages[i]); 1443 page_cache_release(pages[i]);
1397 continue; 1444 continue;
1398 } 1445 }
1399 if (clear_dirty) 1446 if (op & EXTENT_CLEAR_DIRTY)
1400 clear_page_dirty_for_io(pages[i]); 1447 clear_page_dirty_for_io(pages[i]);
1401 if (set_writeback) 1448 if (op & EXTENT_SET_WRITEBACK)
1402 set_page_writeback(pages[i]); 1449 set_page_writeback(pages[i]);
1403 if (end_writeback) 1450 if (op & EXTENT_END_WRITEBACK)
1404 end_page_writeback(pages[i]); 1451 end_page_writeback(pages[i]);
1405 if (unlock_pages) 1452 if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1406 unlock_page(pages[i]); 1453 unlock_page(pages[i]);
1407 page_cache_release(pages[i]); 1454 page_cache_release(pages[i]);
1408 } 1455 }
@@ -2668,7 +2715,8 @@ int extent_invalidatepage(struct extent_io_tree *tree,
2668 lock_extent(tree, start, end, GFP_NOFS); 2715 lock_extent(tree, start, end, GFP_NOFS);
2669 wait_on_page_writeback(page); 2716 wait_on_page_writeback(page);
2670 clear_extent_bit(tree, start, end, 2717 clear_extent_bit(tree, start, end,
2671 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 2718 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2719 EXTENT_DO_ACCOUNTING,
2672 1, 1, NULL, GFP_NOFS); 2720 1, 1, NULL, GFP_NOFS);
2673 return 0; 2721 return 0;
2674} 2722}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 14ed16fd862d..36de250a7b2b 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -15,6 +15,7 @@
15#define EXTENT_BUFFER_FILLED (1 << 8) 15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_BOUNDARY (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11)
18#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
19 20
20/* flags for bio submission */ 21/* flags for bio submission */
@@ -25,6 +26,16 @@
25#define EXTENT_BUFFER_BLOCKING 1 26#define EXTENT_BUFFER_BLOCKING 1
26#define EXTENT_BUFFER_DIRTY 2 27#define EXTENT_BUFFER_DIRTY 2
27 28
29/* these are flags for extent_clear_unlock_delalloc */
30#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
31#define EXTENT_CLEAR_UNLOCK 0x2
32#define EXTENT_CLEAR_DELALLOC 0x4
33#define EXTENT_CLEAR_DIRTY 0x8
34#define EXTENT_SET_WRITEBACK 0x10
35#define EXTENT_END_WRITEBACK 0x20
36#define EXTENT_SET_PRIVATE2 0x40
37#define EXTENT_CLEAR_ACCOUNTING 0x80
38
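With the old boolean arguments folded into a single op bitmask, a caller ORs together just the actions it needs. A sketch of a typical call (the exact flag mix varies per call site, so treat this combination as illustrative):

	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
				     start, end, locked_page,
				     EXTENT_CLEAR_UNLOCK_PAGE |
				     EXTENT_CLEAR_UNLOCK |
				     EXTENT_CLEAR_DELALLOC |
				     EXTENT_CLEAR_DIRTY |
				     EXTENT_SET_WRITEBACK);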
28/* 39/*
29 * page->private values. Every page that is controlled by the extent 40 * page->private values. Every page that is controlled by the extent
30 * map has page->private set to one. 41 * map has page->private set to one.
@@ -60,8 +71,13 @@ struct extent_io_ops {
60 struct extent_state *state, int uptodate); 71 struct extent_state *state, int uptodate);
61 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 72 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
62 unsigned long old, unsigned long bits); 73 unsigned long old, unsigned long bits);
63 int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, 74 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
64 unsigned long old, unsigned long bits); 75 unsigned long bits);
76 int (*merge_extent_hook)(struct inode *inode,
77 struct extent_state *new,
78 struct extent_state *other);
79 int (*split_extent_hook)(struct inode *inode,
80 struct extent_state *orig, u64 split);
65 int (*write_cache_pages_lock_hook)(struct page *page); 81 int (*write_cache_pages_lock_hook)(struct page *page);
66}; 82};
67 83
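The two new hooks let the filesystem keep per-inode accounting in step as extent states split and merge. A sketch of what implementations might look like for an outstanding-extents counter, along the lines of the accounting fields used earlier in this patch (hypothetical function names; the real hooks are wired up in fs/btrfs/inode.c):

	static int sketch_split_extent_hook(struct inode *inode,
					    struct extent_state *orig, u64 split)
	{
		/* one state is about to become two: one more outstanding extent */
		if (!(orig->state & EXTENT_DELALLOC))
			return 0;
		spin_lock(&BTRFS_I(inode)->accounting_lock);
		BTRFS_I(inode)->outstanding_extents++;
		spin_unlock(&BTRFS_I(inode)->accounting_lock);
		return 0;
	}

	static int sketch_merge_extent_hook(struct inode *inode,
					    struct extent_state *new,
					    struct extent_state *other)
	{
		/* two adjacent states collapse into one: one fewer outstanding */
		if (!(other->state & EXTENT_DELALLOC))
			return 0;
		spin_lock(&BTRFS_I(inode)->accounting_lock);
		BTRFS_I(inode)->outstanding_extents--;
		spin_unlock(&BTRFS_I(inode)->accounting_lock);
		return 0;
	}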
@@ -79,10 +95,14 @@ struct extent_state {
79 u64 start; 95 u64 start;
80 u64 end; /* inclusive */ 96 u64 end; /* inclusive */
81 struct rb_node rb_node; 97 struct rb_node rb_node;
98
99 /* ADD NEW ELEMENTS AFTER THIS */
82 struct extent_io_tree *tree; 100 struct extent_io_tree *tree;
83 wait_queue_head_t wq; 101 wait_queue_head_t wq;
84 atomic_t refs; 102 atomic_t refs;
85 unsigned long state; 103 unsigned long state;
104 u64 split_start;
105 u64 split_end;
86 106
87 /* for use by the FS */ 107 /* for use by the FS */
88 u64 private; 108 u64 private;
@@ -279,10 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree,
279int extent_clear_unlock_delalloc(struct inode *inode, 299int extent_clear_unlock_delalloc(struct inode *inode,
280 struct extent_io_tree *tree, 300 struct extent_io_tree *tree,
281 u64 start, u64 end, struct page *locked_page, 301 u64 start, u64 end, struct page *locked_page,
282 int unlock_page, 302 unsigned long op);
283 int clear_unlock,
284 int clear_delalloc, int clear_dirty,
285 int set_writeback,
286 int end_writeback,
287 int set_private2);
288#endif 303#endif
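
The header now folds extent_clear_unlock_delalloc()'s seven int arguments into one unsigned long op bitmask. A minimal userspace sketch of composing it (flag values copied from the hunk above; the helper and everything else are invented for illustration):

    /* Userspace sketch, not the kernel code. */
    #include <stdio.h>

    #define EXTENT_CLEAR_UNLOCK_PAGE 0x1
    #define EXTENT_CLEAR_UNLOCK      0x2
    #define EXTENT_CLEAR_DELALLOC    0x4
    #define EXTENT_CLEAR_DIRTY       0x8
    #define EXTENT_SET_WRITEBACK     0x10
    #define EXTENT_END_WRITEBACK     0x20
    #define EXTENT_SET_PRIVATE2      0x40
    #define EXTENT_CLEAR_ACCOUNTING  0x80

    static void describe(unsigned long op)
    {
        if (op & EXTENT_CLEAR_UNLOCK_PAGE)
            printf("unlock the pages\n");
        if (op & EXTENT_CLEAR_DELALLOC)
            printf("clear the delalloc bit\n");
        if (op & EXTENT_SET_PRIVATE2)
            printf("set PagePrivate2\n");
    }

    int main(void)
    {
        /* the combination cow_file_range() builds when unlock != 0 */
        unsigned long op = EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK |
                           EXTENT_CLEAR_DELALLOC | EXTENT_SET_PRIVATE2;
        describe(op);
        return 0;
    }

One bitmask also lets new behaviours (like EXTENT_CLEAR_ACCOUNTING) be added without touching every caller's argument list.
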
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2c726b7b9faa..ccbdcb54ec5d 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -208,7 +208,7 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
208 write_lock(&tree->lock); 208 write_lock(&tree->lock);
209 em = lookup_extent_mapping(tree, start, len); 209 em = lookup_extent_mapping(tree, start, len);
210 210
211 WARN_ON(em->start != start || !em); 211 WARN_ON(!em || em->start != start);
212 212
213 if (!em) 213 if (!em)
214 goto out; 214 goto out;
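
The WARN_ON reordering above is a classic short-circuit fix: with ||, the left operand is evaluated first, so the NULL test has to precede the dereference. A compilable sketch:

    #include <assert.h>
    #include <stddef.h>

    struct extent_map { unsigned long long start; };

    static int bad_check(struct extent_map *em, unsigned long long start)
    {
        /* dereferences em before testing it: faults when em == NULL */
        return em->start != start || !em;
    }

    static int good_check(struct extent_map *em, unsigned long long start)
    {
        /* !em runs first; em->start is never reached when em == NULL */
        return !em || em->start != start;
    }

    int main(void)
    {
        struct extent_map em = { .start = 4096 };
        assert(good_check(NULL, 4096));   /* safe: short-circuits on !em */
        assert(!good_check(&em, 4096));
        (void)bad_check;                  /* bad_check(NULL, ...) would crash */
        return 0;
    }
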
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 571ad3c13b47..06550affbd27 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -123,7 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
124 124
125 end_of_last_block = start_pos + num_bytes - 1; 125 end_of_last_block = start_pos + num_bytes - 1;
126 btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); 126 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
127 if (err)
128 return err;
129
127 for (i = 0; i < num_pages; i++) { 130 for (i = 0; i < num_pages; i++) {
128 struct page *p = pages[i]; 131 struct page *p = pages[i];
129 SetPageUptodate(p); 132 SetPageUptodate(p);
@@ -875,7 +878,8 @@ again:
875 btrfs_put_ordered_extent(ordered); 878 btrfs_put_ordered_extent(ordered);
876 879
877 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, 880 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
878 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, 881 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
882 EXTENT_DO_ACCOUNTING,
879 GFP_NOFS); 883 GFP_NOFS);
880 unlock_extent(&BTRFS_I(inode)->io_tree, 884 unlock_extent(&BTRFS_I(inode)->io_tree,
881 start_pos, last_pos - 1, GFP_NOFS); 885 start_pos, last_pos - 1, GFP_NOFS);
@@ -917,21 +921,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
917 start_pos = pos; 921 start_pos = pos;
918 922
919 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 923 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
924
925 /* do the reserve before the mutex lock in case we have to do some
926 * flushing. We wouldn't deadlock, but this is more polite.
927 */
928 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
929 if (err)
930 goto out_nolock;
931
932 mutex_lock(&inode->i_mutex);
933
920 current->backing_dev_info = inode->i_mapping->backing_dev_info; 934 current->backing_dev_info = inode->i_mapping->backing_dev_info;
921 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 935 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
922 if (err) 936 if (err)
923 goto out_nolock; 937 goto out;
938
924 if (count == 0) 939 if (count == 0)
925 goto out_nolock; 940 goto out;
926 941
927 err = file_remove_suid(file); 942 err = file_remove_suid(file);
928 if (err) 943 if (err)
929 goto out_nolock; 944 goto out;
945
930 file_update_time(file); 946 file_update_time(file);
931 947
932 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 948 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
933 949
934 mutex_lock(&inode->i_mutex); 950 /* generic_write_checks can change our pos */
951 start_pos = pos;
952
935 BTRFS_I(inode)->sequence++; 953 BTRFS_I(inode)->sequence++;
936 first_index = pos >> PAGE_CACHE_SHIFT; 954 first_index = pos >> PAGE_CACHE_SHIFT;
937 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 955 last_index = (pos + count) >> PAGE_CACHE_SHIFT;
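
The reordering above (reserve before i_mutex) is worth calling out: per the new comment, the reservation may have to flush, and doing that while holding the inode mutex would stall other writers. A userspace sketch of the same acquire-expensive-resource-then-lock ordering, with all helper names invented:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* stand-in for btrfs_reserve_metadata_for_delalloc(): may block
     * and flush dirty pages, which is why it runs before the lock */
    static int reserve_metadata(int items)
    {
        printf("reserved %d item(s)\n", items);
        return 0;
    }

    static void unreserve_metadata(int items)
    {
        printf("unreserved %d item(s)\n", items);
    }

    static int file_write(void)
    {
        int err = reserve_metadata(1);  /* before i_mutex: flushing under */
        if (err)                        /* the lock would be impolite */
            return err;

        pthread_mutex_lock(&i_mutex);
        /* ... write checks, page preparation, dirtying ... */
        pthread_mutex_unlock(&i_mutex);

        unreserve_metadata(1);          /* mirrors the out/out_nolock unwind */
        return 0;
    }

    int main(void)
    {
        return file_write();
    }
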
@@ -1005,9 +1023,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1005 } 1023 }
1006 1024
1007 if (will_write) { 1025 if (will_write) {
1008 btrfs_fdatawrite_range(inode->i_mapping, pos, 1026 filemap_fdatawrite_range(inode->i_mapping, pos,
1009 pos + write_bytes - 1, 1027 pos + write_bytes - 1);
1010 WB_SYNC_ALL);
1011 } else { 1028 } else {
1012 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1029 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1013 num_pages); 1030 num_pages);
@@ -1028,6 +1045,7 @@ out:
1028 mutex_unlock(&inode->i_mutex); 1045 mutex_unlock(&inode->i_mutex);
1029 if (ret) 1046 if (ret)
1030 err = ret; 1047 err = ret;
1048 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1031 1049
1032out_nolock: 1050out_nolock:
1033 kfree(pages); 1051 kfree(pages);
@@ -1068,8 +1086,10 @@ out_nolock:
1068 btrfs_end_transaction(trans, root); 1086 btrfs_end_transaction(trans, root);
1069 else 1087 else
1070 btrfs_commit_transaction(trans, root); 1088 btrfs_commit_transaction(trans, root);
1071 } else { 1089 } else if (ret != BTRFS_NO_LOG_SYNC) {
1072 btrfs_commit_transaction(trans, root); 1090 btrfs_commit_transaction(trans, root);
1091 } else {
1092 btrfs_end_transaction(trans, root);
1073 } 1093 }
1074 } 1094 }
1075 if (file->f_flags & O_DIRECT) { 1095 if (file->f_flags & O_DIRECT) {
@@ -1119,6 +1139,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1119 int ret = 0; 1139 int ret = 0;
1120 struct btrfs_trans_handle *trans; 1140 struct btrfs_trans_handle *trans;
1121 1141
1142
1143 /* we wait first, since the writeback may change the inode */
1144 root->log_batch++;
1145 /* the VFS called filemap_fdatawrite for us */
1146 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1147 root->log_batch++;
1148
1122 /* 1149 /*
1123 * check the transaction that last modified this inode 1150 * check the transaction that last modified this inode
1124 * and see if it's already been committed 1151
@@ -1126,6 +1153,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1126 if (!BTRFS_I(inode)->last_trans) 1153 if (!BTRFS_I(inode)->last_trans)
1127 goto out; 1154 goto out;
1128 1155
1156 /*
1157 * if the last transaction that changed this file was before
1158 * the current transaction, we can bail out now without any
1159 * syncing
1160 */
1129 mutex_lock(&root->fs_info->trans_mutex); 1161 mutex_lock(&root->fs_info->trans_mutex);
1130 if (BTRFS_I(inode)->last_trans <= 1162 if (BTRFS_I(inode)->last_trans <=
1131 root->fs_info->last_trans_committed) { 1163 root->fs_info->last_trans_committed) {
@@ -1135,13 +1167,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1135 } 1167 }
1136 mutex_unlock(&root->fs_info->trans_mutex); 1168 mutex_unlock(&root->fs_info->trans_mutex);
1137 1169
1138 root->log_batch++;
1139 filemap_fdatawrite(inode->i_mapping);
1140 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1141 root->log_batch++;
1142
1143 if (datasync && !(inode->i_state & I_DIRTY_PAGES))
1144 goto out;
1145 /* 1170 /*
1146 * ok we haven't committed the transaction yet, let's do a commit 1171
1147 */ 1172 */
@@ -1170,21 +1195,25 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1170 */ 1195 */
1171 mutex_unlock(&dentry->d_inode->i_mutex); 1196 mutex_unlock(&dentry->d_inode->i_mutex);
1172 1197
1173 if (ret > 0) { 1198 if (ret != BTRFS_NO_LOG_SYNC) {
1174 ret = btrfs_commit_transaction(trans, root); 1199 if (ret > 0) {
1175 } else {
1176 ret = btrfs_sync_log(trans, root);
1177 if (ret == 0)
1178 ret = btrfs_end_transaction(trans, root);
1179 else
1180 ret = btrfs_commit_transaction(trans, root); 1200 ret = btrfs_commit_transaction(trans, root);
1201 } else {
1202 ret = btrfs_sync_log(trans, root);
1203 if (ret == 0)
1204 ret = btrfs_end_transaction(trans, root);
1205 else
1206 ret = btrfs_commit_transaction(trans, root);
1207 }
1208 } else {
1209 ret = btrfs_end_transaction(trans, root);
1181 } 1210 }
1182 mutex_lock(&dentry->d_inode->i_mutex); 1211 mutex_lock(&dentry->d_inode->i_mutex);
1183out: 1212out:
1184 return ret > 0 ? EIO : ret; 1213 return ret > 0 ? EIO : ret;
1185} 1214}
1186 1215
1187static struct vm_operations_struct btrfs_file_vm_ops = { 1216static const struct vm_operations_struct btrfs_file_vm_ops = {
1188 .fault = filemap_fault, 1217 .fault = filemap_fault,
1189 .page_mkwrite = btrfs_page_mkwrite, 1218 .page_mkwrite = btrfs_page_mkwrite,
1190}; 1219};
@@ -1196,7 +1225,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1196 return 0; 1225 return 0;
1197} 1226}
1198 1227
1199struct file_operations btrfs_file_operations = { 1228const struct file_operations btrfs_file_operations = {
1200 .llseek = generic_file_llseek, 1229 .llseek = generic_file_llseek,
1201 .read = do_sync_read, 1230 .read = do_sync_read,
1202 .aio_read = generic_file_aio_read, 1231 .aio_read = generic_file_aio_read,
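
The fsync completion above goes from a two-way to a three-way decision: BTRFS_NO_LOG_SYNC now signals that nothing was logged at all, so the transaction can simply be ended. A compilable sketch of the decision tree; the helpers are stand-ins and the sentinel value is an assumption about tree-log.h:

    #include <stdio.h>

    #define BTRFS_NO_LOG_SYNC 256   /* assumed sentinel from tree-log.h */

    static int commit_transaction(void) { puts("full commit"); return 0; }
    static int end_transaction(void)    { puts("end only");    return 0; }
    static int sync_log(void)           { puts("log sync");    return 0; }

    static int finish_fsync(int log_ret)
    {
        if (log_ret != BTRFS_NO_LOG_SYNC) {
            if (log_ret > 0)          /* couldn't log: full commit */
                return commit_transaction();
            if (sync_log() == 0)      /* log synced: cheap path */
                return end_transaction();
            return commit_transaction();
        }
        /* nothing was logged: no sync needed, just end the handle */
        return end_transaction();
    }

    int main(void)
    {
        finish_fsync(0);
        finish_fsync(BTRFS_NO_LOG_SYNC);
        return 0;
    }
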
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5c2caad76212..cb2849f03251 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1296,7 +1296,7 @@ again:
1296 window_start = entry->offset; 1296 window_start = entry->offset;
1297 window_free = entry->bytes; 1297 window_free = entry->bytes;
1298 last = entry; 1298 last = entry;
1299 max_extent = 0; 1299 max_extent = entry->bytes;
1300 } else { 1300 } else {
1301 last = next; 1301 last = next;
1302 window_free += next->bytes; 1302 window_free += next->bytes;
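
The one-liner above fixes cluster window setup: when the scan restarts at a new entry, the largest extent seen so far is that entry's size, not zero. A tiny sketch of the corrected accounting (types and names invented):

    #include <assert.h>

    struct entry { unsigned long long bytes; };

    /* scan a run of free-space entries, restarting the window at index 0 */
    static unsigned long long window_max(const struct entry *e, int n)
    {
        unsigned long long max_extent = 0;
        for (int i = 0; i < n; i++) {
            if (i == 0)
                max_extent = e[i].bytes;     /* the fix: was "= 0" */
            else if (e[i].bytes > max_extent)
                max_extent = e[i].bytes;
        }
        return max_extent;
    }

    int main(void)
    {
        struct entry one[] = { { 8192 } };
        /* with the old "= 0", a single-entry window reported a max of 0 */
        assert(window_max(one, 1) == 8192);
        return 0;
    }
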
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e9b76bcd1c12..b3ad168a0bfc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -62,7 +62,7 @@ static const struct inode_operations btrfs_special_inode_operations;
62static const struct inode_operations btrfs_file_inode_operations; 62static const struct inode_operations btrfs_file_inode_operations;
63static const struct address_space_operations btrfs_aops; 63static const struct address_space_operations btrfs_aops;
64static const struct address_space_operations btrfs_symlink_aops; 64static const struct address_space_operations btrfs_symlink_aops;
65static struct file_operations btrfs_dir_file_operations; 65static const struct file_operations btrfs_dir_file_operations;
66static struct extent_io_ops btrfs_extent_io_ops; 66static struct extent_io_ops btrfs_extent_io_ops;
67 67
68static struct kmem_cache *btrfs_inode_cachep; 68static struct kmem_cache *btrfs_inode_cachep;
@@ -424,9 +424,12 @@ again:
424 * and free up our temp pages. 424 * and free up our temp pages.
425 */ 425 */
426 extent_clear_unlock_delalloc(inode, 426 extent_clear_unlock_delalloc(inode,
427 &BTRFS_I(inode)->io_tree, 427 &BTRFS_I(inode)->io_tree,
428 start, end, NULL, 1, 0, 428 start, end, NULL,
429 0, 1, 1, 1, 0); 429 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
430 EXTENT_CLEAR_DELALLOC |
431 EXTENT_CLEAR_ACCOUNTING |
432 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
430 ret = 0; 433 ret = 0;
431 goto free_pages_out; 434 goto free_pages_out;
432 } 435 }
@@ -535,7 +538,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
535 struct btrfs_root *root = BTRFS_I(inode)->root; 538 struct btrfs_root *root = BTRFS_I(inode)->root;
536 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 539 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
537 struct extent_io_tree *io_tree; 540 struct extent_io_tree *io_tree;
538 int ret; 541 int ret = 0;
539 542
540 if (list_empty(&async_cow->extents)) 543 if (list_empty(&async_cow->extents))
541 return 0; 544 return 0;
@@ -549,6 +552,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
549 552
550 io_tree = &BTRFS_I(inode)->io_tree; 553 io_tree = &BTRFS_I(inode)->io_tree;
551 554
555retry:
552 /* did the compression code fall back to uncompressed IO? */ 556 /* did the compression code fall back to uncompressed IO? */
553 if (!async_extent->pages) { 557 if (!async_extent->pages) {
554 int page_started = 0; 558 int page_started = 0;
@@ -559,11 +563,11 @@ static noinline int submit_compressed_extents(struct inode *inode,
559 async_extent->ram_size - 1, GFP_NOFS); 563 async_extent->ram_size - 1, GFP_NOFS);
560 564
561 /* allocate blocks */ 565 /* allocate blocks */
562 cow_file_range(inode, async_cow->locked_page, 566 ret = cow_file_range(inode, async_cow->locked_page,
563 async_extent->start, 567 async_extent->start,
564 async_extent->start + 568 async_extent->start +
565 async_extent->ram_size - 1, 569 async_extent->ram_size - 1,
566 &page_started, &nr_written, 0); 570 &page_started, &nr_written, 0);
567 571
568 /* 572 /*
569 * if page_started, cow_file_range inserted an 573 * if page_started, cow_file_range inserted an
@@ -571,7 +575,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
571 * and IO for us. Otherwise, we need to submit 575 * and IO for us. Otherwise, we need to submit
572 * all those pages down to the drive. 576 * all those pages down to the drive.
573 */ 577 */
574 if (!page_started) 578 if (!page_started && !ret)
575 extent_write_locked_range(io_tree, 579 extent_write_locked_range(io_tree,
576 inode, async_extent->start, 580 inode, async_extent->start,
577 async_extent->start + 581 async_extent->start +
@@ -599,7 +603,21 @@ static noinline int submit_compressed_extents(struct inode *inode,
599 async_extent->compressed_size, 603 async_extent->compressed_size,
600 0, alloc_hint, 604 0, alloc_hint,
601 (u64)-1, &ins, 1); 605 (u64)-1, &ins, 1);
602 BUG_ON(ret); 606 if (ret) {
607 int i;
608 for (i = 0; i < async_extent->nr_pages; i++) {
609 WARN_ON(async_extent->pages[i]->mapping);
610 page_cache_release(async_extent->pages[i]);
611 }
612 kfree(async_extent->pages);
613 async_extent->nr_pages = 0;
614 async_extent->pages = NULL;
615 unlock_extent(io_tree, async_extent->start,
616 async_extent->start +
617 async_extent->ram_size - 1, GFP_NOFS);
618 goto retry;
619 }
620
603 em = alloc_extent_map(GFP_NOFS); 621 em = alloc_extent_map(GFP_NOFS);
604 em->start = async_extent->start; 622 em->start = async_extent->start;
605 em->len = async_extent->ram_size; 623 em->len = async_extent->ram_size;
@@ -637,11 +655,14 @@ static noinline int submit_compressed_extents(struct inode *inode,
637 * clear dirty, set writeback and unlock the pages. 655 * clear dirty, set writeback and unlock the pages.
638 */ 656 */
639 extent_clear_unlock_delalloc(inode, 657 extent_clear_unlock_delalloc(inode,
640 &BTRFS_I(inode)->io_tree, 658 &BTRFS_I(inode)->io_tree,
641 async_extent->start, 659 async_extent->start,
642 async_extent->start + 660 async_extent->start +
643 async_extent->ram_size - 1, 661 async_extent->ram_size - 1,
644 NULL, 1, 1, 0, 1, 1, 0, 0); 662 NULL, EXTENT_CLEAR_UNLOCK_PAGE |
663 EXTENT_CLEAR_UNLOCK |
664 EXTENT_CLEAR_DELALLOC |
665 EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
645 666
646 ret = btrfs_submit_compressed_write(inode, 667 ret = btrfs_submit_compressed_write(inode,
647 async_extent->start, 668 async_extent->start,
@@ -712,9 +733,15 @@ static noinline int cow_file_range(struct inode *inode,
712 start, end, 0, NULL); 733 start, end, 0, NULL);
713 if (ret == 0) { 734 if (ret == 0) {
714 extent_clear_unlock_delalloc(inode, 735 extent_clear_unlock_delalloc(inode,
715 &BTRFS_I(inode)->io_tree, 736 &BTRFS_I(inode)->io_tree,
716 start, end, NULL, 1, 1, 737 start, end, NULL,
717 1, 1, 1, 1, 0); 738 EXTENT_CLEAR_UNLOCK_PAGE |
739 EXTENT_CLEAR_UNLOCK |
740 EXTENT_CLEAR_DELALLOC |
741 EXTENT_CLEAR_ACCOUNTING |
742 EXTENT_CLEAR_DIRTY |
743 EXTENT_SET_WRITEBACK |
744 EXTENT_END_WRITEBACK);
718 *nr_written = *nr_written + 745 *nr_written = *nr_written +
719 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; 746 (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
720 *page_started = 1; 747 *page_started = 1;
@@ -731,13 +758,29 @@ static noinline int cow_file_range(struct inode *inode,
731 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, 758 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
732 start, num_bytes); 759 start, num_bytes);
733 if (em) { 760 if (em) {
734 alloc_hint = em->block_start; 761 /*
735 free_extent_map(em); 762 * if block start isn't an actual block number then find the
763 * first block in this inode and use that as a hint. If that
764 * block is also bogus then just don't worry about it.
765 */
766 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
767 free_extent_map(em);
768 em = search_extent_mapping(em_tree, 0, 0);
769 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
770 alloc_hint = em->block_start;
771 if (em)
772 free_extent_map(em);
773 } else {
774 alloc_hint = em->block_start;
775 free_extent_map(em);
776 }
736 } 777 }
737 read_unlock(&BTRFS_I(inode)->extent_tree.lock); 778 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
738 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 779 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
739 780
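
The block above stops feeding synthetic block_start values (holes, inline extents) into the allocator as hints. A sketch of the fallback logic, assuming the usual EXTENT_MAP_LAST_BYTE sentinel of (u64)-4; everything else is invented:

    #include <assert.h>
    typedef unsigned long long u64;

    #define EXTENT_MAP_LAST_BYTE ((u64)-4)  /* assumed, as in extent_map.h */

    struct em { u64 block_start; };

    static u64 pick_hint(const struct em *found, const struct em *first_in_inode)
    {
        if (found && found->block_start < EXTENT_MAP_LAST_BYTE)
            return found->block_start;          /* a real block number */
        if (first_in_inode &&
            first_in_inode->block_start < EXTENT_MAP_LAST_BYTE)
            return first_in_inode->block_start; /* inode's first real block */
        return 0;                               /* bogus everywhere: no hint */
    }

    int main(void)
    {
        struct em hole = { EXTENT_MAP_LAST_BYTE };
        struct em real = { 1 << 20 };
        assert(pick_hint(&hole, &real) == (1 << 20));
        assert(pick_hint(&real, &hole) == (1 << 20));
        return 0;
    }
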
740 while (disk_num_bytes > 0) { 781 while (disk_num_bytes > 0) {
782 unsigned long op;
783
741 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 784 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
742 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 785 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
743 root->sectorsize, 0, alloc_hint, 786 root->sectorsize, 0, alloc_hint,
@@ -789,10 +832,13 @@ static noinline int cow_file_range(struct inode *inode,
789 * Do set the Private2 bit so we know this page was properly 832 * Do set the Private2 bit so we know this page was properly
790 * setup for writepage 833 * setup for writepage
791 */ 834 */
835 op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
836 op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
837 EXTENT_SET_PRIVATE2;
838
792 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 839 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
793 start, start + ram_size - 1, 840 start, start + ram_size - 1,
794 locked_page, unlock, 1, 841 locked_page, op);
795 1, 0, 0, 0, 1);
796 disk_num_bytes -= cur_alloc_size; 842 disk_num_bytes -= cur_alloc_size;
797 num_bytes -= cur_alloc_size; 843 num_bytes -= cur_alloc_size;
798 alloc_hint = ins.objectid + ins.offset; 844 alloc_hint = ins.objectid + ins.offset;
@@ -864,8 +910,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
864 u64 cur_end; 910 u64 cur_end;
865 int limit = 10 * 1024 * 1024; 911
866 912
867 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | 913 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
868 EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS); 914 1, 0, NULL, GFP_NOFS);
869 while (start < end) { 915 while (start < end) {
870 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); 916 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
871 async_cow->inode = inode; 917 async_cow->inode = inode;
@@ -1006,6 +1052,7 @@ next_slot:
1006 1052
1007 if (found_key.offset > cur_offset) { 1053 if (found_key.offset > cur_offset) {
1008 extent_end = found_key.offset; 1054 extent_end = found_key.offset;
1055 extent_type = 0;
1009 goto out_check; 1056 goto out_check;
1010 } 1057 }
1011 1058
@@ -1112,8 +1159,10 @@ out_check:
1112 BUG_ON(ret); 1159 BUG_ON(ret);
1113 1160
1114 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1161 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1115 cur_offset, cur_offset + num_bytes - 1, 1162 cur_offset, cur_offset + num_bytes - 1,
1116 locked_page, 1, 1, 1, 0, 0, 0, 1); 1163 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
1164 EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
1165 EXTENT_SET_PRIVATE2);
1117 cur_offset = extent_end; 1166 cur_offset = extent_end;
1118 if (cur_offset > end) 1167 if (cur_offset > end)
1119 break; 1168 break;
@@ -1159,6 +1208,89 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1159 return ret; 1208 return ret;
1160} 1209}
1161 1210
1211static int btrfs_split_extent_hook(struct inode *inode,
1212 struct extent_state *orig, u64 split)
1213{
1214 struct btrfs_root *root = BTRFS_I(inode)->root;
1215 u64 size;
1216
1217 if (!(orig->state & EXTENT_DELALLOC))
1218 return 0;
1219
1220 size = orig->end - orig->start + 1;
1221 if (size > root->fs_info->max_extent) {
1222 u64 num_extents;
1223 u64 new_size;
1224
1225 new_size = orig->end - split + 1;
1226 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1227 root->fs_info->max_extent);
1228
1229 /*
1230 * if we break a large extent up then leave outstanding_extents
1231 * be, since we've already accounted for the large extent.
1232 */
1233 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1234 root->fs_info->max_extent) < num_extents)
1235 return 0;
1236 }
1237
1238 spin_lock(&BTRFS_I(inode)->accounting_lock);
1239 BTRFS_I(inode)->outstanding_extents++;
1240 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1241
1242 return 0;
1243}
1244
1245/*
1246 * extent_io.c merge_extent_hook, used to track merged delayed allocation
1247 * extents so we can keep track of new extents that are just merged onto old
1248 * extents, such as when we are doing sequential writes, so we can properly
1249 * account for the metadata space we'll need.
1250 */
1251static int btrfs_merge_extent_hook(struct inode *inode,
1252 struct extent_state *new,
1253 struct extent_state *other)
1254{
1255 struct btrfs_root *root = BTRFS_I(inode)->root;
1256 u64 new_size, old_size;
1257 u64 num_extents;
1258
1259 /* not delalloc, ignore it */
1260 if (!(other->state & EXTENT_DELALLOC))
1261 return 0;
1262
1263 old_size = other->end - other->start + 1;
1264 if (new->start < other->start)
1265 new_size = other->end - new->start + 1;
1266 else
1267 new_size = new->end - other->start + 1;
1268
1269 /* we're not bigger than the max, unreserve the space and go */
1270 if (new_size <= root->fs_info->max_extent) {
1271 spin_lock(&BTRFS_I(inode)->accounting_lock);
1272 BTRFS_I(inode)->outstanding_extents--;
1273 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1274 return 0;
1275 }
1276
1277 /*
1278 * If we grew by another max_extent, just return, we want to keep that
1279 * reserved amount.
1280 */
1281 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1282 root->fs_info->max_extent);
1283 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1284 root->fs_info->max_extent) > num_extents)
1285 return 0;
1286
1287 spin_lock(&BTRFS_I(inode)->accounting_lock);
1288 BTRFS_I(inode)->outstanding_extents--;
1289 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1290
1291 return 0;
1292}
1293
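
Both hooks above lean on the same ceiling division: a delalloc range of size bytes pins ceil(size / max_extent) reserved extents, and a split or merge only changes the reservation when that count changes. A compilable sketch of the arithmetic (the max_extent value is invented):

    #include <assert.h>
    typedef unsigned long long u64;

    static u64 count_extents(u64 size, u64 max_extent)
    {
        return (size + max_extent - 1) / max_extent;  /* div64_u64 in-kernel */
    }

    int main(void)
    {
        u64 max_extent = 128 * 1024;   /* assumed fs_info->max_extent */

        /* splitting 256K at 128K leaves pieces that together still need
         * the two extents already reserved: no extra reservation */
        assert(count_extents(256 * 1024, max_extent) == 2);
        assert(count_extents(128 * 1024, max_extent) +
               count_extents(128 * 1024, max_extent) == 2);

        /* merging two 64K pieces into 128K fits in one max extent,
         * so one of the two reservations can be dropped */
        assert(count_extents(64 * 1024, max_extent) == 1);
        assert(count_extents(128 * 1024, max_extent) == 1);
        return 0;
    }
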
1162/* 1294/*
1163 * extent_io.c set_bit_hook, used to track delayed allocation 1295 * extent_io.c set_bit_hook, used to track delayed allocation
1164 * bytes in this file, and to maintain the list of inodes that 1296 * bytes in this file, and to maintain the list of inodes that
@@ -1167,6 +1299,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1167static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1299static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1168 unsigned long old, unsigned long bits) 1300 unsigned long old, unsigned long bits)
1169{ 1301{
1302
1170 /* 1303 /*
1171 * set_bit and clear bit hooks normally require _irqsave/restore 1304 * set_bit and clear bit hooks normally require _irqsave/restore
1172 * but in this case, we are only testing for the DELALLOC 1305
@@ -1174,6 +1307,10 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1174 */ 1307 */
1175 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1308 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1176 struct btrfs_root *root = BTRFS_I(inode)->root; 1309 struct btrfs_root *root = BTRFS_I(inode)->root;
1310
1311 spin_lock(&BTRFS_I(inode)->accounting_lock);
1312 BTRFS_I(inode)->outstanding_extents++;
1313 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1177 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1314 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1178 spin_lock(&root->fs_info->delalloc_lock); 1315 spin_lock(&root->fs_info->delalloc_lock);
1179 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1316 BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1190,22 +1327,31 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1190/* 1327/*
1191 * extent_io.c clear_bit_hook, see set_bit_hook for why 1328 * extent_io.c clear_bit_hook, see set_bit_hook for why
1192 */ 1329 */
1193static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, 1330static int btrfs_clear_bit_hook(struct inode *inode,
1194 unsigned long old, unsigned long bits) 1331 struct extent_state *state, unsigned long bits)
1195{ 1332{
1196 /* 1333 /*
1197 * set_bit and clear bit hooks normally require _irqsave/restore 1334 * set_bit and clear bit hooks normally require _irqsave/restore
1198 * but in this case, we are only testing for the DELALLOC 1335
1199 * bit, which is only set or cleared with irqs on 1336 * bit, which is only set or cleared with irqs on
1200 */ 1337 */
1201 if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1338 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
1202 struct btrfs_root *root = BTRFS_I(inode)->root; 1339 struct btrfs_root *root = BTRFS_I(inode)->root;
1203 1340
1341 if (bits & EXTENT_DO_ACCOUNTING) {
1342 spin_lock(&BTRFS_I(inode)->accounting_lock);
1343 BTRFS_I(inode)->outstanding_extents--;
1344 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1345 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
1346 }
1347
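
Note the gate just above: the metadata reservation is only returned when the caller passes EXTENT_DO_ACCOUNTING, so routine clears (writeback completion, for instance) cannot underflow outstanding_extents. A sketch of the invariant; the DO_ACCOUNTING value matches the extent_io.h hunk, the DELALLOC value and the rest are illustrative:

    #include <assert.h>

    #define EXTENT_DELALLOC      0x20    /* assumed bit value */
    #define EXTENT_DO_ACCOUNTING 0x800   /* (1 << 11), per the header */

    struct inode_acct { int outstanding_extents; };

    static void clear_bits(struct inode_acct *i, unsigned long state,
                           unsigned long bits)
    {
        if ((state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
            if (bits & EXTENT_DO_ACCOUNTING)
                i->outstanding_extents--;  /* and unreserve metadata */
        }
    }

    int main(void)
    {
        struct inode_acct i = { .outstanding_extents = 1 };

        /* plain delalloc clear (e.g. IO finishing): no accounting */
        clear_bits(&i, EXTENT_DELALLOC, EXTENT_DELALLOC);
        assert(i.outstanding_extents == 1);

        /* truncate/invalidate paths pass EXTENT_DO_ACCOUNTING */
        clear_bits(&i, EXTENT_DELALLOC,
                   EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING);
        assert(i.outstanding_extents == 0);
        return 0;
    }
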
1204 spin_lock(&root->fs_info->delalloc_lock); 1348 spin_lock(&root->fs_info->delalloc_lock);
1205 if (end - start + 1 > root->fs_info->delalloc_bytes) { 1349 if (state->end - state->start + 1 >
1350 root->fs_info->delalloc_bytes) {
1206 printk(KERN_INFO "btrfs warning: delalloc account " 1351 printk(KERN_INFO "btrfs warning: delalloc account "
1207 "%llu %llu\n", 1352 "%llu %llu\n",
1208 (unsigned long long)end - start + 1, 1353 (unsigned long long)
1354 state->end - state->start + 1,
1209 (unsigned long long) 1355 (unsigned long long)
1210 root->fs_info->delalloc_bytes); 1356 root->fs_info->delalloc_bytes);
1211 btrfs_delalloc_free_space(root, inode, (u64)-1); 1357 btrfs_delalloc_free_space(root, inode, (u64)-1);
@@ -1213,9 +1359,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
1213 BTRFS_I(inode)->delalloc_bytes = 0; 1359 BTRFS_I(inode)->delalloc_bytes = 0;
1214 } else { 1360 } else {
1215 btrfs_delalloc_free_space(root, inode, 1361 btrfs_delalloc_free_space(root, inode,
1216 end - start + 1); 1362 state->end -
1217 root->fs_info->delalloc_bytes -= end - start + 1; 1363 state->start + 1);
1218 BTRFS_I(inode)->delalloc_bytes -= end - start + 1; 1364 root->fs_info->delalloc_bytes -= state->end -
1365 state->start + 1;
1366 BTRFS_I(inode)->delalloc_bytes -= state->end -
1367 state->start + 1;
1219 } 1368 }
1220 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1369 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1221 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1370 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
@@ -2354,7 +2503,19 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2354 2503
2355 root = BTRFS_I(dir)->root; 2504 root = BTRFS_I(dir)->root;
2356 2505
2506 /*
2507 * 5 items for unlink inode
2508 * 1 for orphan
2509 */
2510 ret = btrfs_reserve_metadata_space(root, 6);
2511 if (ret)
2512 return ret;
2513
2357 trans = btrfs_start_transaction(root, 1); 2514 trans = btrfs_start_transaction(root, 1);
2515 if (IS_ERR(trans)) {
2516 btrfs_unreserve_metadata_space(root, 6);
2517 return PTR_ERR(trans);
2518 }
2358 2519
2359 btrfs_set_trans_block_group(trans, dir); 2520 btrfs_set_trans_block_group(trans, dir);
2360 2521
@@ -2369,6 +2530,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2369 nr = trans->blocks_used; 2530 nr = trans->blocks_used;
2370 2531
2371 btrfs_end_transaction_throttle(trans, root); 2532 btrfs_end_transaction_throttle(trans, root);
2533 btrfs_unreserve_metadata_space(root, 6);
2372 btrfs_btree_balance_dirty(root, nr); 2534 btrfs_btree_balance_dirty(root, nr);
2373 return ret; 2535 return ret;
2374} 2536}
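
unlink here (and rmdir just below) adopt the same shape: reserve a worst-case item count up front, undo it if the transaction cannot start, and undo it again once the transaction ends. A userspace sketch of that unwinding, all helper names invented and 0 meaning success throughout:

    static int reserve_metadata(int items)    { (void)items; return 0; }
    static void unreserve_metadata(int items) { (void)items; }
    static int start_transaction(void)        { return 0; }
    static void end_transaction(void)         { }

    static int unlink_like_op(void)
    {
        int ret;

        ret = reserve_metadata(6);     /* 5 items for unlink + 1 for orphan */
        if (ret)
            return ret;

        ret = start_transaction();
        if (ret) {
            unreserve_metadata(6);     /* mirrors the IS_ERR(trans) path */
            return ret;
        }

        /* ... btrfs_unlink_inode(), orphan bookkeeping ... */

        end_transaction();
        unreserve_metadata(6);         /* and the normal exit path */
        return 0;
    }

    int main(void)
    {
        return unlink_like_op();
    }
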
@@ -2449,7 +2611,16 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2449 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2611 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2450 return -ENOTEMPTY; 2612 return -ENOTEMPTY;
2451 2613
2614 ret = btrfs_reserve_metadata_space(root, 5);
2615 if (ret)
2616 return ret;
2617
2452 trans = btrfs_start_transaction(root, 1); 2618 trans = btrfs_start_transaction(root, 1);
2619 if (IS_ERR(trans)) {
2620 btrfs_unreserve_metadata_space(root, 5);
2621 return PTR_ERR(trans);
2622 }
2623
2453 btrfs_set_trans_block_group(trans, dir); 2624 btrfs_set_trans_block_group(trans, dir);
2454 2625
2455 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 2626 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
@@ -2472,6 +2643,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2472out: 2643out:
2473 nr = trans->blocks_used; 2644 nr = trans->blocks_used;
2474 ret = btrfs_end_transaction_throttle(trans, root); 2645 ret = btrfs_end_transaction_throttle(trans, root);
2646 btrfs_unreserve_metadata_space(root, 5);
2475 btrfs_btree_balance_dirty(root, nr); 2647 btrfs_btree_balance_dirty(root, nr);
2476 2648
2477 if (ret && !err) 2649 if (ret && !err)
@@ -2912,12 +3084,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
2912 3084
2913 if ((offset & (blocksize - 1)) == 0) 3085 if ((offset & (blocksize - 1)) == 0)
2914 goto out; 3086 goto out;
3087 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
3088 if (ret)
3089 goto out;
3090
3091 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3092 if (ret)
3093 goto out;
2915 3094
2916 ret = -ENOMEM; 3095 ret = -ENOMEM;
2917again: 3096again:
2918 page = grab_cache_page(mapping, index); 3097 page = grab_cache_page(mapping, index);
2919 if (!page) 3098 if (!page) {
3099 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
3100 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
2920 goto out; 3101 goto out;
3102 }
2921 3103
2922 page_start = page_offset(page); 3104 page_start = page_offset(page);
2923 page_end = page_start + PAGE_CACHE_SIZE - 1; 3105 page_end = page_start + PAGE_CACHE_SIZE - 1;
@@ -2950,7 +3132,16 @@ again:
2950 goto again; 3132 goto again;
2951 } 3133 }
2952 3134
2953 btrfs_set_extent_delalloc(inode, page_start, page_end); 3135 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
3136 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
3137 GFP_NOFS);
3138
3139 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
3140 if (ret) {
3141 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
3142 goto out_unlock;
3143 }
3144
2954 ret = 0; 3145 ret = 0;
2955 if (offset != PAGE_CACHE_SIZE) { 3146 if (offset != PAGE_CACHE_SIZE) {
2956 kaddr = kmap(page); 3147 kaddr = kmap(page);
@@ -2963,6 +3154,9 @@ again:
2963 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3154 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
2964 3155
2965out_unlock: 3156out_unlock:
3157 if (ret)
3158 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
3159 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
2966 unlock_page(page); 3160 unlock_page(page);
2967 page_cache_release(page); 3161 page_cache_release(page);
2968out: 3162out:
@@ -2981,17 +3175,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
2981 u64 last_byte; 3175 u64 last_byte;
2982 u64 cur_offset; 3176 u64 cur_offset;
2983 u64 hole_size; 3177 u64 hole_size;
2984 int err; 3178 int err = 0;
2985 3179
2986 if (size <= hole_start) 3180 if (size <= hole_start)
2987 return 0; 3181 return 0;
2988 3182
2989 err = btrfs_check_metadata_free_space(root); 3183 err = btrfs_truncate_page(inode->i_mapping, inode->i_size);
2990 if (err) 3184 if (err)
2991 return err; 3185 return err;
2992 3186
2993 btrfs_truncate_page(inode->i_mapping, inode->i_size);
2994
2995 while (1) { 3187 while (1) {
2996 struct btrfs_ordered_extent *ordered; 3188 struct btrfs_ordered_extent *ordered;
2997 btrfs_wait_ordered_range(inode, hole_start, 3189 btrfs_wait_ordered_range(inode, hole_start,
@@ -3024,12 +3216,18 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3024 cur_offset, &hint_byte, 1); 3216 cur_offset, &hint_byte, 1);
3025 if (err) 3217 if (err)
3026 break; 3218 break;
3219
3220 err = btrfs_reserve_metadata_space(root, 1);
3221 if (err)
3222 break;
3223
3027 err = btrfs_insert_file_extent(trans, root, 3224 err = btrfs_insert_file_extent(trans, root,
3028 inode->i_ino, cur_offset, 0, 3225 inode->i_ino, cur_offset, 0,
3029 0, hole_size, 0, hole_size, 3226 0, hole_size, 0, hole_size,
3030 0, 0, 0); 3227 0, 0, 0);
3031 btrfs_drop_extent_cache(inode, hole_start, 3228 btrfs_drop_extent_cache(inode, hole_start,
3032 last_byte - 1, 0); 3229 last_byte - 1, 0);
3230 btrfs_unreserve_metadata_space(root, 1);
3033 } 3231 }
3034 free_extent_map(em); 3232 free_extent_map(em);
3035 cur_offset = last_byte; 3233 cur_offset = last_byte;
@@ -3353,6 +3551,7 @@ static noinline void init_btrfs_i(struct inode *inode)
3353 bi->generation = 0; 3551 bi->generation = 0;
3354 bi->sequence = 0; 3552 bi->sequence = 0;
3355 bi->last_trans = 0; 3553 bi->last_trans = 0;
3554 bi->last_sub_trans = 0;
3356 bi->logged_trans = 0; 3555 bi->logged_trans = 0;
3357 bi->delalloc_bytes = 0; 3556 bi->delalloc_bytes = 0;
3358 bi->reserved_bytes = 0; 3557 bi->reserved_bytes = 0;
@@ -3503,12 +3702,14 @@ static int btrfs_dentry_delete(struct dentry *dentry)
3503{ 3702{
3504 struct btrfs_root *root; 3703 struct btrfs_root *root;
3505 3704
3506 if (!dentry->d_inode) 3705 if (!dentry->d_inode && !IS_ROOT(dentry))
3507 return 0; 3706 dentry = dentry->d_parent;
3508 3707
3509 root = BTRFS_I(dentry->d_inode)->root; 3708 if (dentry->d_inode) {
3510 if (btrfs_root_refs(&root->root_item) == 0) 3709 root = BTRFS_I(dentry->d_inode)->root;
3511 return 1; 3710 if (btrfs_root_refs(&root->root_item) == 0)
3711 return 1;
3712 }
3512 return 0; 3713 return 0;
3513} 3714}
3514 3715
@@ -3990,11 +4191,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
3990 if (!new_valid_dev(rdev)) 4191 if (!new_valid_dev(rdev))
3991 return -EINVAL; 4192 return -EINVAL;
3992 4193
3993 err = btrfs_check_metadata_free_space(root); 4194 /*
4195 * 2 for inode item and ref
4196 * 2 for dir items
4197 * 1 for xattr if selinux is on
4198 */
4199 err = btrfs_reserve_metadata_space(root, 5);
3994 if (err) 4200 if (err)
3995 goto fail; 4201 return err;
3996 4202
3997 trans = btrfs_start_transaction(root, 1); 4203 trans = btrfs_start_transaction(root, 1);
4204 if (!trans)
4205 goto fail;
3998 btrfs_set_trans_block_group(trans, dir); 4206 btrfs_set_trans_block_group(trans, dir);
3999 4207
4000 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4208 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -4032,6 +4240,7 @@ out_unlock:
4032 nr = trans->blocks_used; 4240 nr = trans->blocks_used;
4033 btrfs_end_transaction_throttle(trans, root); 4241 btrfs_end_transaction_throttle(trans, root);
4034fail: 4242fail:
4243 btrfs_unreserve_metadata_space(root, 5);
4035 if (drop_inode) { 4244 if (drop_inode) {
4036 inode_dec_link_count(inode); 4245 inode_dec_link_count(inode);
4037 iput(inode); 4246 iput(inode);
@@ -4052,10 +4261,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4052 u64 objectid; 4261 u64 objectid;
4053 u64 index = 0; 4262 u64 index = 0;
4054 4263
4055 err = btrfs_check_metadata_free_space(root); 4264 /*
4265 * 2 for inode item and ref
4266 * 2 for dir items
4267 * 1 for xattr if selinux is on
4268 */
4269 err = btrfs_reserve_metadata_space(root, 5);
4056 if (err) 4270 if (err)
4057 goto fail; 4271 return err;
4272
4058 trans = btrfs_start_transaction(root, 1); 4273 trans = btrfs_start_transaction(root, 1);
4274 if (!trans)
4275 goto fail;
4059 btrfs_set_trans_block_group(trans, dir); 4276 btrfs_set_trans_block_group(trans, dir);
4060 4277
4061 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4278 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -4096,6 +4313,7 @@ out_unlock:
4096 nr = trans->blocks_used; 4313 nr = trans->blocks_used;
4097 btrfs_end_transaction_throttle(trans, root); 4314 btrfs_end_transaction_throttle(trans, root);
4098fail: 4315fail:
4316 btrfs_unreserve_metadata_space(root, 5);
4099 if (drop_inode) { 4317 if (drop_inode) {
4100 inode_dec_link_count(inode); 4318 inode_dec_link_count(inode);
4101 iput(inode); 4319 iput(inode);
@@ -4118,10 +4336,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4118 if (inode->i_nlink == 0) 4336 if (inode->i_nlink == 0)
4119 return -ENOENT; 4337 return -ENOENT;
4120 4338
4121 btrfs_inc_nlink(inode); 4339 /*
4122 err = btrfs_check_metadata_free_space(root); 4340 * 1 item for inode ref
4341 * 2 items for dir items
4342 */
4343 err = btrfs_reserve_metadata_space(root, 3);
4123 if (err) 4344 if (err)
4124 goto fail; 4345 return err;
4346
4347 btrfs_inc_nlink(inode);
4348
4125 err = btrfs_set_inode_index(dir, &index); 4349 err = btrfs_set_inode_index(dir, &index);
4126 if (err) 4350 if (err)
4127 goto fail; 4351 goto fail;
@@ -4145,6 +4369,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4145 nr = trans->blocks_used; 4369 nr = trans->blocks_used;
4146 btrfs_end_transaction_throttle(trans, root); 4370 btrfs_end_transaction_throttle(trans, root);
4147fail: 4371fail:
4372 btrfs_unreserve_metadata_space(root, 3);
4148 if (drop_inode) { 4373 if (drop_inode) {
4149 inode_dec_link_count(inode); 4374 inode_dec_link_count(inode);
4150 iput(inode); 4375 iput(inode);
@@ -4164,17 +4389,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4164 u64 index = 0; 4389 u64 index = 0;
4165 unsigned long nr = 1; 4390 unsigned long nr = 1;
4166 4391
4167 err = btrfs_check_metadata_free_space(root); 4392 /*
4393 * 2 items for inode and ref
4394 * 2 items for dir items
4395 * 1 for xattr if selinux is on
4396 */
4397 err = btrfs_reserve_metadata_space(root, 5);
4168 if (err) 4398 if (err)
4169 goto out_unlock; 4399 return err;
4170 4400
4171 trans = btrfs_start_transaction(root, 1); 4401 trans = btrfs_start_transaction(root, 1);
4172 btrfs_set_trans_block_group(trans, dir); 4402 if (!trans) {
4173 4403 err = -ENOMEM;
4174 if (IS_ERR(trans)) {
4175 err = PTR_ERR(trans);
4176 goto out_unlock; 4404 goto out_unlock;
4177 } 4405 }
4406 btrfs_set_trans_block_group(trans, dir);
4178 4407
4179 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4408 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4180 if (err) { 4409 if (err) {
@@ -4223,6 +4452,7 @@ out_fail:
4223 btrfs_end_transaction_throttle(trans, root); 4452 btrfs_end_transaction_throttle(trans, root);
4224 4453
4225out_unlock: 4454out_unlock:
4455 btrfs_unreserve_metadata_space(root, 5);
4226 if (drop_on_err) 4456 if (drop_on_err)
4227 iput(inode); 4457 iput(inode);
4228 btrfs_btree_balance_dirty(root, nr); 4458 btrfs_btree_balance_dirty(root, nr);
@@ -4684,7 +4914,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4684 */ 4914 */
4685 clear_extent_bit(tree, page_start, page_end, 4915 clear_extent_bit(tree, page_start, page_end,
4686 EXTENT_DIRTY | EXTENT_DELALLOC | 4916 EXTENT_DIRTY | EXTENT_DELALLOC |
4687 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); 4917 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
4918 NULL, GFP_NOFS);
4688 /* 4919 /*
4689 * whoever cleared the private bit is responsible 4920 * whoever cleared the private bit is responsible
4690 * for the finish_ordered_io 4921 * for the finish_ordered_io
@@ -4697,8 +4928,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4697 lock_extent(tree, page_start, page_end, GFP_NOFS); 4928 lock_extent(tree, page_start, page_end, GFP_NOFS);
4698 } 4929 }
4699 clear_extent_bit(tree, page_start, page_end, 4930 clear_extent_bit(tree, page_start, page_end,
4700 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 4931 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4701 1, 1, NULL, GFP_NOFS); 4932 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
4702 __btrfs_releasepage(page, GFP_NOFS); 4933 __btrfs_releasepage(page, GFP_NOFS);
4703 4934
4704 ClearPageChecked(page); 4935 ClearPageChecked(page);
@@ -4747,6 +4978,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
4747 goto out; 4978 goto out;
4748 } 4979 }
4749 4980
4981 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
4982 if (ret) {
4983 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
4984 ret = VM_FAULT_SIGBUS;
4985 goto out;
4986 }
4987
4750 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 4988 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
4751again: 4989again:
4752 lock_page(page); 4990 lock_page(page);
@@ -4778,7 +5016,24 @@ again:
4778 goto again; 5016 goto again;
4779 } 5017 }
4780 5018
4781 btrfs_set_extent_delalloc(inode, page_start, page_end); 5019 /*
5020 * XXX - page_mkwrite gets called every time the page is dirtied, even
5021 * if it was already dirty, so for space accounting reasons we need to
5022 * clear any delalloc bits for the range we are fixing to save. There
5023 * is probably a better way to do this, but for now keep consistent with
5024 * prepare_pages in the normal write path.
5025 */
5026 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
5027 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
5028 GFP_NOFS);
5029
5030 ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
5031 if (ret) {
5032 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
5033 ret = VM_FAULT_SIGBUS;
5034 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5035 goto out_unlock;
5036 }
4782 ret = 0; 5037 ret = 0;
4783 5038
4784 /* page is wholly or partially inside EOF */ 5039 /* page is wholly or partially inside EOF */
@@ -4797,10 +5052,13 @@ again:
4797 set_page_dirty(page); 5052 set_page_dirty(page);
4798 SetPageUptodate(page); 5053 SetPageUptodate(page);
4799 5054
4800 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 5055 BTRFS_I(inode)->last_trans = root->fs_info->generation;
5056 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
5057
4801 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5058 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4802 5059
4803out_unlock: 5060out_unlock:
5061 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
4804 if (!ret) 5062 if (!ret)
4805 return VM_FAULT_LOCKED; 5063 return VM_FAULT_LOCKED;
4806 unlock_page(page); 5064 unlock_page(page);
@@ -4821,7 +5079,9 @@ static void btrfs_truncate(struct inode *inode)
4821 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 5079 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4822 return; 5080 return;
4823 5081
4824 btrfs_truncate_page(inode->i_mapping, inode->i_size); 5082 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
5083 if (ret)
5084 return;
4825 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 5085 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
4826 5086
4827 trans = btrfs_start_transaction(root, 1); 5087 trans = btrfs_start_transaction(root, 1);
@@ -4916,7 +5176,12 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
4916 if (!ei) 5176 if (!ei)
4917 return NULL; 5177 return NULL;
4918 ei->last_trans = 0; 5178 ei->last_trans = 0;
5179 ei->last_sub_trans = 0;
4919 ei->logged_trans = 0; 5180 ei->logged_trans = 0;
5181 ei->outstanding_extents = 0;
5182 ei->reserved_extents = 0;
5183 ei->root = NULL;
5184 spin_lock_init(&ei->accounting_lock);
4920 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 5185 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
4921 INIT_LIST_HEAD(&ei->i_orphan); 5186 INIT_LIST_HEAD(&ei->i_orphan);
4922 INIT_LIST_HEAD(&ei->ordered_operations); 5187 INIT_LIST_HEAD(&ei->ordered_operations);
@@ -4932,6 +5197,14 @@ void btrfs_destroy_inode(struct inode *inode)
4932 WARN_ON(inode->i_data.nrpages); 5197 WARN_ON(inode->i_data.nrpages);
4933 5198
4934 /* 5199 /*
5200 * This can happen when we create an inode, but somebody else also
5201 * created the same inode and we need to destroy the one we already
5202 * created.
5203 */
5204 if (!root)
5205 goto free;
5206
5207 /*
4935 * Make sure we're properly removed from the ordered operation 5208 * Make sure we're properly removed from the ordered operation
4936 * lists. 5209 * lists.
4937 */ 5210 */
@@ -4966,6 +5239,7 @@ void btrfs_destroy_inode(struct inode *inode)
4966 } 5239 }
4967 inode_tree_del(inode); 5240 inode_tree_del(inode);
4968 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 5241 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
5242free:
4969 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 5243 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
4970} 5244}
4971 5245
@@ -5070,7 +5344,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5070 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 5344 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
5071 return -ENOTEMPTY; 5345 return -ENOTEMPTY;
5072 5346
5073 ret = btrfs_check_metadata_free_space(root); 5347 /*
5348 * We want to reserve the absolute worst case amount of items. So if
5349 * both inodes are subvols and we need to unlink them then that would
5350 * require 4 item modifications, but if they are both normal inodes it
5351 * would require 5 item modifications, so we'll assume they're normal
5352 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
5353 * should cover the worst case number of items we'll modify.
5354 */
5355 ret = btrfs_reserve_metadata_space(root, 11);
5074 if (ret) 5356 if (ret)
5075 return ret; 5357 return ret;
5076 5358
@@ -5185,6 +5467,8 @@ out_fail:
5185 5467
5186 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 5468 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5187 up_read(&root->fs_info->subvol_sem); 5469 up_read(&root->fs_info->subvol_sem);
5470
5471 btrfs_unreserve_metadata_space(root, 11);
5188 return ret; 5472 return ret;
5189} 5473}
5190 5474
@@ -5256,11 +5540,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5256 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 5540 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
5257 return -ENAMETOOLONG; 5541 return -ENAMETOOLONG;
5258 5542
5259 err = btrfs_check_metadata_free_space(root); 5543 /*
5544 * 2 items for inode item and ref
5545 * 2 items for dir items
5546 * 1 item for xattr if selinux is on
5547 */
5548 err = btrfs_reserve_metadata_space(root, 5);
5260 if (err) 5549 if (err)
5261 goto out_fail; 5550 return err;
5262 5551
5263 trans = btrfs_start_transaction(root, 1); 5552 trans = btrfs_start_transaction(root, 1);
5553 if (!trans)
5554 goto out_fail;
5264 btrfs_set_trans_block_group(trans, dir); 5555 btrfs_set_trans_block_group(trans, dir);
5265 5556
5266 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 5557 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -5341,6 +5632,7 @@ out_unlock:
5341 nr = trans->blocks_used; 5632 nr = trans->blocks_used;
5342 btrfs_end_transaction_throttle(trans, root); 5633 btrfs_end_transaction_throttle(trans, root);
5343out_fail: 5634out_fail:
5635 btrfs_unreserve_metadata_space(root, 5);
5344 if (drop_inode) { 5636 if (drop_inode) {
5345 inode_dec_link_count(inode); 5637 inode_dec_link_count(inode);
5346 iput(inode); 5638 iput(inode);
@@ -5362,6 +5654,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5362 5654
5363 while (num_bytes > 0) { 5655 while (num_bytes > 0) {
5364 alloc_size = min(num_bytes, root->fs_info->max_extent); 5656 alloc_size = min(num_bytes, root->fs_info->max_extent);
5657
5658 ret = btrfs_reserve_metadata_space(root, 1);
5659 if (ret)
5660 goto out;
5661
5365 ret = btrfs_reserve_extent(trans, root, alloc_size, 5662 ret = btrfs_reserve_extent(trans, root, alloc_size,
5366 root->sectorsize, 0, alloc_hint, 5663 root->sectorsize, 0, alloc_hint,
5367 (u64)-1, &ins, 1); 5664 (u64)-1, &ins, 1);
@@ -5381,6 +5678,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
5381 num_bytes -= ins.offset; 5678 num_bytes -= ins.offset;
5382 cur_offset += ins.offset; 5679 cur_offset += ins.offset;
5383 alloc_hint = ins.objectid + ins.offset; 5680 alloc_hint = ins.objectid + ins.offset;
5681 btrfs_unreserve_metadata_space(root, 1);
5384 } 5682 }
5385out: 5683out:
5386 if (cur_offset > start) { 5684 if (cur_offset > start) {
@@ -5544,7 +5842,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
5544 .permission = btrfs_permission, 5842 .permission = btrfs_permission,
5545}; 5843};
5546 5844
5547static struct file_operations btrfs_dir_file_operations = { 5845static const struct file_operations btrfs_dir_file_operations = {
5548 .llseek = generic_file_llseek, 5846 .llseek = generic_file_llseek,
5549 .read = generic_read_dir, 5847 .read = generic_read_dir,
5550 .readdir = btrfs_real_readdir, 5848 .readdir = btrfs_real_readdir,
@@ -5566,6 +5864,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
5566 .readpage_io_failed_hook = btrfs_io_failed_hook, 5864 .readpage_io_failed_hook = btrfs_io_failed_hook,
5567 .set_bit_hook = btrfs_set_bit_hook, 5865 .set_bit_hook = btrfs_set_bit_hook,
5568 .clear_bit_hook = btrfs_clear_bit_hook, 5866 .clear_bit_hook = btrfs_clear_bit_hook,
5867 .merge_extent_hook = btrfs_merge_extent_hook,
5868 .split_extent_hook = btrfs_split_extent_hook,
5569}; 5869};
5570 5870
5571/* 5871/*
@@ -5632,6 +5932,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
5632 .removexattr = btrfs_removexattr, 5932 .removexattr = btrfs_removexattr,
5633}; 5933};
5634 5934
5635struct dentry_operations btrfs_dentry_operations = { 5935const struct dentry_operations btrfs_dentry_operations = {
5636 .d_delete = btrfs_dentry_delete, 5936 .d_delete = btrfs_dentry_delete,
5637}; 5937};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a8577a7f26ab..cdbb054102b9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,7 +239,13 @@ static noinline int create_subvol(struct btrfs_root *root,
239 u64 index = 0; 239 u64 index = 0;
240 unsigned long nr = 1; 240 unsigned long nr = 1;
241 241
242 ret = btrfs_check_metadata_free_space(root); 242 /*
243 * 1 - inode item
244 * 2 - refs
245 * 1 - root item
246 * 2 - dir items
247 */
248 ret = btrfs_reserve_metadata_space(root, 6);
243 if (ret) 249 if (ret)
244 return ret; 250 return ret;
245 251
@@ -340,6 +346,9 @@ fail:
340 err = btrfs_commit_transaction(trans, root); 346 err = btrfs_commit_transaction(trans, root);
341 if (err && !ret) 347 if (err && !ret)
342 ret = err; 348 ret = err;
349
350 btrfs_unreserve_metadata_space(root, 6);
351 btrfs_btree_balance_dirty(root, nr);
343 return ret; 352 return ret;
344} 353}
345 354
@@ -355,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
355 if (!root->ref_cows) 364 if (!root->ref_cows)
356 return -EINVAL; 365 return -EINVAL;
357 366
358 ret = btrfs_check_metadata_free_space(root); 367 /*
368 * 1 - inode item
369 * 2 - refs
370 * 1 - root item
371 * 2 - dir items
372 */
373 ret = btrfs_reserve_metadata_space(root, 6);
359 if (ret) 374 if (ret)
360 goto fail_unlock; 375 goto fail_unlock;
361 376
362 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 377 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
363 if (!pending_snapshot) { 378 if (!pending_snapshot) {
364 ret = -ENOMEM; 379 ret = -ENOMEM;
380 btrfs_unreserve_metadata_space(root, 6);
365 goto fail_unlock; 381 goto fail_unlock;
366 } 382 }
367 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); 383 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
368 if (!pending_snapshot->name) { 384 if (!pending_snapshot->name) {
369 ret = -ENOMEM; 385 ret = -ENOMEM;
370 kfree(pending_snapshot); 386 kfree(pending_snapshot);
387 btrfs_unreserve_metadata_space(root, 6);
371 goto fail_unlock; 388 goto fail_unlock;
372 } 389 }
373 memcpy(pending_snapshot->name, name, namelen); 390 memcpy(pending_snapshot->name, name, namelen);
@@ -813,6 +830,7 @@ out_up_write:
813out_unlock: 830out_unlock:
814 mutex_unlock(&inode->i_mutex); 831 mutex_unlock(&inode->i_mutex);
815 if (!err) { 832 if (!err) {
833 shrink_dcache_sb(root->fs_info->sb);
816 btrfs_invalidate_inodes(dest); 834 btrfs_invalidate_inodes(dest);
817 d_delete(dentry); 835 d_delete(dentry);
818 } 836 }
@@ -1105,8 +1123,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1105 datao += off - key.offset; 1123 datao += off - key.offset;
1106 datal -= off - key.offset; 1124 datal -= off - key.offset;
1107 } 1125 }
1108 if (key.offset + datao + datal > off + len) 1126
1109 datal = off + len - key.offset - datao; 1127 if (key.offset + datal > off + len)
1128 datal = off + len - key.offset;
1129
1110 /* disko == 0 means it's a hole */ 1130 /* disko == 0 means it's a hole */
1111 if (!disko) 1131 if (!disko)
1112 datao = 0; 1132 datao = 0;
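
The clamp fix above matters when cloning from the middle of a source extent: datao is an offset into the on-disk extent, so it must not appear in the file-offset comparison. A sketch of the corrected arithmetic:

    #include <assert.h>
    typedef unsigned long long u64;

    static u64 clamp_datal(u64 key_offset, u64 datal, u64 off, u64 len)
    {
        /* trim the extent to the clone window [off, off + len);
         * the old code wrongly folded datao into this bound */
        if (key_offset + datal > off + len)
            datal = off + len - key_offset;
        return datal;
    }

    int main(void)
    {
        /* extent covers file range [100, 200); clone window is [0, 150) */
        assert(clamp_datal(100, 100, 0, 150) == 50);
        return 0;
    }
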
@@ -1215,15 +1235,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
1215 struct inode *inode = fdentry(file)->d_inode; 1235 struct inode *inode = fdentry(file)->d_inode;
1216 struct btrfs_root *root = BTRFS_I(inode)->root; 1236 struct btrfs_root *root = BTRFS_I(inode)->root;
1217 struct btrfs_trans_handle *trans; 1237 struct btrfs_trans_handle *trans;
1218 int ret = 0; 1238 int ret;
1219 1239
1240 ret = -EPERM;
1220 if (!capable(CAP_SYS_ADMIN)) 1241 if (!capable(CAP_SYS_ADMIN))
1221 return -EPERM; 1242 goto out;
1222 1243
1223 if (file->private_data) { 1244 ret = -EINPROGRESS;
1224 ret = -EINPROGRESS; 1245 if (file->private_data)
1225 goto out; 1246 goto out;
1226 }
1227 1247
1228 ret = mnt_want_write(file->f_path.mnt); 1248 ret = mnt_want_write(file->f_path.mnt);
1229 if (ret) 1249 if (ret)
@@ -1233,12 +1253,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
1233 root->fs_info->open_ioctl_trans++; 1253 root->fs_info->open_ioctl_trans++;
1234 mutex_unlock(&root->fs_info->trans_mutex); 1254 mutex_unlock(&root->fs_info->trans_mutex);
1235 1255
1256 ret = -ENOMEM;
1236 trans = btrfs_start_ioctl_transaction(root, 0); 1257 trans = btrfs_start_ioctl_transaction(root, 0);
1237 if (trans) 1258 if (!trans)
1238 file->private_data = trans; 1259 goto out_drop;
1239 else 1260
1240 ret = -ENOMEM; 1261 file->private_data = trans;
1241 /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ 1262 return 0;
1263
1264out_drop:
1265 mutex_lock(&root->fs_info->trans_mutex);
1266 root->fs_info->open_ioctl_trans--;
1267 mutex_unlock(&root->fs_info->trans_mutex);
1268 mnt_drop_write(file->f_path.mnt);
1242out: 1269out:
1243 return ret; 1270 return ret;
1244} 1271}
@@ -1254,24 +1281,20 @@ long btrfs_ioctl_trans_end(struct file *file)
1254 struct inode *inode = fdentry(file)->d_inode; 1281 struct inode *inode = fdentry(file)->d_inode;
1255 struct btrfs_root *root = BTRFS_I(inode)->root; 1282 struct btrfs_root *root = BTRFS_I(inode)->root;
1256 struct btrfs_trans_handle *trans; 1283 struct btrfs_trans_handle *trans;
1257 int ret = 0;
1258 1284
1259 trans = file->private_data; 1285 trans = file->private_data;
1260 if (!trans) { 1286 if (!trans)
1261 ret = -EINVAL; 1287 return -EINVAL;
1262 goto out;
1263 }
1264 btrfs_end_transaction(trans, root);
1265 file->private_data = NULL; 1288 file->private_data = NULL;
1266 1289
1290 btrfs_end_transaction(trans, root);
1291
1267 mutex_lock(&root->fs_info->trans_mutex); 1292 mutex_lock(&root->fs_info->trans_mutex);
1268 root->fs_info->open_ioctl_trans--; 1293 root->fs_info->open_ioctl_trans--;
1269 mutex_unlock(&root->fs_info->trans_mutex); 1294 mutex_unlock(&root->fs_info->trans_mutex);
1270 1295
1271 mnt_drop_write(file->f_path.mnt); 1296 mnt_drop_write(file->f_path.mnt);
1272 1297 return 0;
1273out:
1274 return ret;
1275} 1298}
1276 1299
1277long btrfs_ioctl(struct file *file, unsigned int 1300long btrfs_ioctl(struct file *file, unsigned int
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b5d6d24726b0..5799bc46a309 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -306,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 	tree->last = NULL;
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
 
+	spin_lock(&BTRFS_I(inode)->accounting_lock);
+	BTRFS_I(inode)->outstanding_extents--;
+	spin_unlock(&BTRFS_I(inode)->accounting_lock);
+	btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
+					      inode, 1);
+
 	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 	list_del_init(&entry->root_extent_list);
 
@@ -458,7 +464,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
 	 * start IO on any dirty ones so the wait doesn't stall waiting
 	 * for pdflush to find them
 	 */
-	btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
+	filemap_fdatawrite_range(inode->i_mapping, start, end);
 	if (wait) {
 		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
 						 &entry->flags));
@@ -488,17 +494,15 @@ again:
 	/* start IO across the range first to instantiate any delalloc
 	 * extents
 	 */
-	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
+	filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
 
 	/* The compression code will leave pages locked but return from
 	 * writepage without setting the page writeback.  Starting again
 	 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
 	 */
-	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
+	filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
 
-	btrfs_wait_on_page_writeback_range(inode->i_mapping,
-					   start >> PAGE_CACHE_SHIFT,
-					   orig_end >> PAGE_CACHE_SHIFT);
+	filemap_fdatawait_range(inode->i_mapping, start, orig_end);
 
 	end = orig_end;
 	found = 0;
@@ -716,89 +720,6 @@ out:
 }
 
 
-/**
- * taken from mm/filemap.c because it isn't exported
- *
- * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
- * @mapping:	address space structure to write
- * @start:	offset in bytes where the range starts
- * @end:	offset in bytes where the range ends (inclusive)
- * @sync_mode:	enable synchronous operation
- *
- * Start writeback against all of a mapping's dirty pages that lie
- * within the byte offsets <start, end> inclusive.
- *
- * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
- * opposed to a regular memory cleansing writeback.  The difference between
- * these two operations is that if a dirty page/buffer is encountered, it must
- * be waited upon, and not just skipped over.
- */
-int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
-			   loff_t end, int sync_mode)
-{
-	struct writeback_control wbc = {
-		.sync_mode = sync_mode,
-		.nr_to_write = mapping->nrpages * 2,
-		.range_start = start,
-		.range_end = end,
-	};
-	return btrfs_writepages(mapping, &wbc);
-}
-
-/**
- * taken from mm/filemap.c because it isn't exported
- *
- * wait_on_page_writeback_range - wait for writeback to complete
- * @mapping:	target address_space
- * @start:	beginning page index
- * @end:	ending page index
- *
- * Wait for writeback to complete against pages indexed by start->end
- * inclusive
- */
-int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
-				       pgoff_t start, pgoff_t end)
-{
-	struct pagevec pvec;
-	int nr_pages;
-	int ret = 0;
-	pgoff_t index;
-
-	if (end < start)
-		return 0;
-
-	pagevec_init(&pvec, 0);
-	index = start;
-	while ((index <= end) &&
-	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-			PAGECACHE_TAG_WRITEBACK,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
-		unsigned i;
-
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-
-			/* until radix tree lookup accepts end_index */
-			if (page->index > end)
-				continue;
-
-			wait_on_page_writeback(page);
-			if (PageError(page))
-				ret = -EIO;
-		}
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-
-	/* Check for outstanding write errors */
-	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
-		ret = -ENOSPC;
-	if (test_and_clear_bit(AS_EIO, &mapping->flags))
-		ret = -EIO;
-
-	return ret;
-}
-
 /*
  * add a given inode to the list of inodes that must be fully on
  * disk before a transaction commit finishes.
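The hunks above drop btrfs's private copies of the mm writeback helpers in favor of the now-exported filemap_fdatawrite_range()/filemap_fdatawait_range(), which take byte offsets directly rather than page indexes. A sketch of the resulting flush-then-wait sequence, assuming kernel context (return values ignored for brevity):

#include <linux/fs.h>

/* Sketch, kernel context: push dirty pages in [start, end] and wait for
 * the writeback to finish, using the exported mm helpers instead of the
 * removed btrfs_* copies.  Byte offsets, end inclusive. */
static void flush_range_sketch(struct address_space *mapping,
			       loff_t start, loff_t end)
{
	/* start writeback; the compression code may return from writepage
	 * before pages are marked writeback, hence the second pass */
	filemap_fdatawrite_range(mapping, start, end);
	filemap_fdatawrite_range(mapping, start, end);

	/* wait on everything in the byte range; no more shifting to page
	 * indexes by hand as the removed helper required */
	filemap_fdatawait_range(mapping, start, end);
}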
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 993a7ea45c70..f82e87488ca8 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -153,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_ordered_update_i_size(struct inode *inode,
 				struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
-				       pgoff_t start, pgoff_t end);
-int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
-			   loff_t end, int sync_mode);
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
 int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
 int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 361ad323faac..cfcc93c93a7b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3518,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	BUG_ON(!rc->block_group);
 
 	btrfs_init_workers(&rc->workers, "relocate",
-			   fs_info->thread_pool_size);
+			   fs_info->thread_pool_size, NULL);
 
 	rc->extent_root = extent_root;
 	btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
@@ -3701,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 	mapping_tree_init(&rc->reloc_root_tree);
 	INIT_LIST_HEAD(&rc->reloc_roots);
 	btrfs_init_workers(&rc->workers, "relocate",
-			   root->fs_info->thread_pool_size);
+			   root->fs_info->thread_pool_size, NULL);
 	rc->extent_root = root->fs_info->extent_root;
 
 	set_reloc_control(rc);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 9351428f30e2..67fa2d29d663 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -159,7 +159,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	write_extent_buffer(l, item, ptr, sizeof(*item));
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -332,7 +331,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	BUG_ON(refs != 0);
 	ret = btrfs_del_item(trans, root, path);
 out:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return ret;
 }
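The dropped btrfs_release_path() calls were redundant rather than leaked: btrfs_free_path() already releases the path before freeing it. Roughly, as a sketch of the helper in this era of the code rather than a verbatim quote:

/* Sketch: why the explicit release on the out: paths above was redundant */
void btrfs_free_path(struct btrfs_path *p)
{
	btrfs_release_path(NULL, p);		/* drop any held extent buffers */
	kmem_cache_free(btrfs_path_cachep, p);
}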
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67035385444c..752a5463bf53 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,7 +66,8 @@ enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
 	Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
-	Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
+	Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
+	Opt_discard, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -88,6 +89,7 @@ static match_table_t tokens = {
 	{Opt_notreelog, "notreelog"},
 	{Opt_flushoncommit, "flushoncommit"},
 	{Opt_ratio, "metadata_ratio=%d"},
+	{Opt_discard, "discard"},
 	{Opt_err, NULL},
 };
 
@@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 					info->metadata_ratio);
 			}
 			break;
+		case Opt_discard:
+			btrfs_set_opt(info->mount_opt, DISCARD);
+			break;
 		default:
 			break;
 		}
@@ -344,7 +349,9 @@ static int btrfs_fill_super(struct super_block *sb,
 	sb->s_export_op = &btrfs_export_ops;
 	sb->s_xattr = btrfs_xattr_handlers;
 	sb->s_time_gran = 1;
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
 	sb->s_flags |= MS_POSIXACL;
+#endif
 
 	tree_root = open_ctree(sb, fs_devices, (char *)data);
 
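As the super.c hunks show, a new mount option touches three places: the token enum, the match table, and the parser switch. A self-contained userspace approximation of that flow, where match_token_sketch() stands in for the kernel's pattern-aware match_token() and the flag bit is illustrative:

#include <stdio.h>
#include <string.h>

enum { Opt_discard, Opt_err };

static const struct { int token; const char *pattern; } tokens[] = {
	{ Opt_discard, "discard" },
	{ Opt_err, NULL },		/* sentinel, as in match_table_t */
};

static int match_token_sketch(const char *opt)
{
	int i;

	for (i = 0; tokens[i].pattern; i++)
		if (!strcmp(opt, tokens[i].pattern))
			return tokens[i].token;
	return Opt_err;
}

int main(void)
{
	unsigned long mount_opt = 0;

	switch (match_token_sketch("discard")) {
	case Opt_discard:
		mount_opt |= 1UL << 0;	/* stands in for btrfs_set_opt(.., DISCARD) */
		break;
	default:
		break;
	}
	printf("mount_opt = %#lx\n", mount_opt);
	return 0;
}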
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 88f866f85e7a..c207e8c32c9b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -163,8 +163,14 @@ static void wait_current_trans(struct btrfs_root *root)
 	}
 }
 
+enum btrfs_trans_type {
+	TRANS_START,
+	TRANS_JOIN,
+	TRANS_USERSPACE,
+};
+
 static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
-						    int num_blocks, int wait)
+						    int num_blocks, int type)
 {
 	struct btrfs_trans_handle *h =
 		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
@@ -172,7 +178,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	if (!root->fs_info->log_root_recovering &&
-	    ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
+	    ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
+	     type == TRANS_USERSPACE))
 		wait_current_trans(root);
 	ret = join_transaction(root);
 	BUG_ON(ret);
@@ -186,6 +193,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	h->alloc_exclude_start = 0;
 	h->delayed_ref_updates = 0;
 
+	if (!current->journal_info && type != TRANS_USERSPACE)
+		current->journal_info = h;
+
 	root->fs_info->running_transaction->use_count++;
 	record_root_in_trans(h, root);
 	mutex_unlock(&root->fs_info->trans_mutex);
@@ -195,18 +205,18 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_blocks)
 {
-	return start_transaction(root, num_blocks, 1);
+	return start_transaction(root, num_blocks, TRANS_START);
 }
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
 						  int num_blocks)
 {
-	return start_transaction(root, num_blocks, 0);
+	return start_transaction(root, num_blocks, TRANS_JOIN);
 }
 
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
 							 int num_blocks)
 {
-	return start_transaction(r, num_blocks, 2);
+	return start_transaction(r, num_blocks, TRANS_USERSPACE);
 }
 
 /* wait for a transaction commit to be fully complete */
@@ -317,6 +327,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	wake_up(&cur_trans->writer_wait);
 	put_transaction(cur_trans);
 	mutex_unlock(&info->trans_mutex);
+
+	if (current->journal_info == trans)
+		current->journal_info = NULL;
 	memset(trans, 0, sizeof(*trans));
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
@@ -338,10 +351,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 /*
  * when btree blocks are allocated, they have some corresponding bits set for
  * them in one of two extent_io trees.  This is used to make sure all of
- * those extents are on disk for transaction or log commit
+ * those extents are sent to disk but does not wait on them
 */
-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
-					struct extent_io_tree *dirty_pages)
+int btrfs_write_marked_extents(struct btrfs_root *root,
+			       struct extent_io_tree *dirty_pages)
 {
 	int ret;
 	int err = 0;
@@ -388,6 +401,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 			page_cache_release(page);
 		}
 	}
+	if (err)
+		werr = err;
+	return werr;
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees.  This is used to make sure all of
+ * those extents are on disk for transaction or log commit.  We wait
+ * on all the pages and clear them from the dirty pages state tree
+ */
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+			      struct extent_io_tree *dirty_pages)
+{
+	int ret;
+	int err = 0;
+	int werr = 0;
+	struct page *page;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	u64 start = 0;
+	u64 end;
+	unsigned long index;
+
 	while (1) {
 		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
 					    EXTENT_DIRTY);
@@ -418,6 +454,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 	return werr;
 }
 
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees.  This is used to make sure all of
+ * those extents are on disk for transaction or log commit
+ */
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages)
+{
+	int ret;
+	int ret2;
+
+	ret = btrfs_write_marked_extents(root, dirty_pages);
+	ret2 = btrfs_wait_marked_extents(root, dirty_pages);
+	return ret || ret2;
+}
+
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root)
 {
@@ -743,6 +795,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	memcpy(&pending->root_key, &key, sizeof(key));
 fail:
 	kfree(new_root_item);
+	btrfs_unreserve_metadata_space(root, 6);
 	return ret;
 }
 
@@ -1059,6 +1112,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	mutex_unlock(&root->fs_info->trans_mutex);
 
+	if (current->journal_info == trans)
+		current->journal_info = NULL;
+
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 	return ret;
 }
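Two threads run through the transaction.c changes: btrfs_write_and_wait_marked_extents() is split so the tree log can overlap btree writeout with other work, and start_transaction() now parks the handle in current->journal_info so re-entry can be detected. A userspace analogue of that second pattern, using a thread-local slot in place of the task-struct field (illustrative only, not kernel code):

#include <assert.h>
#include <stddef.h>

struct trans_handle { int id; };

/* one slot per task, like current->journal_info */
static _Thread_local struct trans_handle *journal_info;

static void trans_start(struct trans_handle *h, int userspace)
{
	/* TRANS_USERSPACE handles are not recorded, mirroring the patch */
	if (!journal_info && !userspace)
		journal_info = h;
}

static void trans_end(struct trans_handle *h)
{
	if (journal_info == h)	/* never clobber an outer transaction */
		journal_info = NULL;
}

int main(void)
{
	struct trans_handle outer = {1}, inner = {2};

	trans_start(&outer, 0);
	trans_start(&inner, 0);		/* nested: slot already taken */
	trans_end(&inner);		/* must not clear the outer handle */
	assert(journal_info == &outer);
	trans_end(&outer);
	assert(journal_info == NULL);
	return 0;
}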
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 663c67404918..d4e3e7a6938c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 					      struct inode *inode)
 {
 	BTRFS_I(inode)->last_trans = trans->transaction->transid;
+	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 }
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root);
 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 					struct extent_io_tree *dirty_pages);
+int btrfs_write_marked_extents(struct btrfs_root *root,
+			       struct extent_io_tree *dirty_pages);
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+			      struct extent_io_tree *dirty_pages);
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
 #endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7827841b55cb..741666a7676a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&root->log_mutex);
 	if (root->log_root) {
+		if (!root->log_start_pid) {
+			root->log_start_pid = current->pid;
+			root->log_multiple_pids = false;
+		} else if (root->log_start_pid != current->pid) {
+			root->log_multiple_pids = true;
+		}
+
 		root->log_batch++;
 		atomic_inc(&root->log_writers);
 		mutex_unlock(&root->log_mutex);
 		return 0;
 	}
+	root->log_multiple_pids = false;
+	root->log_start_pid = current->pid;
 	mutex_lock(&root->fs_info->tree_log_mutex);
 	if (!root->fs_info->log_root_tree) {
 		ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
+	u64 log_transid = 0;
 
 	mutex_lock(&root->log_mutex);
 	index1 = root->log_transid % 2;
@@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	while (1) {
 		unsigned long batch = root->log_batch;
-		mutex_unlock(&root->log_mutex);
-		schedule_timeout_uninterruptible(1);
-		mutex_lock(&root->log_mutex);
-
+		if (root->log_multiple_pids) {
+			mutex_unlock(&root->log_mutex);
+			schedule_timeout_uninterruptible(1);
+			mutex_lock(&root->log_mutex);
+		}
 		wait_for_writer(trans, root);
 		if (batch == root->log_batch)
 			break;
@@ -2003,14 +2014,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
+	/* we start IO on all the marked extents here, but we don't actually
+	 * wait for them until later.
+	 */
+	ret = btrfs_write_marked_extents(log, &log->dirty_log_pages);
 	BUG_ON(ret);
 
 	btrfs_set_root_node(&log->root_item, log->node);
 
 	root->log_batch = 0;
+	log_transid = root->log_transid;
 	root->log_transid++;
 	log->log_transid = root->log_transid;
+	root->log_start_pid = 0;
 	smp_mb();
 	/*
 	 * log tree has been flushed to disk, new modifications of
@@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	index2 = log_root_tree->log_transid % 2;
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
+		btrfs_wait_marked_extents(log, &log->dirty_log_pages);
 		wait_log_commit(trans, log_root_tree,
 				log_root_tree->log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
@@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * check the full commit flag again
 	 */
 	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+		btrfs_wait_marked_extents(log, &log->dirty_log_pages);
 		mutex_unlock(&log_root_tree->log_mutex);
 		ret = -EAGAIN;
 		goto out_wake_log_root;
@@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
 				&log_root_tree->dirty_log_pages);
 	BUG_ON(ret);
+	btrfs_wait_marked_extents(log, &log->dirty_log_pages);
 
 	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
 				 log_root_tree->node->start);
@@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * the running transaction open, so a full commit can't hop
 	 * in and cause problems either.
 	 */
-	write_ctree_super(trans, root->fs_info->tree_root, 2);
+	write_ctree_super(trans, root->fs_info->tree_root, 1);
 	ret = 0;
 
+	mutex_lock(&root->log_mutex);
+	if (root->last_log_commit < log_transid)
+		root->last_log_commit = log_transid;
+	mutex_unlock(&root->log_mutex);
+
 out_wake_log_root:
 	atomic_set(&log_root_tree->log_commit[index2], 0);
 	smp_mb();
@@ -2852,6 +2876,21 @@ out:
 	return ret;
 }
 
+static int inode_in_log(struct btrfs_trans_handle *trans,
+			struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret = 0;
+
+	mutex_lock(&root->log_mutex);
+	if (BTRFS_I(inode)->logged_trans == trans->transid &&
+	    BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
+		ret = 1;
+	mutex_unlock(&root->log_mutex);
+	return ret;
+}
+
+
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log.  A minimal inode and backref
@@ -2891,6 +2930,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto end_no_trans;
 
+	if (inode_in_log(trans, inode)) {
+		ret = BTRFS_NO_LOG_SYNC;
+		goto end_no_trans;
+	}
+
 	start_log_trans(trans, root);
 
 	ret = btrfs_log_inode(trans, root, inode, inode_only);
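The tree-log changes above add a cheap fsync fast path: each log commit records the log transid it covered, each inode modification records the sub-transaction it happened in, and btrfs_log_inode_parent() can return BTRFS_NO_LOG_SYNC without touching the log when nothing changed since the last commit. The comparison, reduced to a self-contained sketch (field names follow the patch, but this is not the verbatim kernel source):

/* Sketch of the fsync fast-path test built by the hunks above. */
struct root_sketch  { unsigned long long log_transid, last_log_commit; };
struct inode_sketch { unsigned long long logged_trans, last_sub_trans; };

static int inode_in_log_sketch(const struct inode_sketch *ino,
			       const struct root_sketch *root,
			       unsigned long long transid)
{
	/* already logged in this transaction, and nothing changed since
	 * the last log commit: fsync may skip the log sync entirely */
	return ino->logged_trans == transid &&
	       ino->last_sub_trans <= root->last_log_commit;
}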
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index d09c7609e16b..0776eacb5083 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,6 +19,9 @@
 #ifndef __TREE_LOG_
 #define __TREE_LOG_
 
+/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
+#define BTRFS_NO_LOG_SYNC 256
+
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		   struct btrfs_root *root);
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 23e7d36ff325..7eda483d7b5a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 			goto error;
 
 		device->name = kstrdup(orig_dev->name, GFP_NOFS);
-		if (!device->name)
+		if (!device->name) {
+			kfree(device);
 			goto error;
+		}
 
 		device->devid = orig_dev->devid;
 		device->work.func = pending_bios_fn;
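The volumes.c fix closes a small leak: when the name duplication fails, the freshly allocated device used to be abandoned on the way to the error label. The shape of the fix in a runnable userspace form (names here are illustrative):

#include <stdlib.h>
#include <string.h>

struct dev_sketch { char *name; };

static struct dev_sketch *clone_dev(const char *orig_name)
{
	struct dev_sketch *dev = calloc(1, sizeof(*dev));

	if (!dev)
		return NULL;

	dev->name = strdup(orig_name);	/* stands in for kstrdup() */
	if (!dev->name) {
		free(dev);		/* the added kfree(device) */
		return NULL;
	}
	return dev;
}

int main(void)
{
	struct dev_sketch *d = clone_dev("sda");

	if (d) {
		free(d->name);
		free(d);
	}
	return 0;
}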
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a9d3bf4d2689..b6dd5967c48a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -260,7 +260,7 @@ err:
  * attributes are handled directly.
  */
 struct xattr_handler *btrfs_xattr_handlers[] = {
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
 	&btrfs_xattr_acl_access_handler,
 	&btrfs_xattr_acl_default_handler,
 #endif
diff --git a/fs/buffer.c b/fs/buffer.c
index 24afd7422ae8..6fa530256bfd 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -280,7 +280,7 @@ void invalidate_bdev(struct block_device *bdev)
 EXPORT_SYMBOL(invalidate_bdev);
 
 /*
- * Kick pdflush then try to free up some ZONE_NORMAL memory.
+ * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
  */
 static void free_more_memory(void)
 {
@@ -1709,9 +1709,9 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 		/*
 		 * If it's a fully non-blocking write attempt and we cannot
 		 * lock the buffer then redirty the page.  Note that this can
-		 * potentially cause a busy-wait loop from pdflush and kswapd
-		 * activity, but those code paths have their own higher-level
-		 * throttling.
+		 * potentially cause a busy-wait loop from writeback threads
+		 * and kswapd activity, but those code paths have their own
+		 * higher-level throttling.
 		 */
 		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
 			lock_buffer(bh);
@@ -3208,7 +3208,7 @@ EXPORT_SYMBOL(block_sync_page);
  * still running obsolete flush daemons, so we terminate them here.
  *
  * Use of bdflush() is deprecated and will be removed in a future kernel.
- * The `pdflush' kernel threads fully replace bdflush daemons and this call.
+ * The `flush-X' kernel threads fully replace bdflush daemons and this call.
  */
 SYSCALL_DEFINE2(bdflush, int, func, long, data)
 {
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 6994a0f54f02..80f352596807 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,7 @@ config CIFS
 	tristate "CIFS support  (advanced network filesystem, SMBFS successor)"
 	depends on INET
 	select NLS
+	select SLOW_WORK
 	help
 	  This is the client VFS module for the Common Internet File System
 	  (CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 90c5b39f0313..9a5e4f5f3122 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -64,9 +64,6 @@ unsigned int multiuser_mount = 0;
 unsigned int extended_security = CIFSSEC_DEF;
 /* unsigned int ntlmv2_support = 0; */
 unsigned int sign_CIFS_PDUs = 1;
-extern struct task_struct *oplockThread; /* remove sparse warning */
-struct task_struct *oplockThread = NULL;
-/* extern struct task_struct * dnotifyThread; remove sparse warning */
 static const struct super_operations cifs_super_ops;
 unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
 module_param(CIFSMaxBufSize, int, 0);
@@ -972,89 +969,12 @@ cifs_destroy_mids(void)
 	kmem_cache_destroy(cifs_oplock_cachep);
 }
 
-static int cifs_oplock_thread(void *dummyarg)
-{
-	struct oplock_q_entry *oplock_item;
-	struct cifsTconInfo *pTcon;
-	struct inode *inode;
-	__u16 netfid;
-	int rc, waitrc = 0;
-
-	set_freezable();
-	do {
-		if (try_to_freeze())
-			continue;
-
-		spin_lock(&cifs_oplock_lock);
-		if (list_empty(&cifs_oplock_list)) {
-			spin_unlock(&cifs_oplock_lock);
-			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(39*HZ);
-		} else {
-			oplock_item = list_entry(cifs_oplock_list.next,
-						struct oplock_q_entry, qhead);
-			cFYI(1, ("found oplock item to write out"));
-			pTcon = oplock_item->tcon;
-			inode = oplock_item->pinode;
-			netfid = oplock_item->netfid;
-			spin_unlock(&cifs_oplock_lock);
-			DeleteOplockQEntry(oplock_item);
-			/* can not grab inode sem here since it would
-				deadlock when oplock received on delete
-				since vfs_unlink holds the i_mutex across
-				the call */
-			/* mutex_lock(&inode->i_mutex);*/
-			if (S_ISREG(inode->i_mode)) {
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-				if (CIFS_I(inode)->clientCanCacheAll == 0)
-					break_lease(inode, FMODE_READ);
-				else if (CIFS_I(inode)->clientCanCacheRead == 0)
-					break_lease(inode, FMODE_WRITE);
-#endif
-				rc = filemap_fdatawrite(inode->i_mapping);
-				if (CIFS_I(inode)->clientCanCacheRead == 0) {
-					waitrc = filemap_fdatawait(
-							      inode->i_mapping);
-					invalidate_remote_inode(inode);
-				}
-				if (rc == 0)
-					rc = waitrc;
-			} else
-				rc = 0;
-			/* mutex_unlock(&inode->i_mutex);*/
-			if (rc)
-				CIFS_I(inode)->write_behind_rc = rc;
-			cFYI(1, ("Oplock flush inode %p rc %d",
-				inode, rc));
-
-			/* releasing stale oplock after recent reconnect
-			of smb session using a now incorrect file
-			handle is not a data integrity issue but do
-			not bother sending an oplock release if session
-			to server still is disconnected since oplock
-			already released by the server in that case */
-			if (!pTcon->need_reconnect) {
-				rc = CIFSSMBLock(0, pTcon, netfid,
-						0 /* len */ , 0 /* offset */, 0,
-						0, LOCKING_ANDX_OPLOCK_RELEASE,
-						false /* wait flag */);
-				cFYI(1, ("Oplock release rc = %d", rc));
-			}
-			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(1);  /* yield in case q were corrupt */
-		}
-	} while (!kthread_should_stop());
-
-	return 0;
-}
-
 static int __init
 init_cifs(void)
 {
 	int rc = 0;
 	cifs_proc_init();
 	INIT_LIST_HEAD(&cifs_tcp_ses_list);
-	INIT_LIST_HEAD(&cifs_oplock_list);
 #ifdef CONFIG_CIFS_EXPERIMENTAL
 	INIT_LIST_HEAD(&GlobalDnotifyReqList);
 	INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
@@ -1083,7 +1003,6 @@ init_cifs(void)
 	rwlock_init(&GlobalSMBSeslock);
 	rwlock_init(&cifs_tcp_ses_lock);
 	spin_lock_init(&GlobalMid_Lock);
-	spin_lock_init(&cifs_oplock_lock);
 
 	if (cifs_max_pending < 2) {
 		cifs_max_pending = 2;
@@ -1118,16 +1037,13 @@ init_cifs(void)
 	if (rc)
 		goto out_unregister_key_type;
 #endif
-	oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd");
-	if (IS_ERR(oplockThread)) {
-		rc = PTR_ERR(oplockThread);
-		cERROR(1, ("error %d create oplock thread", rc));
-		goto out_unregister_dfs_key_type;
-	}
+	rc = slow_work_register_user();
+	if (rc)
+		goto out_unregister_resolver_key;
 
 	return 0;
 
- out_unregister_dfs_key_type:
+ out_unregister_resolver_key:
 #ifdef CONFIG_CIFS_DFS_UPCALL
 	unregister_key_type(&key_type_dns_resolver);
  out_unregister_key_type:
@@ -1164,7 +1080,6 @@ exit_cifs(void)
 	cifs_destroy_inodecache();
 	cifs_destroy_mids();
 	cifs_destroy_request_bufs();
-	kthread_stop(oplockThread);
 }
 
 MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>");
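The CIFS side of this series retires the dedicated cifsoplockd kthread: each oplock break becomes a job on the shared slow-work pool, initialized per open file and queued when a break arrives. A sketch of the ops wiring, assuming the 2.6.32-era slow_work API; the real cifs_oplock_break_ops lives outside the hunks shown here, so the bodies below are illustrative:

#include <linux/slow-work.h>
#include "cifsglob.h"

/* get_ref/put_ref pin the cifsFileInfo while the job is queued */
static int oplock_break_get_sketch(struct slow_work *work)
{
	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
						  oplock_break);
	cifsFileInfo_get(cfile);
	return 0;
}

static void oplock_break_put_sketch(struct slow_work *work)
{
	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
						  oplock_break);
	cifsFileInfo_put(cfile);
}

static void oplock_break_execute_sketch(struct slow_work *work)
{
	/* flush dirty pages and release the oplock, as the removed
	 * cifs_oplock_thread() loop used to do inline */
}

const struct slow_work_ops cifs_oplock_break_ops_sketch = {
	.get_ref = oplock_break_get_sketch,
	.put_ref = oplock_break_put_sketch,
	.execute = oplock_break_execute_sketch,
};

/* queueing side, where a break arrives:
 *	slow_work_enqueue(&cfile->oplock_break);
 */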
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6cfc81a32703..5d0fde18039c 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -18,6 +18,7 @@
  */
 #include <linux/in.h>
 #include <linux/in6.h>
+#include <linux/slow-work.h>
 #include "cifs_fs_sb.h"
 #include "cifsacl.h"
 /*
@@ -346,14 +347,16 @@ struct cifsFileInfo {
 		/* lock scope id (0 if none) */
 	struct file *pfile; /* needed for writepage */
 	struct inode *pInode; /* needed for oplock break */
+	struct vfsmount *mnt;
 	struct mutex lock_mutex;
 	struct list_head llist; /* list of byte range locks we have. */
 	bool closePend:1;	/* file is marked to close */
 	bool invalidHandle:1;	/* file closed via session abend */
-	bool messageMode:1;	/* for pipes: message vs byte mode */
+	bool oplock_break_cancelled:1;
 	atomic_t count;		/* reference count */
 	struct mutex fh_mutex; /* prevents reopen race after dead ses*/
 	struct cifs_search_info srch_inf;
+	struct slow_work oplock_break; /* slow_work job for oplock breaks */
 };
 
 /* Take a reference on the file private data */
@@ -365,8 +368,10 @@ static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
 /* Release a reference on the file private data */
 static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 {
-	if (atomic_dec_and_test(&cifs_file->count))
+	if (atomic_dec_and_test(&cifs_file->count)) {
+		iput(cifs_file->pInode);
 		kfree(cifs_file);
+	}
 }
 
 /*
@@ -382,7 +387,6 @@ struct cifsInodeInfo {
 	unsigned long time;	/* jiffies of last update/check of inode */
 	bool clientCanCacheRead:1;	/* read oplock */
 	bool clientCanCacheAll:1;	/* read and writebehind oplock */
-	bool oplockPending:1;
 	bool delete_pending:1;	/* DELETE_ON_CLOSE is set */
 	u64 server_eof;		/* current file size on server */
 	u64 uniqueid;		/* server inode number */
@@ -585,9 +589,9 @@ require use of the stronger protocol */
 #define   CIFSSEC_MUST_LANMAN	0x10010
 #define   CIFSSEC_MUST_PLNTXT	0x20020
 #ifdef CONFIG_CIFS_UPCALL
-#define   CIFSSEC_MASK          0xAF0AF /* allows weak security but also krb5 */
+#define   CIFSSEC_MASK          0xBF0BF /* allows weak security but also krb5 */
 #else
-#define   CIFSSEC_MASK          0xA70A7 /* current flags supported if weak */
+#define   CIFSSEC_MASK          0xB70B7 /* current flags supported if weak */
 #endif /* UPCALL */
 #else /* do not allow weak pw hash */
 #ifdef CONFIG_CIFS_UPCALL
@@ -669,12 +673,6 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
  */
 GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
 
-/* Global list of oplocks */
-GLOBAL_EXTERN struct list_head cifs_oplock_list;
-
-/* Protects the cifs_oplock_list */
-GLOBAL_EXTERN spinlock_t cifs_oplock_lock;
-
 /* Outstanding dir notify requests */
 GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
 /* DirNotify response queue */
@@ -725,3 +723,4 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv;    /* min size of big ntwrk buf pool */
 GLOBAL_EXTERN unsigned int cifs_min_small;  /* min size of small buf pool */
 GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
 
+extern const struct slow_work_ops cifs_oplock_break_ops;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index da8fbf565991..5646727e33f5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -86,18 +86,17 @@ extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 			const int stage,
 			const struct nls_table *nls_cp);
 extern __u16 GetNextMid(struct TCP_Server_Info *server);
-extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16,
-						 struct cifsTconInfo *);
-extern void DeleteOplockQEntry(struct oplock_q_entry *);
-extern void DeleteTconOplockQEntries(struct cifsTconInfo *);
 extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
 extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
 				      int offset);
 
+extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode,
+				__u16 fileHandle, struct file *file,
+				struct vfsmount *mnt, unsigned int oflags);
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
-			   struct super_block *sb, int mode, int oflags,
-			   int *poplock, __u16 *pnetfid, int xid);
+			   struct vfsmount *mnt, int mode, int oflags,
+			   __u32 *poplock, __u16 *pnetfid, int xid);
 extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
 				     FILE_UNIX_BASIC_INFO *info,
 				     struct cifs_sb_info *cifs_sb);
@@ -389,4 +388,5 @@ extern int CIFSSMBSetPosixACL(const int xid, struct cifsTconInfo *tcon,
 		const struct nls_table *nls_codepage, int remap_special_chars);
 extern int CIFSGetExtAttr(const int xid, struct cifsTconInfo *tcon,
 			const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
+extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
 #endif			/* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 301e307e1279..941441d3e386 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -94,6 +94,7 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
 	list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
 		open_file = list_entry(tmp, struct cifsFileInfo, tlist);
 		open_file->invalidHandle = true;
+		open_file->oplock_break_cancelled = true;
 	}
 	write_unlock(&GlobalSMBSeslock);
 	/* BB Add call to invalidate_inodes(sb) for all superblocks mounted
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d49682433c20..63ea83ff687f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1577,7 +1577,8 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 
 out_err:
 	if (tcp_ses) {
-		kfree(tcp_ses->hostname);
+		if (!IS_ERR(tcp_ses->hostname))
+			kfree(tcp_ses->hostname);
 		if (tcp_ses->ssocket)
 			sock_release(tcp_ses->ssocket);
 		kfree(tcp_ses);
@@ -1670,7 +1671,6 @@ cifs_put_tcon(struct cifsTconInfo *tcon)
 	CIFSSMBTDis(xid, tcon);
 	_FreeXid(xid);
 
-	DeleteTconOplockQEntries(tcon);
 	tconInfoFree(tcon);
 	cifs_put_smb_ses(ses);
 }
@@ -2220,16 +2220,8 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon,
 		   struct cifs_sb_info *cifs_sb, const char *full_path)
 {
 	int rc;
-	__u64 inode_num;
 	FILE_ALL_INFO *pfile_info;
 
-	rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num,
-			cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags &
-				CIFS_MOUNT_MAP_SPECIAL_CHR);
-	if (rc != -EOPNOTSUPP)
-		return rc;
-
 	pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
 	if (pfile_info == NULL)
 		return -ENOMEM;
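The out_err change guards against a subtlety of the two-value pointer convention: tcp_ses->hostname can hold an ERR_PTR() from a failed setup step rather than NULL, and kfree() only tolerates NULL or a real allocation. A minimal sketch of the guard, assuming kernel context:

#include <linux/err.h>
#include <linux/slab.h>

/* Sketch: a pointer that may carry ERR_PTR(-E...) must never reach
 * kfree() in that state; check IS_ERR() first, as the hunk above does. */
static void release_hostname_sketch(char *hostname)
{
	if (!IS_ERR(hostname))
		kfree(hostname);
}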
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index a6424cfc0121..627a60a6c1b1 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -24,6 +24,7 @@
 #include <linux/stat.h>
 #include <linux/slab.h>
 #include <linux/namei.h>
+#include <linux/mount.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -129,44 +130,45 @@ cifs_bp_rename_retry:
 	return full_path;
 }
 
-static void
-cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
-		   struct cifsTconInfo *tcon, bool write_only)
+struct cifsFileInfo *
+cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle,
+		  struct file *file, struct vfsmount *mnt, unsigned int oflags)
 {
 	int oplock = 0;
 	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
 
 	pCifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
-
 	if (pCifsFile == NULL)
-		return;
+		return pCifsFile;
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
 
 	pCifsFile->netfid = fileHandle;
 	pCifsFile->pid = current->tgid;
-	pCifsFile->pInode = newinode;
+	pCifsFile->pInode = igrab(newinode);
+	pCifsFile->mnt = mnt;
+	pCifsFile->pfile = file;
 	pCifsFile->invalidHandle = false;
 	pCifsFile->closePend = false;
 	mutex_init(&pCifsFile->fh_mutex);
 	mutex_init(&pCifsFile->lock_mutex);
 	INIT_LIST_HEAD(&pCifsFile->llist);
 	atomic_set(&pCifsFile->count, 1);
+	slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops);
 
-	/* set the following in open now
-			pCifsFile->pfile = file; */
 	write_lock(&GlobalSMBSeslock);
-	list_add(&pCifsFile->tlist, &tcon->openFileList);
+	list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
 	pCifsInode = CIFS_I(newinode);
 	if (pCifsInode) {
 		/* if readable file instance put first in list*/
-		if (write_only)
+		if (oflags & FMODE_READ)
+			list_add(&pCifsFile->flist, &pCifsInode->openFileList);
+		else
 			list_add_tail(&pCifsFile->flist,
 				      &pCifsInode->openFileList);
-		else
-			list_add(&pCifsFile->flist, &pCifsInode->openFileList);
 
 		if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
 			pCifsInode->clientCanCacheAll = true;
@@ -176,18 +178,18 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
 			pCifsInode->clientCanCacheRead = true;
 	}
 	write_unlock(&GlobalSMBSeslock);
+
+	return pCifsFile;
 }
 
 int cifs_posix_open(char *full_path, struct inode **pinode,
-		    struct super_block *sb, int mode, int oflags,
-		    int *poplock, __u16 *pnetfid, int xid)
+		    struct vfsmount *mnt, int mode, int oflags,
+		    __u32 *poplock, __u16 *pnetfid, int xid)
 {
 	int rc;
-	__u32 oplock;
-	bool write_only = false;
 	FILE_UNIX_BASIC_INFO *presp_data;
 	__u32 posix_flags = 0;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb);
 	struct cifs_fattr fattr;
 
 	cFYI(1, ("posix open %s", full_path));
@@ -223,12 +225,9 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 	if (oflags & O_DIRECT)
 		posix_flags |= SMB_O_DIRECT;
 
-	if (!(oflags & FMODE_READ))
-		write_only = true;
-
 	mode &= ~current_umask();
 	rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
-			pnetfid, presp_data, &oplock, full_path,
+			pnetfid, presp_data, poplock, full_path,
 			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 	if (rc)
@@ -244,7 +243,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 
 	/* get new inode and set it up */
 	if (*pinode == NULL) {
-		*pinode = cifs_iget(sb, &fattr);
+		*pinode = cifs_iget(mnt->mnt_sb, &fattr);
 		if (!*pinode) {
 			rc = -ENOMEM;
 			goto posix_open_ret;
@@ -253,7 +252,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode,
 		cifs_fattr_to_inode(*pinode, &fattr);
 	}
 
-	cifs_fill_fileinfo(*pinode, *pnetfid, cifs_sb->tcon, write_only);
+	cifs_new_fileinfo(*pinode, *pnetfid, NULL, mnt, oflags);
 
 posix_open_ret:
 	kfree(presp_data);
@@ -280,7 +279,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	int rc = -ENOENT;
 	int xid;
 	int create_options = CREATE_NOT_DIR;
-	int oplock = 0;
+	__u32 oplock = 0;
 	int oflags;
 	bool posix_create = false;
 	/*
@@ -298,7 +297,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	FILE_ALL_INFO *buf = NULL;
 	struct inode *newinode = NULL;
 	int disposition = FILE_OVERWRITE_IF;
-	bool write_only = false;
 
 	xid = GetXid();
 
@@ -323,7 +321,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
 			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
-		rc = cifs_posix_open(full_path, &newinode, inode->i_sb,
+		rc = cifs_posix_open(full_path, &newinode, nd->path.mnt,
 				     mode, oflags, &oplock, &fileHandle, xid);
 		/* EIO could indicate that (posix open) operation is not
 		   supported, despite what server claimed in capability
@@ -351,11 +349,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	desiredAccess = 0;
 	if (oflags & FMODE_READ)
 		desiredAccess |= GENERIC_READ; /* is this too little? */
-	if (oflags & FMODE_WRITE) {
+	if (oflags & FMODE_WRITE)
 		desiredAccess |= GENERIC_WRITE;
-		if (!(oflags & FMODE_READ))
-			write_only = true;
-	}
 
 	if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
 		disposition = FILE_CREATE;
@@ -470,8 +465,8 @@ cifs_create_set_dentry:
 		/* mknod case - do not leave file open */
 		CIFSSMBClose(xid, tcon, fileHandle);
 	} else if (!(posix_create) && (newinode)) {
-		cifs_fill_fileinfo(newinode, fileHandle,
-					cifs_sb->tcon, write_only);
+		cifs_new_fileinfo(newinode, fileHandle, NULL,
+				  nd->path.mnt, oflags);
 	}
 cifs_create_out:
 	kfree(buf);
@@ -611,7 +606,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 {
 	int xid;
 	int rc = 0; /* to get around spurious gcc warning, set to zero here */
-	int oplock = 0;
+	__u32 oplock = 0;
 	__u16 fileHandle = 0;
 	bool posix_open = false;
 	struct cifs_sb_info *cifs_sb;
@@ -683,8 +678,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
 	if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) &&
 	     (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open &&
 	     (nd->intent.open.flags & O_CREAT)) {
-		rc = cifs_posix_open(full_path, &newInode,
-				parent_dir_inode->i_sb,
+		rc = cifs_posix_open(full_path, &newInode, nd->path.mnt,
 				nd->intent.open.create_mode,
 				nd->intent.open.flags, &oplock,
 				&fileHandle, xid);
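Worth noting how the dir.c and cifsglob.h hunks pair up: cifs_new_fileinfo() now takes its own inode reference with igrab(), and the matching iput() sits in cifsFileInfo_put(), so the open-file record keeps the inode alive until the last reference, including a queued oplock-break job, is dropped. The pairing, reduced to a sketch (kernel context; allocation-failure handling elided):

#include <linux/fs.h>
#include "cifsglob.h"

/* Sketch of the acquire side: the record pins the inode itself */
static struct cifsFileInfo *open_side_sketch(struct inode *inode)
{
	struct cifsFileInfo *cfile = kzalloc(sizeof(*cfile), GFP_KERNEL);

	cfile->pInode = igrab(inode);	/* record holds its own inode ref */
	atomic_set(&cfile->count, 1);
	return cfile;
}

/* Sketch of the release side, matching cifsFileInfo_put() above */
static void put_side_sketch(struct cifsFileInfo *cfile)
{
	if (atomic_dec_and_test(&cfile->count)) {
		iput(cfile->pInode);	/* drop the igrab() reference ... */
		kfree(cfile);		/* ... only at the final put */
	}
}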
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index fa7beac8b80e..429337eb7afe 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -30,6 +30,7 @@
 #include <linux/writeback.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/delay.h>
+#include <linux/mount.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
@@ -39,27 +40,6 @@
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
 
-static inline struct cifsFileInfo *cifs_init_private(
-	struct cifsFileInfo *private_data, struct inode *inode,
-	struct file *file, __u16 netfid)
-{
-	memset(private_data, 0, sizeof(struct cifsFileInfo));
-	private_data->netfid = netfid;
-	private_data->pid = current->tgid;
-	mutex_init(&private_data->fh_mutex);
-	mutex_init(&private_data->lock_mutex);
-	INIT_LIST_HEAD(&private_data->llist);
-	private_data->pfile = file; /* needed for writepage */
-	private_data->pInode = inode;
-	private_data->invalidHandle = false;
-	private_data->closePend = false;
-	/* Initialize reference count to one. The private data is
-	freed on the release of the last reference */
-	atomic_set(&private_data->count, 1);
-
-	return private_data;
-}
-
 static inline int cifs_convert_flags(unsigned int flags)
 {
 	if ((flags & O_ACCMODE) == O_RDONLY)
@@ -123,9 +103,11 @@ static inline int cifs_get_disposition(unsigned int flags)
 }
 
 /* all arguments to this function must be checked for validity in caller */
-static inline int cifs_posix_open_inode_helper(struct inode *inode,
-	struct file *file, struct cifsInodeInfo *pCifsInode,
-	struct cifsFileInfo *pCifsFile, int oplock, u16 netfid)
+static inline int
+cifs_posix_open_inode_helper(struct inode *inode, struct file *file,
+			     struct cifsInodeInfo *pCifsInode,
+			     struct cifsFileInfo *pCifsFile, __u32 oplock,
+			     u16 netfid)
 {
 
 	write_lock(&GlobalSMBSeslock);
@@ -219,17 +201,6 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 	struct timespec temp;
 	int rc;
 
-	/* want handles we can use to read with first
-	   in the list so we do not have to walk the
-	   list to search for one in write_begin */
-	if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
-		list_add_tail(&pCifsFile->flist,
-			      &pCifsInode->openFileList);
-	} else {
-		list_add(&pCifsFile->flist,
-			 &pCifsInode->openFileList);
-	}
-	write_unlock(&GlobalSMBSeslock);
 	if (pCifsInode->clientCanCacheRead) {
 		/* we have the inode open somewhere else
 		   no need to discard cache data */
@@ -279,7 +250,8 @@ client_can_cache:
 int cifs_open(struct inode *inode, struct file *file)
 {
 	int rc = -EACCES;
-	int xid, oplock;
+	int xid;
+	__u32 oplock;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *tcon;
 	struct cifsFileInfo *pCifsFile;
@@ -324,7 +296,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	    le64_to_cpu(tcon->fsUnixInfo.Capability))) {
 		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
 		/* can not refresh inode info since size could be stale */
-		rc = cifs_posix_open(full_path, &inode, inode->i_sb,
+		rc = cifs_posix_open(full_path, &inode, file->f_path.mnt,
 				cifs_sb->mnt_file_mode /* ignored */,
 				oflags, &oplock, &netfid, xid);
 		if (rc == 0) {
@@ -414,24 +386,17 @@ int cifs_open(struct inode *inode, struct file *file)
 		cFYI(1, ("cifs_open returned 0x%x", rc));
 		goto out;
 	}
-	file->private_data =
-		kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+
+	pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt,
+				      file->f_flags);
+	file->private_data = pCifsFile;
 	if (file->private_data == NULL) {
 		rc = -ENOMEM;
 		goto out;
 	}
-	pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
-	write_lock(&GlobalSMBSeslock);
-	list_add(&pCifsFile->tlist, &tcon->openFileList);
 
-	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
-	if (pCifsInode) {
-		rc = cifs_open_inode_helper(inode, file, pCifsInode,
-					    pCifsFile, tcon,
-					    &oplock, buf, full_path, xid);
-	} else {
-		write_unlock(&GlobalSMBSeslock);
-	}
+	rc = cifs_open_inode_helper(inode, file, pCifsInode, pCifsFile, tcon,
+				    &oplock, buf, full_path, xid);
 
 	if (oplock & CIFS_CREATE_ACTION) {
 		/* time to set mode which we can not set earlier due to
@@ -474,7 +439,8 @@ static int cifs_relock_file(struct cifsFileInfo *cifsFile)
 static int cifs_reopen_file(struct file *file, bool can_flush)
 {
 	int rc = -EACCES;
-	int xid, oplock;
+	int xid;
+	__u32 oplock;
 	struct cifs_sb_info *cifs_sb;
 	struct cifsTconInfo *tcon;
 	struct cifsFileInfo *pCifsFile;
@@ -543,7 +509,7 @@ reopen_error_exit:
 	    le64_to_cpu(tcon->fsUnixInfo.Capability))) {
 		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
 		/* can not refresh inode info since size could be stale */
-		rc = cifs_posix_open(full_path, NULL, inode->i_sb,
+		rc = cifs_posix_open(full_path, NULL, file->f_path.mnt,
 				cifs_sb->mnt_file_mode /* ignored */,
 				oflags, &oplock, &netfid, xid);
 		if (rc == 0) {
@@ -2308,6 +2274,73 @@ out:
 	return rc;
 }
 
+static void
+cifs_oplock_break(struct slow_work *work)
+{
+	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
+						  oplock_break);
+	struct inode *inode = cfile->pInode;
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->mnt->mnt_sb);
+	int rc, waitrc = 0;
+
+	if (inode && S_ISREG(inode->i_mode)) {
+#ifdef CONFIG_CIFS_EXPERIMENTAL
+		if (cinode->clientCanCacheAll == 0)
+			break_lease(inode, FMODE_READ);
+		else if (cinode->clientCanCacheRead == 0)
+			break_lease(inode, FMODE_WRITE);
+#endif
+		rc = filemap_fdatawrite(inode->i_mapping);
+		if (cinode->clientCanCacheRead == 0) {
+			waitrc = filemap_fdatawait(inode->i_mapping);
+			invalidate_remote_inode(inode);
+		}
+		if (!rc)
+			rc = waitrc;
+		if (rc)
+			cinode->write_behind_rc = rc;
+		cFYI(1, ("Oplock flush inode %p rc %d", inode, rc));
+	}
+
+	/*
+	 * releasing stale oplock after recent reconnect of smb session using
+	 * a now incorrect file handle is not a data integrity issue but do
+	 * not bother sending an oplock release if session to server still is
+	 * disconnected since oplock already released by the server
+	 */
+	if (!cfile->closePend && !cfile->oplock_break_cancelled) {
+		rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0,
+				 LOCKING_ANDX_OPLOCK_RELEASE, false);
+		cFYI(1, ("Oplock release rc = %d", rc));
+	}
+}
+
+static int
+cifs_oplock_break_get(struct slow_work *work)
+{
+	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
+						  oplock_break);
+	mntget(cfile->mnt);
+	cifsFileInfo_get(cfile);
+	return 0;
+}
+
+static void
+cifs_oplock_break_put(struct slow_work *work)
+{
+	struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
+						  oplock_break);
+	mntput(cfile->mnt);
+	cifsFileInfo_put(cfile);
+}
+
+const struct slow_work_ops cifs_oplock_break_ops = {
+	.get_ref	= cifs_oplock_break_get,
+	.put_ref	= cifs_oplock_break_put,
+	.execute	= cifs_oplock_break,
+};
+
 const struct address_space_operations cifs_addr_ops = {
 	.readpage = cifs_readpage,
 	.readpages = cifs_readpages,
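
The three handlers just added follow the slow-work contract of this kernel generation: get_ref/put_ref pin whatever a queued item depends on (here the cifsFileInfo and its vfsmount), and execute later runs in process context where it may sleep. A compile-style sketch of the same pattern for a hypothetical item type, assuming only the slow-work API the hunks above already use:

	#include <linux/slow-work.h>

	struct my_item {
		atomic_t count;			/* hypothetical refcount */
		struct slow_work work;		/* embedded work item */
	};

	static int my_get_ref(struct slow_work *w)
	{
		atomic_inc(&container_of(w, struct my_item, work)->count);
		return 0;			/* nonzero would abort the enqueue */
	}

	static void my_put_ref(struct slow_work *w)
	{
		atomic_dec(&container_of(w, struct my_item, work)->count);
	}

	static void my_execute(struct slow_work *w)
	{
		/* runs later in process context; may sleep, do I/O, take mutexes */
	}

	static const struct slow_work_ops my_ops = {
		.get_ref = my_get_ref,
		.put_ref = my_put_ref,
		.execute = my_execute,
	};

	/* setup:  slow_work_init(&item->work, &my_ops);
	 * enqueue: if (slow_work_enqueue(&item->work)) handle the -errno */
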
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 5e2492535daa..cababd8a52df 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -512,13 +512,10 @@ int cifs_get_inode_info(struct inode **pinode,
 						cifs_sb->local_nls,
 						cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
-			if (rc1) {
+			if (rc1 || !fattr.cf_uniqueid) {
 				cFYI(1, ("GetSrvInodeNum rc %d", rc1));
 				fattr.cf_uniqueid = iunique(sb, ROOT_I);
-				/* disable serverino if call not supported */
-				if (rc1 == -EINVAL)
-					cifs_sb->mnt_cifs_flags &=
-						~CIFS_MOUNT_SERVER_INUM;
+				cifs_autodisable_serverino(cifs_sb);
 			}
 		} else {
 			fattr.cf_uniqueid = iunique(sb, ROOT_I);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index e079a9190ec4..d27d4ec6579b 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -32,7 +32,6 @@
 
 extern mempool_t *cifs_sm_req_poolp;
 extern mempool_t *cifs_req_poolp;
-extern struct task_struct *oplockThread;
 
 /* The xid serves as a useful identifier for each incoming vfs request,
    in a similar way to the mid which is useful to track each sent smb,
@@ -500,6 +499,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 	struct cifsTconInfo *tcon;
 	struct cifsInodeInfo *pCifsInode;
 	struct cifsFileInfo *netfile;
+	int rc;
 
 	cFYI(1, ("Checking for oplock break or dnotify response"));
 	if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) &&
@@ -562,30 +562,40 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
 				continue;
 
 			cifs_stats_inc(&tcon->num_oplock_brks);
-			write_lock(&GlobalSMBSeslock);
+			read_lock(&GlobalSMBSeslock);
 			list_for_each(tmp2, &tcon->openFileList) {
 				netfile = list_entry(tmp2, struct cifsFileInfo,
						     tlist);
 				if (pSMB->Fid != netfile->netfid)
 					continue;
 
-				write_unlock(&GlobalSMBSeslock);
-				read_unlock(&cifs_tcp_ses_lock);
+				/*
+				 * don't do anything if file is about to be
+				 * closed anyway.
+				 */
+				if (netfile->closePend) {
+					read_unlock(&GlobalSMBSeslock);
+					read_unlock(&cifs_tcp_ses_lock);
+					return true;
+				}
+
 				cFYI(1, ("file id match, oplock break"));
 				pCifsInode = CIFS_I(netfile->pInode);
 				pCifsInode->clientCanCacheAll = false;
 				if (pSMB->OplockLevel == 0)
 					pCifsInode->clientCanCacheRead = false;
-				pCifsInode->oplockPending = true;
-				AllocOplockQEntry(netfile->pInode,
-						  netfile->netfid, tcon);
-				cFYI(1, ("about to wake up oplock thread"));
-				if (oplockThread)
-					wake_up_process(oplockThread);
-
+				rc = slow_work_enqueue(&netfile->oplock_break);
+				if (rc) {
+					cERROR(1, ("failed to enqueue oplock "
+						   "break: %d\n", rc));
+				} else {
+					netfile->oplock_break_cancelled = false;
+				}
+				read_unlock(&GlobalSMBSeslock);
+				read_unlock(&cifs_tcp_ses_lock);
 				return true;
 			}
-			write_unlock(&GlobalSMBSeslock);
+			read_unlock(&GlobalSMBSeslock);
 			read_unlock(&cifs_tcp_ses_lock);
 			cFYI(1, ("No matching file for oplock break"));
 			return true;
@@ -705,3 +715,17 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
 ctoUCS_out:
 	return i;
 }
+
+void
+cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb)
+{
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+		cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
+		cERROR(1, ("Autodisabling the use of server inode numbers on "
+			   "%s. This server doesn't seem to support them "
+			   "properly. Hardlinks will not be recognized on this "
+			   "mount. Consider mounting with the \"noserverino\" "
+			   "option to silence this message.",
+			   cifs_sb->tcon->treeName));
+	}
+}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f823a4a208a7..f84062f9a985 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -146,7 +146,7 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
 	}
 }
 
-void
+static void
 cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
 		       struct cifs_sb_info *cifs_sb)
 {
@@ -161,7 +161,7 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
 	cifs_fill_common_info(fattr, cifs_sb);
 }
 
-void
+static void
 cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
 		       struct cifs_sb_info *cifs_sb)
 {
@@ -727,11 +727,12 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir,
 		cifs_dir_info_to_fattr(&fattr, (FILE_DIRECTORY_INFO *)
 				       pfindEntry, cifs_sb);
 
-	/* FIXME: make _to_fattr functions fill this out */
-	if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
+	if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
 		fattr.cf_uniqueid = inum;
-	else
+	} else {
 		fattr.cf_uniqueid = iunique(sb, ROOT_I);
+		cifs_autodisable_serverino(cifs_sb);
+	}
 
 	ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
 	tmp_dentry = cifs_readdir_lookup(file->f_dentry, &qstring, &fattr);
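
cifs_get_inode_info() and cifs_filldir() now apply one shared policy for unique inode numbers: trust a server-provided id only while CIFS_MOUNT_SERVER_INUM is set, otherwise generate one locally and disable server inode numbers for the rest of the mount. Condensed as a sketch (not the literal kernel code):

	if (inum && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
		fattr.cf_uniqueid = inum;			/* server-supplied */
	} else {
		fattr.cf_uniqueid = iunique(sb, ROOT_I);	/* locally generated */
		cifs_autodisable_serverino(cifs_sb);		/* warn once, clear flag */
	}
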
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 1da4ab250eae..07b8e71544ee 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -103,56 +103,6 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
 	mempool_free(midEntry, cifs_mid_poolp);
 }
 
-struct oplock_q_entry *
-AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon)
-{
-	struct oplock_q_entry *temp;
-	if ((pinode == NULL) || (tcon == NULL)) {
-		cERROR(1, ("Null parms passed to AllocOplockQEntry"));
-		return NULL;
-	}
-	temp = (struct oplock_q_entry *) kmem_cache_alloc(cifs_oplock_cachep,
-							  GFP_KERNEL);
-	if (temp == NULL)
-		return temp;
-	else {
-		temp->pinode = pinode;
-		temp->tcon = tcon;
-		temp->netfid = fid;
-		spin_lock(&cifs_oplock_lock);
-		list_add_tail(&temp->qhead, &cifs_oplock_list);
-		spin_unlock(&cifs_oplock_lock);
-	}
-	return temp;
-}
-
-void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry)
-{
-	spin_lock(&cifs_oplock_lock);
-	/* should we check if list empty first? */
-	list_del(&oplockEntry->qhead);
-	spin_unlock(&cifs_oplock_lock);
-	kmem_cache_free(cifs_oplock_cachep, oplockEntry);
-}
-
-
-void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
-{
-	struct oplock_q_entry *temp;
-
-	if (tcon == NULL)
-		return;
-
-	spin_lock(&cifs_oplock_lock);
-	list_for_each_entry(temp, &cifs_oplock_list, qhead) {
-		if ((temp->tcon) && (temp->tcon == tcon)) {
-			list_del(&temp->qhead);
-			kmem_cache_free(cifs_oplock_cachep, temp);
-		}
-	}
-	spin_unlock(&cifs_oplock_lock);
-}
-
 static int
 smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 {
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 0376ac66c44a..be4392ca2098 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -22,6 +22,7 @@
 #include <linux/kernel.h>
 #include <linux/major.h>
 #include <linux/time.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/ioport.h>
 #include <linux/fcntl.h>
diff --git a/fs/compat.c b/fs/compat.c
index d576b552e8e2..6c19040ffeef 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1532,6 +1532,8 @@ int compat_do_execve(char * filename,
 	if (retval < 0)
 		goto out;
 
+	current->stack_start = current->mm->start_stack;
+
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index f91fd51b32e3..d84e7058c298 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1800,7 +1800,7 @@ struct space_resv_32 {
 /* just account for different alignment */
 static int compat_ioctl_preallocate(struct file *file, unsigned long arg)
 {
-	struct space_resv_32 __user *p32 = (void __user *)arg;
+	struct space_resv_32 __user *p32 = compat_ptr(arg);
 	struct space_resv __user *p = compat_alloc_user_space(sizeof(*p));
 
 	if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) ||
@@ -2802,7 +2802,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 #else
 	case FS_IOC_RESVSP:
 	case FS_IOC_RESVSP64:
-		error = ioctl_preallocate(filp, (void __user *)arg);
+		error = ioctl_preallocate(filp, compat_ptr(arg));
 		goto out_fput;
 #endif
 
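
compat_ptr() is the correct way to turn a 32-bit user pointer from a compat syscall into a void __user *: on most 64-bit architectures it is a plain zero-extension, but on s390 it also masks the top bit, which a raw (void __user *)arg cast silently skips. The idiom, sketched in a hypothetical compat ioctl handler:

	#include <linux/compat.h>

	static long my_compat_ioctl(struct file *filp, unsigned int cmd,
				    unsigned long arg)
	{
		void __user *argp = compat_ptr(arg);	/* not (void __user *)arg */

		return my_ioctl(filp, cmd, argp);	/* hypothetical native handler */
	}
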
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 240cef14fe58..70736eb4b516 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -316,6 +316,10 @@ int dlm_lowcomms_connect_node(int nodeid)
 {
 	struct connection *con;
 
+	/* with sctp there's no connecting without sending */
+	if (dlm_config.ci_protocol != 0)
+		return 0;
+
 	if (nodeid == dlm_our_nodeid())
 		return 0;
 
@@ -455,9 +459,9 @@ static void process_sctp_notification(struct connection *con,
 	int prim_len, ret;
 	int addr_len;
 	struct connection *new_con;
-	struct file *file;
 	sctp_peeloff_arg_t parg;
 	int parglen = sizeof(parg);
+	int err;
 
 	/*
 	 * We get this before any data for an association.
@@ -512,19 +516,22 @@ static void process_sctp_notification(struct connection *con,
 		ret = kernel_getsockopt(con->sock, IPPROTO_SCTP,
 					SCTP_SOCKOPT_PEELOFF,
 					(void *)&parg, &parglen);
-		if (ret) {
+		if (ret < 0) {
 			log_print("Can't peel off a socket for "
-				  "connection %d to node %d: err=%d\n",
+				  "connection %d to node %d: err=%d",
 				  parg.associd, nodeid, ret);
+			return;
+		}
+		new_con->sock = sockfd_lookup(parg.sd, &err);
+		if (!new_con->sock) {
+			log_print("sockfd_lookup error %d", err);
+			return;
 		}
-		file = fget(parg.sd);
-		new_con->sock = SOCKET_I(file->f_dentry->d_inode);
 		add_sock(new_con->sock, new_con);
-		fput(file);
-		put_unused_fd(parg.sd);
+		sockfd_put(new_con->sock);
 
-		log_print("got new/restarted association %d nodeid %d",
-			  (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+		log_print("connecting to %d sctp association %d",
+			  nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
 
 		/* Send any pending writes */
 		clear_bit(CF_CONNECT_PENDING, &new_con->flags);
@@ -837,8 +844,6 @@ static void sctp_init_assoc(struct connection *con)
 	if (con->retries++ > MAX_CONNECT_RETRIES)
 		return;
 
-	log_print("Initiating association with node %d", con->nodeid);
-
 	if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) {
 		log_print("no address for nodeid %d", con->nodeid);
 		return;
@@ -855,11 +860,14 @@
 	outmessage.msg_flags = MSG_EOR;
 
 	spin_lock(&con->writequeue_lock);
-	e = list_entry(con->writequeue.next, struct writequeue_entry,
-		       list);
 
-	BUG_ON((struct list_head *) e == &con->writequeue);
+	if (list_empty(&con->writequeue)) {
+		spin_unlock(&con->writequeue_lock);
+		log_print("writequeue empty for nodeid %d", con->nodeid);
+		return;
+	}
 
+	e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
 	len = e->len;
 	offset = e->offset;
 	spin_unlock(&con->writequeue_lock);
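
The peeloff fix above swaps a hand-rolled fget()/SOCKET_I()/put_unused_fd() sequence for the socket layer's own fd helpers, which also stops the error path from leaking the peeled-off descriptor. The general shape of the pattern, as a sketch:

	#include <linux/net.h>

	int err;
	struct socket *sock;

	sock = sockfd_lookup(fd, &err);	/* takes a reference on the backing file */
	if (!sock)
		return err;		/* err holds the -errno from the lookup */
	/* ... hand the socket over to its new owner ... */
	sockfd_put(sock);		/* drops the reference taken by the lookup */
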
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 0c754e64232b..1cd6d9d3e29a 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,6 +1,9 @@
 config ECRYPT_FS
 	tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && KEYS && CRYPTO && NET
+	depends on EXPERIMENTAL && KEYS && CRYPTO
+	select CRYPTO_ECB
+	select CRYPTO_CBC
+	select CRYPTO_MD5
 	help
 	  Encrypted filesystem that operates on the VFS layer.  See
 	  <file:Documentation/filesystems/ecryptfs.txt> to learn more about
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index b91851f1cda3..fbb6e5eed697 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -245,13 +245,11 @@ void ecryptfs_destroy_crypt_stat(struct ecryptfs_crypt_stat *crypt_stat)
 		crypto_free_blkcipher(crypt_stat->tfm);
 	if (crypt_stat->hash_tfm)
 		crypto_free_hash(crypt_stat->hash_tfm);
-	mutex_lock(&crypt_stat->keysig_list_mutex);
 	list_for_each_entry_safe(key_sig, key_sig_tmp,
 				 &crypt_stat->keysig_list, crypt_stat_list) {
 		list_del(&key_sig->crypt_stat_list);
 		kmem_cache_free(ecryptfs_key_sig_cache, key_sig);
 	}
-	mutex_unlock(&crypt_stat->keysig_list_mutex);
 	memset(crypt_stat, 0, sizeof(struct ecryptfs_crypt_stat));
 }
 
@@ -511,13 +509,14 @@ int ecryptfs_encrypt_page(struct page *page)
 					     + extent_offset), crypt_stat);
 		rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt,
 					  offset, crypt_stat->extent_size);
-		if (rc) {
+		if (rc < 0) {
 			ecryptfs_printk(KERN_ERR, "Error attempting "
 					"to write lower page; rc = [%d]"
 					"\n", rc);
 			goto out;
 		}
 	}
+	rc = 0;
 out:
 	if (enc_extent_page) {
 		kunmap(enc_extent_page);
@@ -633,7 +632,7 @@ int ecryptfs_decrypt_page(struct page *page)
 		rc = ecryptfs_read_lower(enc_extent_virt, offset,
 					 crypt_stat->extent_size,
 					 ecryptfs_inode);
-		if (rc) {
+		if (rc < 0) {
 			ecryptfs_printk(KERN_ERR, "Error attempting "
 					"to read lower page; rc = [%d]"
 					"\n", rc);
@@ -797,6 +796,7 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat)
 	kfree(full_alg_name);
 	if (IS_ERR(crypt_stat->tfm)) {
 		rc = PTR_ERR(crypt_stat->tfm);
+		crypt_stat->tfm = NULL;
 		ecryptfs_printk(KERN_ERR, "cryptfs: init_crypt_ctx(): "
 				"Error initializing cipher [%s]\n",
 				crypt_stat->cipher);
@@ -925,7 +925,9 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
 	struct ecryptfs_global_auth_tok *global_auth_tok;
 	int rc = 0;
 
+	mutex_lock(&crypt_stat->keysig_list_mutex);
 	mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
+
 	list_for_each_entry(global_auth_tok,
 			    &mount_crypt_stat->global_auth_tok_list,
 			    mount_crypt_stat_list) {
@@ -934,13 +936,13 @@
 		rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig);
 		if (rc) {
 			printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc);
-			mutex_unlock(
-				&mount_crypt_stat->global_auth_tok_list_mutex);
 			goto out;
 		}
 	}
-	mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+
 out:
+	mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+	mutex_unlock(&crypt_stat->keysig_list_mutex);
 	return rc;
 }
 
@@ -1212,14 +1214,15 @@ int ecryptfs_read_and_validate_header_region(char *data,
 	crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
 	rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
 				 ecryptfs_inode);
-	if (rc) {
+	if (rc < 0) {
 		printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n",
 		       __func__, rc);
 		goto out;
 	}
 	if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
 		rc = -EINVAL;
-	}
+	} else
+		rc = 0;
 out:
 	return rc;
 }
@@ -1314,10 +1317,11 @@ ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry,
 
 	rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt,
 				  0, virt_len);
-	if (rc)
+	if (rc < 0)
 		printk(KERN_ERR "%s: Error attempting to write header "
-		       "information to lower file; rc = [%d]\n", __func__,
-		       rc);
+		       "information to lower file; rc = [%d]\n", __func__, rc);
+	else
+		rc = 0;
 	return rc;
 }
 
@@ -1597,7 +1601,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
 	}
 	rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size,
 				 ecryptfs_inode);
-	if (!rc)
+	if (rc >= 0)
 		rc = ecryptfs_read_headers_virt(page_virt, crypt_stat,
 						ecryptfs_dentry,
 						ECRYPTFS_VALIDATE_HEADER_SIZE);
@@ -1702,7 +1706,7 @@ ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
 	} else {
 		printk(KERN_ERR "%s: No support for requested filename "
 		       "encryption method in this release\n", __func__);
-		rc = -ENOTSUPP;
+		rc = -EOPNOTSUPP;
 		goto out;
 	}
 out:
@@ -1763,7 +1767,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
 	if (IS_ERR(*key_tfm)) {
 		rc = PTR_ERR(*key_tfm);
 		printk(KERN_ERR "Unable to allocate crypto cipher with name "
-		       "[%s]; rc = [%d]\n", cipher_name, rc);
+		       "[%s]; rc = [%d]\n", full_alg_name, rc);
 		goto out;
 	}
 	crypto_blkcipher_set_flags(*key_tfm, CRYPTO_TFM_REQ_WEAK_KEY);
@@ -1776,7 +1780,8 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
 	rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
 	if (rc) {
 		printk(KERN_ERR "Error attempting to set key of size [%zd] for "
-		       "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc);
+		       "cipher [%s]; rc = [%d]\n", *key_size, full_alg_name,
+		       rc);
 		rc = -EINVAL;
 		goto out;
 	}
@@ -2166,7 +2171,7 @@ int ecryptfs_encrypt_and_encode_filename(
 		(*encoded_name)[(*encoded_name_size)] = '\0';
 		(*encoded_name_size)++;
 	} else {
-		rc = -ENOTSUPP;
+		rc = -EOPNOTSUPP;
 	}
 	if (rc) {
 		printk(KERN_ERR "%s: Error attempting to encode "
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 2f0945d63297..056fed62d0de 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -476,6 +476,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
 	struct dentry *lower_dir_dentry;
 
+	dget(lower_dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
 	rc = vfs_unlink(lower_dir_inode, lower_dentry);
 	if (rc) {
@@ -489,6 +490,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
 	d_drop(dentry);
 out_unlock:
 	unlock_dir(lower_dir_dentry);
+	dput(lower_dentry);
 	return rc;
 }
 
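
The dget()/dput() pair added to ecryptfs_unlink() pins the lower dentry for the whole lower-filesystem operation, so vfs_unlink() cannot drop the last reference while eCryptfs is still using it. The idiom, isolated as a sketch:

	dget(lower_dentry);		/* extra reference across the lower op */
	lower_dir_dentry = lock_parent(lower_dentry);
	rc = vfs_unlink(lower_dir_inode, lower_dentry);
	/* ... error handling, attribute copy, d_drop() ... */
	unlock_dir(lower_dir_dentry);
	dput(lower_dentry);		/* matches the dget() above */
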
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 259525c9abb8..a0a7847567e9 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -416,7 +416,9 @@ ecryptfs_find_global_auth_tok_for_sig(
 			    &mount_crypt_stat->global_auth_tok_list,
 			    mount_crypt_stat_list) {
 		if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
-			(*global_auth_tok) = walker;
+			rc = key_validate(walker->global_auth_tok_key);
+			if (!rc)
+				(*global_auth_tok) = walker;
 			goto out;
 		}
 	}
@@ -612,7 +614,12 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 	}
 	/* TODO: Support other key modules than passphrase for
 	 * filename encryption */
-	BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
+	if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
+		rc = -EOPNOTSUPP;
+		printk(KERN_INFO "%s: Filename encryption only supports "
+		       "password tokens\n", __func__);
+		goto out_free_unlock;
+	}
 	sg_init_one(
 		&s->hash_sg,
 		(u8 *)s->auth_tok->token.password.session_key_encryption_key,
@@ -910,7 +917,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 	}
 	/* TODO: Support other key modules than passphrase for
 	 * filename encryption */
-	BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
+	if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
+		rc = -EOPNOTSUPP;
+		printk(KERN_INFO "%s: Filename encryption only supports "
+		       "password tokens\n", __func__);
+		goto out_free_unlock;
+	}
 	rc = crypto_blkcipher_setkey(
 		s->desc.tfm,
 		s->auth_tok->token.password.session_key_encryption_key,
@@ -1316,8 +1328,10 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
 		rc = -EINVAL;
 		goto out_free;
 	}
-	ecryptfs_cipher_code_to_string(crypt_stat->cipher,
-				       (u16)data[(*packet_size)]);
+	rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher,
+					    (u16)data[(*packet_size)]);
+	if (rc)
+		goto out_free;
 	/* A little extra work to differentiate among the AES key
 	 * sizes; see RFC2440 */
 	switch(data[(*packet_size)++]) {
@@ -1328,7 +1342,9 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat,
 		crypt_stat->key_size =
 			(*new_auth_tok)->session_key.encrypted_key_size;
 	}
-	ecryptfs_init_crypt_ctx(crypt_stat);
+	rc = ecryptfs_init_crypt_ctx(crypt_stat);
+	if (rc)
+		goto out_free;
 	if (unlikely(data[(*packet_size)++] != 0x03)) {
 		printk(KERN_WARNING "Only S2K ID 3 is currently supported\n");
 		rc = -ENOSYS;
@@ -2366,21 +2382,18 @@ struct kmem_cache *ecryptfs_key_sig_cache;
 int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig)
 {
 	struct ecryptfs_key_sig *new_key_sig;
-	int rc = 0;
 
 	new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL);
 	if (!new_key_sig) {
-		rc = -ENOMEM;
 		printk(KERN_ERR
 		       "Error allocating from ecryptfs_key_sig_cache\n");
-		goto out;
+		return -ENOMEM;
 	}
 	memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX);
-	mutex_lock(&crypt_stat->keysig_list_mutex);
+	/* Caller must hold keysig_list_mutex */
 	list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list);
-	mutex_unlock(&crypt_stat->keysig_list_mutex);
-out:
-	return rc;
+
+	return 0;
 }
 
 struct kmem_cache *ecryptfs_global_auth_tok_cache;
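
With this change ecryptfs_add_keysig() no longer takes keysig_list_mutex itself; the caller owns the locking, which is what lets ecryptfs_copy_mount_wide_sigs_to_inode_sigs() (in the crypto.c hunks above) establish one consistent lock order. Caller-side sketch of the contract:

	mutex_lock(&crypt_stat->keysig_list_mutex);	/* outer lock, taken first */
	mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
	/* ecryptfs_add_keysig() may be called in here: it only allocates and
	 * list_add()s, assuming keysig_list_mutex is already held */
	mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
	mutex_unlock(&crypt_stat->keysig_list_mutex);
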
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index c6d7a4d748a0..e14cf7e588db 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -136,6 +136,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
 				const struct cred *cred)
 {
 	struct ecryptfs_open_req *req;
+	int flags = O_LARGEFILE;
 	int rc = 0;
 
 	/* Corresponding dput() and mntput() are done when the
@@ -143,10 +144,14 @@
 	 * destroyed. */
 	dget(lower_dentry);
 	mntget(lower_mnt);
-	(*lower_file) = dentry_open(lower_dentry, lower_mnt,
-				    (O_RDWR | O_LARGEFILE), cred);
+	flags |= IS_RDONLY(lower_dentry->d_inode) ? O_RDONLY : O_RDWR;
+	(*lower_file) = dentry_open(lower_dentry, lower_mnt, flags, cred);
 	if (!IS_ERR(*lower_file))
 		goto out;
+	if (flags & O_RDONLY) {
+		rc = PTR_ERR((*lower_file));
+		goto out;
+	}
 	req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL);
 	if (!req) {
 		rc = -ENOMEM;
@@ -180,21 +185,8 @@
 		       __func__);
 		goto out_unlock;
 	}
-	if (IS_ERR(*req->lower_file)) {
+	if (IS_ERR(*req->lower_file))
 		rc = PTR_ERR(*req->lower_file);
-		dget(lower_dentry);
-		mntget(lower_mnt);
-		(*lower_file) = dentry_open(lower_dentry, lower_mnt,
-					    (O_RDONLY | O_LARGEFILE), cred);
-		if (IS_ERR(*lower_file)) {
-			rc = PTR_ERR(*req->lower_file);
-			(*lower_file) = NULL;
-			printk(KERN_WARNING "%s: Error attempting privileged "
-			       "open of lower file with either RW or RO "
-			       "perms; rc = [%d]. Giving up.\n",
-			       __func__, rc);
-		}
-	}
 out_unlock:
 	mutex_unlock(&req->mux);
 out_free:
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9f0aa9883c28..c6ac85d6c701 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
 #include <linux/key.h>
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
+#include <linux/ima.h>
 #include "ecryptfs_kernel.h"
 
 /**
@@ -118,6 +119,7 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
 	const struct cred *cred = current_cred();
 	struct ecryptfs_inode_info *inode_info =
 		ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
+	int opened_lower_file = 0;
 	int rc = 0;
 
 	mutex_lock(&inode_info->lower_file_mutex);
@@ -129,15 +131,17 @@
 		lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
 		rc = ecryptfs_privileged_open(&inode_info->lower_file,
 					      lower_dentry, lower_mnt, cred);
-		if (rc || IS_ERR(inode_info->lower_file)) {
+		if (rc) {
 			printk(KERN_ERR "Error opening lower persistent file "
 			       "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
 			       "rc = [%d]\n", lower_dentry, lower_mnt, rc);
-			rc = PTR_ERR(inode_info->lower_file);
 			inode_info->lower_file = NULL;
-		}
+		} else
+			opened_lower_file = 1;
 	}
 	mutex_unlock(&inode_info->lower_file_mutex);
+	if (opened_lower_file)
+		ima_counts_get(inode_info->lower_file);
 	return rc;
 }
 
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 05772aeaa8f4..df4ce99d0597 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -396,9 +396,11 @@ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode)
 	rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0,
 				  sizeof(u64));
 	kfree(file_size_virt);
-	if (rc)
+	if (rc < 0)
 		printk(KERN_ERR "%s: Error writing file size to header; "
 		       "rc = [%d]\n", __func__, rc);
+	else
+		rc = 0;
 out:
 	return rc;
 }
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index a137c6ea2fee..0cc4fafd6552 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -34,15 +34,14 @@
  *
  * Write data to the lower file.
  *
- * Returns zero on success; non-zero on error
+ * Returns bytes written on success; less than zero on error
  */
 int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
 			 loff_t offset, size_t size)
 {
 	struct ecryptfs_inode_info *inode_info;
-	ssize_t octets_written;
 	mm_segment_t fs_save;
-	int rc = 0;
+	ssize_t rc;
 
 	inode_info = ecryptfs_inode_to_private(ecryptfs_inode);
 	mutex_lock(&inode_info->lower_file_mutex);
@@ -50,14 +49,9 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
 	inode_info->lower_file->f_pos = offset;
 	fs_save = get_fs();
 	set_fs(get_ds());
-	octets_written = vfs_write(inode_info->lower_file, data, size,
-				   &inode_info->lower_file->f_pos);
+	rc = vfs_write(inode_info->lower_file, data, size,
+		       &inode_info->lower_file->f_pos);
 	set_fs(fs_save);
-	if (octets_written < 0) {
-		printk(KERN_ERR "%s: octets_written = [%td]; "
-		       "expected [%td]\n", __func__, octets_written, size);
-		rc = -EINVAL;
-	}
 	mutex_unlock(&inode_info->lower_file_mutex);
 	mark_inode_dirty_sync(ecryptfs_inode);
 	return rc;
@@ -91,6 +85,8 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
 					  + offset_in_page);
 	virt = kmap(page_for_lower);
 	rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
+	if (rc > 0)
+		rc = 0;
 	kunmap(page_for_lower);
 	return rc;
 }
@@ -229,30 +225,24 @@ out:
  * Read @size bytes of data at byte offset @offset from the lower
  * inode into memory location @data.
  *
- * Returns zero on success; non-zero on error
+ * Returns bytes read on success; 0 on EOF; less than zero on error
  */
 int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
 			struct inode *ecryptfs_inode)
 {
 	struct ecryptfs_inode_info *inode_info =
 		ecryptfs_inode_to_private(ecryptfs_inode);
-	ssize_t octets_read;
 	mm_segment_t fs_save;
-	int rc = 0;
+	ssize_t rc;
 
 	mutex_lock(&inode_info->lower_file_mutex);
 	BUG_ON(!inode_info->lower_file);
 	inode_info->lower_file->f_pos = offset;
 	fs_save = get_fs();
 	set_fs(get_ds());
-	octets_read = vfs_read(inode_info->lower_file, data, size,
-			       &inode_info->lower_file->f_pos);
+	rc = vfs_read(inode_info->lower_file, data, size,
		      &inode_info->lower_file->f_pos);
 	set_fs(fs_save);
-	if (octets_read < 0) {
-		printk(KERN_ERR "%s: octets_read = [%td]; "
-		       "expected [%td]\n", __func__, octets_read, size);
-		rc = -EINVAL;
-	}
 	mutex_unlock(&inode_info->lower_file_mutex);
 	return rc;
 }
@@ -284,6 +274,8 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
 	offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page);
 	virt = kmap(page_for_ecryptfs);
 	rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
+	if (rc > 0)
+		rc = 0;
 	kunmap(page_for_ecryptfs);
 	flush_dcache_page(page_for_ecryptfs);
 	return rc;
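
ecryptfs_read_lower()/ecryptfs_write_lower() now return the vfs_read()/vfs_write() result directly instead of collapsing it to 0 or -EINVAL, so each caller decides what a positive byte count means. The page-segment wrappers above translate it back to the old zero-on-success convention; a caller under the new convention looks like:

	rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
	if (rc < 0)
		return rc;	/* genuine error propagated from vfs_read() */
	if (rc > 0)
		rc = 0;		/* bytes were transferred; report success */
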
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 12d649602d3a..b15a43a80ab7 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -77,7 +77,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
 	struct ecryptfs_inode_info *inode_info;
 
 	inode_info = ecryptfs_inode_to_private(inode);
-	mutex_lock(&inode_info->lower_file_mutex);
 	if (inode_info->lower_file) {
 		struct dentry *lower_dentry =
 			inode_info->lower_file->f_dentry;
@@ -89,7 +88,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
 			d_drop(lower_dentry);
 		}
 	}
-	mutex_unlock(&inode_info->lower_file_mutex);
 	ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
 	kmem_cache_free(ecryptfs_inode_info_cache, inode_info);
 }
diff --git a/fs/exec.c b/fs/exec.c
index d49be6bc1793..ba112bd4a339 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -624,10 +624,8 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	/* Move stack pages down in memory. */
 	if (stack_shift) {
 		ret = shift_arg_pages(vma, stack_shift);
-		if (ret) {
-			up_write(&mm->mmap_sem);
-			return ret;
-		}
+		if (ret)
+			goto out_unlock;
 	}
 
 #ifdef CONFIG_STACK_GROWSUP
@@ -641,7 +639,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
 
 out_unlock:
 	up_write(&mm->mmap_sem);
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(setup_arg_pages);
 
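
The setup_arg_pages() fix converts an open-coded unlock-and-return into the standard single-exit idiom, so mmap_sem is released in exactly one place and the function now reports ret instead of an unconditional 0. Reduced to its shape:

	down_write(&mm->mmap_sem);
	ret = shift_arg_pages(vma, stack_shift);
	if (ret)
		goto out_unlock;	/* no separate up_write() on this path */
	ret = 0;
out_unlock:
	up_write(&mm->mmap_sem);
	return ret;			/* 0 on success, -errno on failure */
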
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 451d166bbe93..8209f266e9ad 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -46,19 +46,21 @@
 int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
+	struct ext3_inode_info *ei = EXT3_I(inode);
+	journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
 	int ret = 0;
+	tid_t commit_tid;
+
+	if (inode->i_sb->s_flags & MS_RDONLY)
+		return 0;
 
 	J_ASSERT(ext3_journal_current_handle() == NULL);
 
 	/*
-	 * data=writeback:
+	 * data=writeback,ordered:
 	 * The caller's filemap_fdatawrite()/wait will sync the data.
-	 * sync_inode() will sync the metadata
-	 *
-	 * data=ordered:
-	 * The caller's filemap_fdatawrite() will write the data and
-	 * sync_inode() will write the inode if it is dirty. Then the caller's
-	 * filemap_fdatawait() will wait on the pages.
+	 * Metadata is in the journal, we wait for a proper transaction
+	 * to commit here.
 	 *
 	 * data=journal:
 	 * filemap_fdatawrite won't do anything (the buffers are clean).
@@ -73,22 +75,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		goto flush;
+	if (datasync)
+		commit_tid = atomic_read(&ei->i_datasync_tid);
+	else
+		commit_tid = atomic_read(&ei->i_sync_tid);
 
-	/*
-	 * The VFS has written the file data. If the inode is unaltered
-	 * then we need not start a commit.
-	 */
-	if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
-		struct writeback_control wbc = {
-			.sync_mode = WB_SYNC_ALL,
-			.nr_to_write = 0, /* sys_fsync did this */
-		};
-		ret = sync_inode(inode, &wbc);
+	if (log_start_commit(journal, commit_tid)) {
+		log_wait_commit(journal, commit_tid);
 		goto out;
 	}
-flush:
+
 	/*
 	 * In case we didn't commit a transaction, we have to flush
 	 * disk caches manually so that data really is on persistent
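
ext3_sync_file() now keys on cached per-inode transaction ids instead of calling sync_inode(): i_sync_tid tracks the last transaction that touched the inode's metadata, i_datasync_tid the one fdatasync must not skip, and both are kept current by the inode.c hunks below. The commit-and-wait core reduces to this sketch:

	commit_tid = datasync ? atomic_read(&ei->i_datasync_tid)
			      : atomic_read(&ei->i_sync_tid);

	/* log_start_commit() returns nonzero only if that transaction
	 * still had to be committed */
	if (log_start_commit(journal, commit_tid)) {
		log_wait_commit(journal, commit_tid); /* block until on disk */
		goto out;	/* committed; skip the manual cache flush */
	}
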
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index acf1b1423327..354ed3b47b30 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -699,8 +699,9 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 	int err = 0;
 	struct ext3_block_alloc_info *block_i;
 	ext3_fsblk_t current_block;
+	struct ext3_inode_info *ei = EXT3_I(inode);
 
-	block_i = EXT3_I(inode)->i_block_alloc_info;
+	block_i = ei->i_block_alloc_info;
 	/*
 	 * If we're splicing into a [td]indirect block (as opposed to the
 	 * inode) then we need to get write access to the [td]indirect block
@@ -741,6 +742,8 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode,
 
 	inode->i_ctime = CURRENT_TIME_SEC;
 	ext3_mark_inode_dirty(handle, inode);
+	/* ext3_mark_inode_dirty already updated i_sync_tid */
+	atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
 
 	/* had we spliced it onto indirect block? */
 	if (where->bh) {
@@ -1735,6 +1738,7 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
 	ssize_t ret;
 	int orphan = 0;
 	size_t count = iov_length(iov, nr_segs);
+	int retries = 0;
 
 	if (rw == WRITE) {
 		loff_t final_size = offset + count;
@@ -1757,9 +1761,12 @@
 		}
 	}
 
+retry:
 	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext3_get_block, NULL);
+	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
 
 	if (orphan) {
 		int err;
@@ -2750,6 +2757,8 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
 	struct ext3_inode_info *ei;
 	struct buffer_head *bh;
 	struct inode *inode;
+	journal_t *journal = EXT3_SB(sb)->s_journal;
+	transaction_t *transaction;
 	long ret;
 	int block;
 
@@ -2827,6 +2836,30 @@
 		ei->i_data[block] = raw_inode->i_block[block];
 	INIT_LIST_HEAD(&ei->i_orphan);
 
+	/*
+	 * Set transaction id's of transactions that have to be committed
+	 * to finish f[data]sync. We set them to currently running transaction
+	 * as we cannot be sure that the inode or some of its metadata isn't
+	 * part of the transaction - the inode could have been reclaimed and
+	 * now it is reread from disk.
+	 */
+	if (journal) {
+		tid_t tid;
+
+		spin_lock(&journal->j_state_lock);
+		if (journal->j_running_transaction)
+			transaction = journal->j_running_transaction;
+		else
+			transaction = journal->j_committing_transaction;
+		if (transaction)
+			tid = transaction->t_tid;
+		else
+			tid = journal->j_commit_sequence;
+		spin_unlock(&journal->j_state_lock);
+		atomic_set(&ei->i_sync_tid, tid);
+		atomic_set(&ei->i_datasync_tid, tid);
+	}
+
 	if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
 	    EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
 		/*
@@ -3011,6 +3044,7 @@ again:
 		err = rc;
 	ei->i_state &= ~EXT3_STATE_NEW;
 
+	atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
 out_brelse:
 	brelse (bh);
 	ext3_std_error(inode->i_sb, err);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 72743d360509..427496c4767c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -466,6 +466,8 @@ static struct inode *ext3_alloc_inode(struct super_block *sb)
466 return NULL; 466 return NULL;
467 ei->i_block_alloc_info = NULL; 467 ei->i_block_alloc_info = NULL;
468 ei->vfs_inode.i_version = 1; 468 ei->vfs_inode.i_version = 1;
469 atomic_set(&ei->i_datasync_tid, 0);
470 atomic_set(&ei->i_sync_tid, 0);
469 return &ei->vfs_inode; 471 return &ei->vfs_inode;
470} 472}
471 473
@@ -2321,7 +2323,18 @@ static int ext3_commit_super(struct super_block *sb,
2321 2323
2322 if (!sbh) 2324 if (!sbh)
2323 return error; 2325 return error;
2324 es->s_wtime = cpu_to_le32(get_seconds()); 2326 /*
2327 * If the file system is mounted read-only, don't update the
2328 * superblock write time. This avoids updating the superblock
2329 * write time when we are mounting the root file system
2330 * read/only but we need to replay the journal; at that point,
2331 * for people who are east of GMT and who make their clock
2332 * tick in localtime for Windows bug-for-bug compatibility,
2333 * the clock is set in the future, and this will cause e2fsck
2334 * to complain and force a full file system check.
2335 */
2336 if (!(sb->s_flags & MS_RDONLY))
2337 es->s_wtime = cpu_to_le32(get_seconds());
2325 es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb)); 2338 es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
2326 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb)); 2339 es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
2327 BUFFER_TRACE(sbh, "marking dirty"); 2340 BUFFER_TRACE(sbh, "marking dirty");
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index d5c0ea2e8f2d..9f2d45d75b1a 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,20 +26,6 @@ config EXT4_FS
26 26
27 If unsure, say N. 27 If unsure, say N.
28 28
29config EXT4DEV_COMPAT
30 bool "Enable ext4dev compatibility"
31 depends on EXT4_FS
32 help
33 Starting with 2.6.28, the name of the ext4 filesystem was
34 renamed from ext4dev to ext4. Unfortunately there are some
35 legacy userspace programs (such as klibc's fstype) have
36 "ext4dev" hardcoded.
37
38 To enable backwards compatibility so that systems that are
39 still expecting to mount ext4 filesystems using ext4dev,
40 choose Y here. This feature will go away by 2.6.31, so
41 please arrange to get your userspace programs fixed!
42
43config EXT4_FS_XATTR 29config EXT4_FS_XATTR
44 bool "Ext4 extended attributes" 30 bool "Ext4 extended attributes"
45 depends on EXT4_FS 31 depends on EXT4_FS
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e227eea23f05..8825515eeddd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t;
65/* data type for block group number */ 65/* data type for block group number */
66typedef unsigned int ext4_group_t; 66typedef unsigned int ext4_group_t;
67 67
68/*
69 * Flags used in mballoc's allocation_context flags field.
70 *
71 * Also used to show what's going on for debugging purposes when the
72 * flag field is exported via the tracepoint interface
73 */
68 74
69/* prefer goal again. length */ 75/* prefer goal again. length */
70#define EXT4_MB_HINT_MERGE 0x0001 76#define EXT4_MB_HINT_MERGE 0x0001
@@ -127,6 +133,16 @@ struct mpage_da_data {
127 int pages_written; 133 int pages_written;
128 int retval; 134 int retval;
129}; 135};
136#define DIO_AIO_UNWRITTEN 0x1
137typedef struct ext4_io_end {
138 struct list_head list; /* per-file finished AIO list */
139 struct inode *inode; /* file being written to */
140 unsigned int flag; /* unwritten or not */
141 int error; /* I/O error code */
142 ext4_lblk_t offset; /* offset in the file */
143 size_t size; /* size of the extent */
144 struct work_struct work; /* data work queue */
145} ext4_io_end_t;
130 146
131/* 147/*
132 * Special inodes numbers 148 * Special inodes numbers
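
The ext4_io_end_t introduced above carries one async DIO write's state from completion back to process context. A rough userspace sketch of its lifecycle as used later in this patch (allocate at submit time, flag it if the write landed in an unwritten extent, convert and clear afterwards); the field names mirror the patch but the harness is illustrative:

    #include <stdio.h>
    #include <stdlib.h>

    #define DIO_AIO_UNWRITTEN 0x1

    struct io_end {
        unsigned int flag;   /* set when conversion to written is needed */
        long long offset;    /* byte range covered by the completed write */
        long long size;
    };

    /* Completion side: record the range and mark it for conversion. */
    static void end_io(struct io_end *io, long long off, long long sz,
                       int unwritten)
    {
        io->offset = off;
        io->size = sz;
        io->flag = unwritten ? DIO_AIO_UNWRITTEN : 0;
    }

    int main(void)
    {
        struct io_end *io = calloc(1, sizeof(*io));
        if (!io)
            return 1;
        end_io(io, 4096, 8192, 1);
        if (io->flag == DIO_AIO_UNWRITTEN)
            printf("convert [%lld, %lld) to written\n",
                   io->offset, io->offset + io->size);
        io->flag = 0;  /* cleared once converted, as in the real end_io work */
        free(io);
        return 0;
    }

In the kernel version the structure also embeds a work_struct and hangs off the per-inode i_aio_dio_complete_list, so fsync can find conversions that the workqueue has not run yet.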
@@ -306,6 +322,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
306#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ 322#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
307#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ 323#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
308#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ 324#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
325#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
309 326
310/* Used to pass group descriptor data when online resize is done */ 327/* Used to pass group descriptor data when online resize is done */
311struct ext4_new_group_input { 328struct ext4_new_group_input {
@@ -347,7 +364,16 @@ struct ext4_new_group_data {
347 /* Call ext4_da_update_reserve_space() after successfully 364 /* Call ext4_da_update_reserve_space() after successfully
348 allocating the blocks */ 365 allocating the blocks */
349#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 366#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
350 367	/* caller is from the direct IO path; request creation of an
368	   uninitialized extent if not allocated, split the uninitialized
369	   extent if blocks have been preallocated already */
370#define EXT4_GET_BLOCKS_DIO 0x0010
371#define EXT4_GET_BLOCKS_CONVERT 0x0020
372#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
373 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
374 /* Convert extent to initialized after direct IO complete */
375#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
376 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
351 377
352/* 378/*
353 * ioctl commands 379 * ioctl commands
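
Numerically, the new modes compose by OR: assuming the header's existing values EXT4_GET_BLOCKS_CREATE = 0x0001 and EXT4_GET_BLOCKS_UNINIT_EXT = 0x0002, DIO_CREATE_EXT works out to 0x13 and DIO_CONVERT_EXT to 0x33. Since the code added later in this patch compares these with == rather than with a bitmask, each composite acts as a distinct mode. A quick standalone check:

    #include <assert.h>
    #include <stdio.h>

    /* CREATE and UNINIT_EXT are assumed to keep their existing
     * definitions (0x0001 and 0x0002) from the surrounding header. */
    #define EXT4_GET_BLOCKS_CREATE            0x0001
    #define EXT4_GET_BLOCKS_UNINIT_EXT        0x0002
    #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT \
            (EXT4_GET_BLOCKS_UNINIT_EXT | EXT4_GET_BLOCKS_CREATE)
    #define EXT4_GET_BLOCKS_DIO               0x0010
    #define EXT4_GET_BLOCKS_CONVERT           0x0020
    #define EXT4_GET_BLOCKS_DIO_CREATE_EXT \
            (EXT4_GET_BLOCKS_DIO | EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
    #define EXT4_GET_BLOCKS_DIO_CONVERT_EXT \
            (EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_DIO_CREATE_EXT)

    int main(void)
    {
        /* the composed modes are distinct, so equality tests can tell
         * "DIO submit" apart from "DIO completion convert" */
        assert(EXT4_GET_BLOCKS_DIO_CREATE_EXT == 0x13);
        assert(EXT4_GET_BLOCKS_DIO_CONVERT_EXT == 0x33);
        printf("create=0x%x convert=0x%x\n",
               EXT4_GET_BLOCKS_DIO_CREATE_EXT,
               EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
        return 0;
    }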
@@ -500,8 +526,8 @@ struct move_extent {
500static inline __le32 ext4_encode_extra_time(struct timespec *time) 526static inline __le32 ext4_encode_extra_time(struct timespec *time)
501{ 527{
502 return cpu_to_le32((sizeof(time->tv_sec) > 4 ? 528 return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
503 time->tv_sec >> 32 : 0) | 529 (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
504 ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); 530 ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
505} 531}
506 532
507static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) 533static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
@@ -509,7 +535,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
509 if (sizeof(time->tv_sec) > 4) 535 if (sizeof(time->tv_sec) > 4)
510 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) 536 time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
511 << 32; 537 << 32;
512 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; 538 time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
513} 539}
514 540
515#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ 541#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
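
The fix above masks the epoch bits before merging and replaces the magic shift of 2 with EXT4_EPOCH_BITS. Assuming the usual definitions (EXT4_EPOCH_BITS = 2, so EXT4_EPOCH_MASK = 0x3 and EXT4_NSEC_MASK = ~0x3), the 32-bit extra field keeps the two high-order epoch bits of tv_sec in bits 0-1 and the nanoseconds in bits 2-31. A standalone round trip of that packing (the cpu_to_le32/le32_to_cpu conversions are omitted):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define EXT4_EPOCH_BITS 2
    #define EXT4_EPOCH_MASK ((1u << EXT4_EPOCH_BITS) - 1)  /* 0x3 */
    #define EXT4_NSEC_MASK  (~0u << EXT4_EPOCH_BITS)       /* 0xfffffffc */

    static uint32_t encode_extra(int64_t sec, uint32_t nsec)
    {
        return (uint32_t)((sec >> 32) & EXT4_EPOCH_MASK) |
               ((nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK);
    }

    static void decode_extra(uint32_t extra, int64_t *sec, uint32_t *nsec)
    {
        *sec |= (int64_t)(extra & EXT4_EPOCH_MASK) << 32;
        *nsec = (extra & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
    }

    int main(void)
    {
        int64_t sec = ((int64_t)1 << 32) + 12345;  /* a post-2038 timestamp */
        uint32_t extra = encode_extra(sec, 999999999);
        int64_t out_sec = (uint32_t)sec;  /* low 32 bits stored separately */
        uint32_t out_nsec;

        decode_extra(extra, &out_sec, &out_nsec);
        assert(out_sec == sec && out_nsec == 999999999);
        printf("extra=0x%08x round-trips\n", extra);
        return 0;
    }

Without the new mask, any tv_sec beyond the two representable epoch bits would have smeared into the nanosecond field on encode.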
@@ -672,6 +698,11 @@ struct ext4_inode_info {
672 __u16 i_extra_isize; 698 __u16 i_extra_isize;
673 699
674 spinlock_t i_block_reservation_lock; 700 spinlock_t i_block_reservation_lock;
701
702 /* completed async DIOs that might need unwritten extents handling */
703 struct list_head i_aio_dio_complete_list;
704 /* current io_end structure for async DIO write*/
705 ext4_io_end_t *cur_aio_dio;
675}; 706};
676 707
677/* 708/*
@@ -713,6 +744,7 @@ struct ext4_inode_info {
713#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 744#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
714#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 745#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
715#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 746#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
747#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
716#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 748#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
717#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 749#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
718#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 750#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
@@ -942,18 +974,11 @@ struct ext4_sb_info {
942 unsigned int s_mb_stats; 974 unsigned int s_mb_stats;
943 unsigned int s_mb_order2_reqs; 975 unsigned int s_mb_order2_reqs;
944 unsigned int s_mb_group_prealloc; 976 unsigned int s_mb_group_prealloc;
977 unsigned int s_max_writeback_mb_bump;
945 /* where last allocation was done - for stream allocation */ 978 /* where last allocation was done - for stream allocation */
946 unsigned long s_mb_last_group; 979 unsigned long s_mb_last_group;
947 unsigned long s_mb_last_start; 980 unsigned long s_mb_last_start;
948 981
949 /* history to debug policy */
950 struct ext4_mb_history *s_mb_history;
951 int s_mb_history_cur;
952 int s_mb_history_max;
953 int s_mb_history_num;
954 spinlock_t s_mb_history_lock;
955 int s_mb_history_filter;
956
957 /* stats for buddy allocator */ 982 /* stats for buddy allocator */
958 spinlock_t s_mb_pa_lock; 983 spinlock_t s_mb_pa_lock;
959 atomic_t s_bal_reqs; /* number of reqs with len > 1 */ 984 atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -980,6 +1005,9 @@ struct ext4_sb_info {
980 1005
981 unsigned int s_log_groups_per_flex; 1006 unsigned int s_log_groups_per_flex;
982 struct flex_groups *s_flex_groups; 1007 struct flex_groups *s_flex_groups;
1008
1009 /* workqueue for dio unwritten */
1010 struct workqueue_struct *dio_unwritten_wq;
983}; 1011};
984 1012
985static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1013static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1397,7 +1425,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
1397 struct address_space *mapping, loff_t from); 1425 struct address_space *mapping, loff_t from);
1398extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1426extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1399extern qsize_t ext4_get_reserved_space(struct inode *inode); 1427extern qsize_t ext4_get_reserved_space(struct inode *inode);
1400 1428extern int flush_aio_dio_completed_IO(struct inode *inode);
1401/* ioctl.c */ 1429/* ioctl.c */
1402extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1430extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1403extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); 1431extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1699,6 +1727,8 @@ extern void ext4_ext_init(struct super_block *);
1699extern void ext4_ext_release(struct super_block *); 1727extern void ext4_ext_release(struct super_block *);
1700extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, 1728extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1701 loff_t len); 1729 loff_t len);
1730extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1731 loff_t len);
1702extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1732extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1703 sector_t block, unsigned int max_blocks, 1733 sector_t block, unsigned int max_blocks,
1704 struct buffer_head *bh, int flags); 1734 struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 61652f1d15e6..2ca686454e87 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
220 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); 220 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
221} 221}
222 222
223static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
224{
225 ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
226}
227
223extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); 228extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
224extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); 229extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
225extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); 230extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
235 struct ext4_ext_path *path, 240 struct ext4_ext_path *path,
236 struct ext4_extent *); 241 struct ext4_extent *);
237extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); 242extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
238extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); 243extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
239extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, 244extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
240 ext_prepare_callback, void *); 245 ext_prepare_callback, void *);
241extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, 246extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 139fb8cb87e4..a2865980342f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
161handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 161handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
162int __ext4_journal_stop(const char *where, handle_t *handle); 162int __ext4_journal_stop(const char *where, handle_t *handle);
163 163
164#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) 164#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
165 165
166/* Note: Do not use this for NULL handles. This is only to determine if
167 * a properly allocated handle is using a journal or not. */
166static inline int ext4_handle_valid(handle_t *handle) 168static inline int ext4_handle_valid(handle_t *handle)
167{ 169{
168 if (handle == EXT4_NOJOURNAL_HANDLE) 170 if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
169 return 0; 171 return 0;
170 return 1; 172 return 1;
171} 173}
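
The constant above widens the old single magic value into a range: any handle whose pointer value is below 4096 is now treated as a no-journal pseudo-handle, which leaves room to store a small reference count directly in the handle bits (no valid kernel pointer ever falls within the first page). A userspace illustration of the test, with names abbreviated:

    #include <assert.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct handle { int dummy; } handle_t;

    #define NOJOURNAL_MAX_REF_COUNT 4096UL

    /* Small integers double as no-journal "handles": no real allocation
     * ever lands in the first page of the address space. */
    static int handle_valid(handle_t *h)
    {
        return (unsigned long)h >= NOJOURNAL_MAX_REF_COUNT;
    }

    int main(void)
    {
        handle_t *fake = (handle_t *)2UL;        /* no-journal, refcount 2 */
        handle_t *real = malloc(sizeof(*real));  /* properly allocated */

        assert(!handle_valid(fake));
        assert(handle_valid(real));
        printf("fake handle encodes refcount %lu\n", (unsigned long)fake);
        free(real);
        return 0;
    }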
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7a3832577923..715264b4bae4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -723,7 +723,7 @@ err:
723 * insert new index [@logical;@ptr] into the block at @curp; 723 * insert new index [@logical;@ptr] into the block at @curp;
724 * check where to insert: before @curp or after @curp 724 * check where to insert: before @curp or after @curp
725 */ 725 */
726static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, 726int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
727 struct ext4_ext_path *curp, 727 struct ext4_ext_path *curp,
728 int logical, ext4_fsblk_t ptr) 728 int logical, ext4_fsblk_t ptr)
729{ 729{
@@ -1586,7 +1586,7 @@ out:
1586 */ 1586 */
1587int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, 1587int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1588 struct ext4_ext_path *path, 1588 struct ext4_ext_path *path,
1589 struct ext4_extent *newext) 1589 struct ext4_extent *newext, int flag)
1590{ 1590{
1591 struct ext4_extent_header *eh; 1591 struct ext4_extent_header *eh;
1592 struct ext4_extent *ex, *fex; 1592 struct ext4_extent *ex, *fex;
@@ -1602,7 +1602,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1602 BUG_ON(path[depth].p_hdr == NULL); 1602 BUG_ON(path[depth].p_hdr == NULL);
1603 1603
1604 /* try to insert block into found extent and return */ 1604 /* try to insert block into found extent and return */
1605 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { 1605 if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1606 && ext4_can_extents_be_merged(inode, ex, newext)) {
1606 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", 1607 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1607 ext4_ext_is_uninitialized(newext), 1608 ext4_ext_is_uninitialized(newext),
1608 ext4_ext_get_actual_len(newext), 1609 ext4_ext_get_actual_len(newext),
@@ -1722,7 +1723,8 @@ has_space:
1722 1723
1723merge: 1724merge:
1724 /* try to merge extents to the right */ 1725 /* try to merge extents to the right */
1725 ext4_ext_try_to_merge(inode, path, nearex); 1726 if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
1727 ext4_ext_try_to_merge(inode, path, nearex);
1726 1728
1727 /* try to merge extents to the left */ 1729 /* try to merge extents to the left */
1728 1730
@@ -2378,6 +2380,7 @@ void ext4_ext_init(struct super_block *sb)
2378 */ 2380 */
2379 2381
2380 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 2382 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
2383#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
2381 printk(KERN_INFO "EXT4-fs: file extents enabled"); 2384 printk(KERN_INFO "EXT4-fs: file extents enabled");
2382#ifdef AGGRESSIVE_TEST 2385#ifdef AGGRESSIVE_TEST
2383 printk(", aggressive tests"); 2386 printk(", aggressive tests");
@@ -2389,6 +2392,7 @@ void ext4_ext_init(struct super_block *sb)
2389 printk(", stats"); 2392 printk(", stats");
2390#endif 2393#endif
2391 printk("\n"); 2394 printk("\n");
2395#endif
2392#ifdef EXTENTS_STATS 2396#ifdef EXTENTS_STATS
2393 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock); 2397 spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
2394 EXT4_SB(sb)->s_ext_min = 1 << 30; 2398 EXT4_SB(sb)->s_ext_min = 1 << 30;
@@ -2490,7 +2494,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2490} 2494}
2491 2495
2492#define EXT4_EXT_ZERO_LEN 7 2496#define EXT4_EXT_ZERO_LEN 7
2493
2494/* 2497/*
2495 * This function is called by ext4_ext_get_blocks() if someone tries to write 2498 * This function is called by ext4_ext_get_blocks() if someone tries to write
2496 * to an uninitialized extent. It may result in splitting the uninitialized 2499 * to an uninitialized extent. It may result in splitting the uninitialized
@@ -2583,7 +2586,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2583 ex3->ee_block = cpu_to_le32(iblock); 2586 ex3->ee_block = cpu_to_le32(iblock);
2584 ext4_ext_store_pblock(ex3, newblock); 2587 ext4_ext_store_pblock(ex3, newblock);
2585 ex3->ee_len = cpu_to_le16(allocated); 2588 ex3->ee_len = cpu_to_le16(allocated);
2586 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2589 err = ext4_ext_insert_extent(handle, inode, path,
2590 ex3, 0);
2587 if (err == -ENOSPC) { 2591 if (err == -ENOSPC) {
2588 err = ext4_ext_zeroout(inode, &orig_ex); 2592 err = ext4_ext_zeroout(inode, &orig_ex);
2589 if (err) 2593 if (err)
@@ -2639,7 +2643,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2639 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2643 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2640 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2644 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2641 ext4_ext_mark_uninitialized(ex3); 2645 ext4_ext_mark_uninitialized(ex3);
2642 err = ext4_ext_insert_extent(handle, inode, path, ex3); 2646 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2643 if (err == -ENOSPC) { 2647 if (err == -ENOSPC) {
2644 err = ext4_ext_zeroout(inode, &orig_ex); 2648 err = ext4_ext_zeroout(inode, &orig_ex);
2645 if (err) 2649 if (err)
@@ -2757,7 +2761,192 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2757 err = ext4_ext_dirty(handle, inode, path + depth); 2761 err = ext4_ext_dirty(handle, inode, path + depth);
2758 goto out; 2762 goto out;
2759insert: 2763insert:
2760 err = ext4_ext_insert_extent(handle, inode, path, &newex); 2764 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2765 if (err == -ENOSPC) {
2766 err = ext4_ext_zeroout(inode, &orig_ex);
2767 if (err)
2768 goto fix_extent_len;
2769 /* update the extent length and mark as initialized */
2770 ex->ee_block = orig_ex.ee_block;
2771 ex->ee_len = orig_ex.ee_len;
2772 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2773 ext4_ext_dirty(handle, inode, path + depth);
2774 /* zero out the first half */
2775 return allocated;
2776 } else if (err)
2777 goto fix_extent_len;
2778out:
2779 ext4_ext_show_leaf(inode, path);
2780 return err ? err : allocated;
2781
2782fix_extent_len:
2783 ex->ee_block = orig_ex.ee_block;
2784 ex->ee_len = orig_ex.ee_len;
2785 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2786 ext4_ext_mark_uninitialized(ex);
2787 ext4_ext_dirty(handle, inode, path + depth);
2788 return err;
2789}
2790
2791/*
2792 * This function is called by ext4_ext_get_blocks() from
2793 * ext4_get_blocks_dio_write() when DIO to write
2794 * to an uninitialized extent.
2795 *
2796 * Writing to an uninitialized extent may result in splitting the
2797 * uninitialized extent into multiple initialized/uninitialized extents
2798 * (up to three). There are three possibilities:
2799 *   a> There is no split required: the entire extent should be uninitialized
2800 *   b> Split into two extents: the write happens at either end of the extent
2801 *   c> Split into three extents: someone is writing in the middle of the extent
2802 *
2803 * One or more index blocks may be needed if the extent tree grows after
2804 * the uninitialized extent is split. To prevent ENOSPC from occurring at IO
2805 * completion, we split the uninitialized extent before the DIO submits
2806 * the IO. The uninitialized extent passed in at this time is split
2807 * into three uninitialized extents (at most). After the IO completes, the part
2808 * being filled will be converted to initialized by the end_io callback function
2809 * via ext4_convert_unwritten_extents().
2810 *
2811 * Returns the size of the uninitialized extent to be written on success.
2812 */
2813static int ext4_split_unwritten_extents(handle_t *handle,
2814 struct inode *inode,
2815 struct ext4_ext_path *path,
2816 ext4_lblk_t iblock,
2817 unsigned int max_blocks,
2818 int flags)
2819{
2820 struct ext4_extent *ex, newex, orig_ex;
2821 struct ext4_extent *ex1 = NULL;
2822 struct ext4_extent *ex2 = NULL;
2823 struct ext4_extent *ex3 = NULL;
2824 struct ext4_extent_header *eh;
2825 ext4_lblk_t ee_block;
2826 unsigned int allocated, ee_len, depth;
2827 ext4_fsblk_t newblock;
2828 int err = 0;
2829
2830	ext_debug("ext4_split_unwritten_extents: inode %lu, "
2831 "iblock %llu, max_blocks %u\n", inode->i_ino,
2832 (unsigned long long)iblock, max_blocks);
2833 depth = ext_depth(inode);
2834 eh = path[depth].p_hdr;
2835 ex = path[depth].p_ext;
2836 ee_block = le32_to_cpu(ex->ee_block);
2837 ee_len = ext4_ext_get_actual_len(ex);
2838 allocated = ee_len - (iblock - ee_block);
2839 newblock = iblock - ee_block + ext_pblock(ex);
2840 ex2 = ex;
2841 orig_ex.ee_block = ex->ee_block;
2842 orig_ex.ee_len = cpu_to_le16(ee_len);
2843 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2844
2845 /*
2846 * If the uninitialized extent begins at the same logical
2847 * block where the write begins, and the write completely
2848 * covers the extent, then we don't need to split it.
2849 */
2850 if ((iblock == ee_block) && (allocated <= max_blocks))
2851 return allocated;
2852
2853 err = ext4_ext_get_access(handle, inode, path + depth);
2854 if (err)
2855 goto out;
2856 /* ex1: ee_block to iblock - 1 : uninitialized */
2857 if (iblock > ee_block) {
2858 ex1 = ex;
2859 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2860 ext4_ext_mark_uninitialized(ex1);
2861 ex2 = &newex;
2862 }
2863 /*
2864 * for sanity, update the length of the ex2 extent before
2865 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2866 * overlap of blocks.
2867 */
2868 if (!ex1 && allocated > max_blocks)
2869 ex2->ee_len = cpu_to_le16(max_blocks);
2870 /* ex3: to ee_block + ee_len : uninitialised */
2871 if (allocated > max_blocks) {
2872 unsigned int newdepth;
2873 ex3 = &newex;
2874 ex3->ee_block = cpu_to_le32(iblock + max_blocks);
2875 ext4_ext_store_pblock(ex3, newblock + max_blocks);
2876 ex3->ee_len = cpu_to_le16(allocated - max_blocks);
2877 ext4_ext_mark_uninitialized(ex3);
2878 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2879 if (err == -ENOSPC) {
2880 err = ext4_ext_zeroout(inode, &orig_ex);
2881 if (err)
2882 goto fix_extent_len;
2883 /* update the extent length and mark as initialized */
2884 ex->ee_block = orig_ex.ee_block;
2885 ex->ee_len = orig_ex.ee_len;
2886 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2887 ext4_ext_dirty(handle, inode, path + depth);
2888 /* zeroed the full extent */
2889 /* blocks available from iblock */
2890 return allocated;
2891
2892 } else if (err)
2893 goto fix_extent_len;
2894 /*
2895 * The depth, and hence eh & ex might change
2896 * as part of the insert above.
2897 */
2898 newdepth = ext_depth(inode);
2899 /*
2900 * update the extent length after successful insert of the
2901 * split extent
2902 */
2903 orig_ex.ee_len = cpu_to_le16(ee_len -
2904 ext4_ext_get_actual_len(ex3));
2905 depth = newdepth;
2906 ext4_ext_drop_refs(path);
2907 path = ext4_ext_find_extent(inode, iblock, path);
2908 if (IS_ERR(path)) {
2909 err = PTR_ERR(path);
2910 goto out;
2911 }
2912 eh = path[depth].p_hdr;
2913 ex = path[depth].p_ext;
2914 if (ex2 != &newex)
2915 ex2 = ex;
2916
2917 err = ext4_ext_get_access(handle, inode, path + depth);
2918 if (err)
2919 goto out;
2920
2921 allocated = max_blocks;
2922 }
2923 /*
2924 * If there was a change of depth as part of the
2925 * insertion of ex3 above, we need to update the length
2926 * of the ex1 extent again here
2927 */
2928 if (ex1 && ex1 != ex) {
2929 ex1 = ex;
2930 ex1->ee_len = cpu_to_le16(iblock - ee_block);
2931 ext4_ext_mark_uninitialized(ex1);
2932 ex2 = &newex;
2933 }
2934 /*
2935	 * ex2: iblock to iblock + max_blocks - 1 : to be written by direct IO,
2936	 * still uninitialized.
2937 */
2938 ex2->ee_block = cpu_to_le32(iblock);
2939 ext4_ext_store_pblock(ex2, newblock);
2940 ex2->ee_len = cpu_to_le16(allocated);
2941 ext4_ext_mark_uninitialized(ex2);
2942 if (ex2 != ex)
2943 goto insert;
2944 /* Mark modified extent as dirty */
2945 err = ext4_ext_dirty(handle, inode, path + depth);
2946 ext_debug("out here\n");
2947 goto out;
2948insert:
2949 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2761 if (err == -ENOSPC) { 2950 if (err == -ENOSPC) {
2762 err = ext4_ext_zeroout(inode, &orig_ex); 2951 err = ext4_ext_zeroout(inode, &orig_ex);
2763 if (err) 2952 if (err)
@@ -2783,7 +2972,147 @@ fix_extent_len:
2783 ext4_ext_dirty(handle, inode, path + depth); 2972 ext4_ext_dirty(handle, inode, path + depth);
2784 return err; 2973 return err;
2785} 2974}
2975static int ext4_convert_unwritten_extents_dio(handle_t *handle,
2976 struct inode *inode,
2977 struct ext4_ext_path *path)
2978{
2979 struct ext4_extent *ex;
2980 struct ext4_extent_header *eh;
2981 int depth;
2982 int err = 0;
2983 int ret = 0;
2984
2985 depth = ext_depth(inode);
2986 eh = path[depth].p_hdr;
2987 ex = path[depth].p_ext;
2988
2989 err = ext4_ext_get_access(handle, inode, path + depth);
2990 if (err)
2991 goto out;
2992 /* first mark the extent as initialized */
2993 ext4_ext_mark_initialized(ex);
2994
2995 /*
2996 * We have to see if it can be merged with the extent
2997 * on the left.
2998 */
2999 if (ex > EXT_FIRST_EXTENT(eh)) {
3000 /*
3001 * To merge left, pass "ex - 1" to try_to_merge(),
3002 * since it merges towards right _only_.
3003 */
3004 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3005 if (ret) {
3006 err = ext4_ext_correct_indexes(handle, inode, path);
3007 if (err)
3008 goto out;
3009 depth = ext_depth(inode);
3010 ex--;
3011 }
3012 }
3013 /*
3014	 * Try to merge towards the right.
3015 */
3016 ret = ext4_ext_try_to_merge(inode, path, ex);
3017 if (ret) {
3018 err = ext4_ext_correct_indexes(handle, inode, path);
3019 if (err)
3020 goto out;
3021 depth = ext_depth(inode);
3022 }
3023 /* Mark modified extent as dirty */
3024 err = ext4_ext_dirty(handle, inode, path + depth);
3025out:
3026 ext4_ext_show_leaf(inode, path);
3027 return err;
3028}
3029
3030static int
3031ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3032 ext4_lblk_t iblock, unsigned int max_blocks,
3033 struct ext4_ext_path *path, int flags,
3034 unsigned int allocated, struct buffer_head *bh_result,
3035 ext4_fsblk_t newblock)
3036{
3037 int ret = 0;
3038 int err = 0;
3039 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3040
3041	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
3042 "block %llu, max_blocks %u, flags %d, allocated %u",
3043 inode->i_ino, (unsigned long long)iblock, max_blocks,
3044 flags, allocated);
3045 ext4_ext_show_leaf(inode, path);
2786 3046
3047 /* DIO get_block() before submit the IO, split the extent */
3048 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
3049 ret = ext4_split_unwritten_extents(handle,
3050 inode, path, iblock,
3051 max_blocks, flags);
3052 /*
3053		 * Flag the inode (non-AIO case) or the end_io struct (AIO case)
3054		 * so that this IO is converted to written when the IO is
3055		 * completed
3056 */
3057 if (io)
3058 io->flag = DIO_AIO_UNWRITTEN;
3059 else
3060 EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
3061 goto out;
3062 }
3063 /* async DIO end_io complete, convert the filled extent to written */
3064 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
3065 ret = ext4_convert_unwritten_extents_dio(handle, inode,
3066 path);
3067 goto out2;
3068 }
3069 /* buffered IO case */
3070 /*
3071 * repeat fallocate creation request
3072 * we already have an unwritten extent
3073 */
3074 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
3075 goto map_out;
3076
3077 /* buffered READ or buffered write_begin() lookup */
3078 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3079 /*
3080 * We have blocks reserved already. We
3081 * return allocated blocks so that delalloc
3082 * won't do block reservation for us. But
3083 * the buffer head will be unmapped so that
3084 * a read from the block returns 0s.
3085 */
3086 set_buffer_unwritten(bh_result);
3087 goto out1;
3088 }
3089
3090 /* buffered write, writepage time, convert*/
3091 ret = ext4_ext_convert_to_initialized(handle, inode,
3092 path, iblock,
3093 max_blocks);
3094out:
3095 if (ret <= 0) {
3096 err = ret;
3097 goto out2;
3098 } else
3099 allocated = ret;
3100 set_buffer_new(bh_result);
3101map_out:
3102 set_buffer_mapped(bh_result);
3103out1:
3104 if (allocated > max_blocks)
3105 allocated = max_blocks;
3106 ext4_ext_show_leaf(inode, path);
3107 bh_result->b_bdev = inode->i_sb->s_bdev;
3108 bh_result->b_blocknr = newblock;
3109out2:
3110 if (path) {
3111 ext4_ext_drop_refs(path);
3112 kfree(path);
3113 }
3114 return err ? err : allocated;
3115}
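
The helper above is essentially a five-way dispatch on flags. A schematic restatement (the macros and strings below are illustrative shorthand for the real cases, not kernel code):

    #include <stdio.h>

    #define GB_CREATE      0x0001
    #define GB_UNINIT_EXT  0x0002
    #define GB_DIO         0x0010
    #define GB_CONVERT     0x0020
    #define GB_DIO_CREATE  (GB_DIO | GB_UNINIT_EXT | GB_CREATE)
    #define GB_DIO_CONVERT (GB_CONVERT | GB_DIO_CREATE)

    /* Illustrative restatement of the flag dispatch in
     * ext4_ext_handle_uninitialized_extents(). */
    static const char *uninit_action(int flags)
    {
        if (flags == GB_DIO_CREATE)
            return "DIO submit: split extent, flag io_end/inode for conversion";
        if (flags == GB_DIO_CONVERT)
            return "AIO completed: convert filled extent to written";
        if (flags & GB_UNINIT_EXT)
            return "repeat fallocate: extent already unwritten, just map";
        if (!(flags & GB_CREATE))
            return "lookup/read: return unwritten buffer (reads see zeros)";
        return "buffered writepage: convert to initialized now";
    }

    int main(void)
    {
        int cases[] = { GB_DIO_CREATE, GB_DIO_CONVERT, GB_UNINIT_EXT,
                        0, GB_CREATE };
        for (int i = 0; i < 5; i++)
            printf("flags=0x%02x -> %s\n", cases[i], uninit_action(cases[i]));
        return 0;
    }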
2787/* 3116/*
2788 * Block allocation/map/preallocation routine for extents based files 3117 * Block allocation/map/preallocation routine for extents based files
2789 * 3118 *
@@ -2814,6 +3143,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2814 int err = 0, depth, ret, cache_type; 3143 int err = 0, depth, ret, cache_type;
2815 unsigned int allocated = 0; 3144 unsigned int allocated = 0;
2816 struct ext4_allocation_request ar; 3145 struct ext4_allocation_request ar;
3146 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
2817 3147
2818 __clear_bit(BH_New, &bh_result->b_state); 3148 __clear_bit(BH_New, &bh_result->b_state);
2819 ext_debug("blocks %u/%u requested for inode %lu\n", 3149 ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -2889,33 +3219,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2889 EXT4_EXT_CACHE_EXTENT); 3219 EXT4_EXT_CACHE_EXTENT);
2890 goto out; 3220 goto out;
2891 } 3221 }
2892 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) 3222 ret = ext4_ext_handle_uninitialized_extents(handle,
2893 goto out; 3223 inode, iblock, max_blocks, path,
2894 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3224 flags, allocated, bh_result, newblock);
2895 if (allocated > max_blocks) 3225 return ret;
2896 allocated = max_blocks;
2897 /*
2898 * We have blocks reserved already. We
2899 * return allocated blocks so that delalloc
2900 * won't do block reservation for us. But
2901 * the buffer head will be unmapped so that
2902 * a read from the block returns 0s.
2903 */
2904 set_buffer_unwritten(bh_result);
2905 bh_result->b_bdev = inode->i_sb->s_bdev;
2906 bh_result->b_blocknr = newblock;
2907 goto out2;
2908 }
2909
2910 ret = ext4_ext_convert_to_initialized(handle, inode,
2911 path, iblock,
2912 max_blocks);
2913 if (ret <= 0) {
2914 err = ret;
2915 goto out2;
2916 } else
2917 allocated = ret;
2918 goto outnew;
2919 } 3226 }
2920 } 3227 }
2921 3228
@@ -2986,9 +3293,27 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2986 /* try to insert new extent into found leaf and return */ 3293 /* try to insert new extent into found leaf and return */
2987 ext4_ext_store_pblock(&newex, newblock); 3294 ext4_ext_store_pblock(&newex, newblock);
2988 newex.ee_len = cpu_to_le16(ar.len); 3295 newex.ee_len = cpu_to_le16(ar.len);
2989 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ 3296 /* Mark uninitialized */
3297 if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
2990 ext4_ext_mark_uninitialized(&newex); 3298 ext4_ext_mark_uninitialized(&newex);
2991 err = ext4_ext_insert_extent(handle, inode, path, &newex); 3299 /*
3300		 * An io_end structure is created for every async
3301		 * direct IO write to the middle of the file.
3302		 * To avoid an unnecessary conversion for every aio dio rewrite
3303		 * to the middle of the file, here we flag only the IO that really
3304		 * needs the conversion.
3305		 * For the non-async direct IO case, flag the inode state
3306		 * so that we perform the conversion when the IO is done.
3307 */
3308 if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
3309 if (io)
3310 io->flag = DIO_AIO_UNWRITTEN;
3311 else
3312 EXT4_I(inode)->i_state |=
3313					EXT4_STATE_DIO_UNWRITTEN;
3314 }
3315 }
3316 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2992 if (err) { 3317 if (err) {
2993 /* free data blocks we just allocated */ 3318 /* free data blocks we just allocated */
2994 /* not a good idea to call discard here directly, 3319 /* not a good idea to call discard here directly,
@@ -3002,7 +3327,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3002 /* previous routine could use block we allocated */ 3327 /* previous routine could use block we allocated */
3003 newblock = ext_pblock(&newex); 3328 newblock = ext_pblock(&newex);
3004 allocated = ext4_ext_get_actual_len(&newex); 3329 allocated = ext4_ext_get_actual_len(&newex);
3005outnew:
3006 set_buffer_new(bh_result); 3330 set_buffer_new(bh_result);
3007 3331
3008 /* Cache only when it is _not_ an uninitialized extent */ 3332 /* Cache only when it is _not_ an uninitialized extent */
@@ -3201,6 +3525,64 @@ retry:
3201} 3525}
3202 3526
3203/* 3527/*
3528 * This function converts a range of blocks to written extents.
3529 * The caller passes the start offset and the size;
3530 * all unwritten extents within this range will be converted to
3531 * written extents.
3532 *
3533 * This function is called from the direct IO end_io callback
3534 * function, to convert the fallocated extents after the IO is completed.
3535 * Returns 0 on success.
3536 */
3537int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3538 loff_t len)
3539{
3540 handle_t *handle;
3541 ext4_lblk_t block;
3542 unsigned int max_blocks;
3543 int ret = 0;
3544 int ret2 = 0;
3545 struct buffer_head map_bh;
3546 unsigned int credits, blkbits = inode->i_blkbits;
3547
3548 block = offset >> blkbits;
3549 /*
3550	 * We can't just convert len to max_blocks, because the range can
3551	 * straddle a block boundary: blocksize = 4096, offset = 3072, len = 2048
3552 */
3553 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3554 - block;
3555 /*
3556 * credits to insert 1 extent into extent tree
3557 */
3558 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3559 while (ret >= 0 && ret < max_blocks) {
3560 block = block + ret;
3561 max_blocks = max_blocks - ret;
3562 handle = ext4_journal_start(inode, credits);
3563 if (IS_ERR(handle)) {
3564 ret = PTR_ERR(handle);
3565 break;
3566 }
3567 map_bh.b_state = 0;
3568 ret = ext4_get_blocks(handle, inode, block,
3569 max_blocks, &map_bh,
3570 EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
3571 if (ret <= 0) {
3572 WARN_ON(ret <= 0);
3573 printk(KERN_ERR "%s: ext4_ext_get_blocks "
3574 "returned error inode#%lu, block=%u, "
3575 "max_blocks=%u", __func__,
3576 inode->i_ino, block, max_blocks);
3577 }
3578 ext4_mark_inode_dirty(handle, inode);
3579 ret2 = ext4_journal_stop(handle);
3580		if (ret <= 0 || ret2)
3581 break;
3582 }
3583 return ret > 0 ? ret2 : ret;
3584}
3585/*
3204 * Callback function called for each extent to gather FIEMAP information. 3586 * Callback function called for each extent to gather FIEMAP information.
3205 */ 3587 */
3206static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, 3588static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
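
Here is the worked version of the comment's example inside ext4_convert_unwritten_extents(): with 4096-byte blocks, offset = 3072 and len = 2048 span blocks 0 and 1, so max_blocks must come from the block-aligned end of the range, not from len alone. A standalone check of that arithmetic (EXT4_BLOCK_ALIGN is assumed to round up to a block boundary):

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned blkbits = 12;                 /* 4096-byte blocks */
        long long offset = 3072, len = 2048;
        long long block = offset >> blkbits;   /* first block: 0 */
        long long blocksize = 1LL << blkbits;
        /* round the end offset up to a block boundary, then count blocks */
        long long end = ((offset + len) + blocksize - 1) >> blkbits;
        long long max_blocks = end - block;    /* 2, not len/blocksize = 0 */

        assert(block == 0 && max_blocks == 2);
        printf("convert blocks [%lld, %lld)\n", block, block + max_blocks);
        return 0;
    }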
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 5ca3eca70a1e..9630583cef28 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -81,7 +81,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
81 return generic_file_aio_write(iocb, iov, nr_segs, pos); 81 return generic_file_aio_write(iocb, iov, nr_segs, pos);
82} 82}
83 83
84static struct vm_operations_struct ext4_file_vm_ops = { 84static const struct vm_operations_struct ext4_file_vm_ops = {
85 .fault = filemap_fault, 85 .fault = filemap_fault,
86 .page_mkwrite = ext4_page_mkwrite, 86 .page_mkwrite = ext4_page_mkwrite,
87}; 87};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 07475740b512..2b1531266ee2 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,6 +44,8 @@
44 * 44 *
45 * What we do is just kick off a commit and wait on it. This will snapshot the 45 * What we do is just kick off a commit and wait on it. This will snapshot the
46 * inode to disk. 46 * inode to disk.
47 *
48 * i_mutex lock is held when entering and exiting this function
47 */ 49 */
48 50
49int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
@@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
56 58
57 trace_ext4_sync_file(file, dentry, datasync); 59 trace_ext4_sync_file(file, dentry, datasync);
58 60
61 ret = flush_aio_dio_completed_IO(inode);
62 if (ret < 0)
63 goto out;
59 /* 64 /*
60 * data=writeback: 65 * data=writeback:
61 * The caller's filemap_fdatawrite()/wait will sync the data. 66 * The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 064746fad581..2c8caa51addb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
37#include <linux/namei.h> 37#include <linux/namei.h>
38#include <linux/uio.h> 38#include <linux/uio.h>
39#include <linux/bio.h> 39#include <linux/bio.h>
40#include <linux/workqueue.h>
40 41
41#include "ext4_jbd2.h" 42#include "ext4_jbd2.h"
42#include "xattr.h" 43#include "xattr.h"
@@ -192,7 +193,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
192 * so before we call here everything must be consistently dirtied against 193 * so before we call here everything must be consistently dirtied against
193 * this transaction. 194 * this transaction.
194 */ 195 */
195 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, 196int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
196 int nblocks) 197 int nblocks)
197{ 198{
198 int ret; 199 int ret;
@@ -208,6 +209,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
208 up_write(&EXT4_I(inode)->i_data_sem); 209 up_write(&EXT4_I(inode)->i_data_sem);
209 ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); 210 ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
210 down_write(&EXT4_I(inode)->i_data_sem); 211 down_write(&EXT4_I(inode)->i_data_sem);
212 ext4_discard_preallocations(inode);
211 213
212 return ret; 214 return ret;
213} 215}
@@ -1145,6 +1147,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
1145} 1147}
1146 1148
1147/* 1149/*
1150 * Return the number of contiguous dirty pages in a given inode
1151 * starting at page frame idx.
1152 */
1153static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1154 unsigned int max_pages)
1155{
1156 struct address_space *mapping = inode->i_mapping;
1157 pgoff_t index;
1158 struct pagevec pvec;
1159 pgoff_t num = 0;
1160 int i, nr_pages, done = 0;
1161
1162 if (max_pages == 0)
1163 return 0;
1164 pagevec_init(&pvec, 0);
1165 while (!done) {
1166 index = idx;
1167 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1168 PAGECACHE_TAG_DIRTY,
1169 (pgoff_t)PAGEVEC_SIZE);
1170 if (nr_pages == 0)
1171 break;
1172 for (i = 0; i < nr_pages; i++) {
1173 struct page *page = pvec.pages[i];
1174 struct buffer_head *bh, *head;
1175
1176 lock_page(page);
1177 if (unlikely(page->mapping != mapping) ||
1178 !PageDirty(page) ||
1179 PageWriteback(page) ||
1180 page->index != idx) {
1181 done = 1;
1182 unlock_page(page);
1183 break;
1184 }
1185 if (page_has_buffers(page)) {
1186 bh = head = page_buffers(page);
1187 do {
1188 if (!buffer_delay(bh) &&
1189 !buffer_unwritten(bh))
1190 done = 1;
1191 bh = bh->b_this_page;
1192 } while (!done && (bh != head));
1193 }
1194 unlock_page(page);
1195 if (done)
1196 break;
1197 idx++;
1198 num++;
1199 if (num >= max_pages)
1200 break;
1201 }
1202 pagevec_release(&pvec);
1203 }
1204 return num;
1205}
1206
1207/*
1148 * The ext4_get_blocks() function tries to look up the requested blocks, 1208 * The ext4_get_blocks() function tries to look up the requested blocks,
1149 * and returns if the blocks are already mapped. 1209 * and returns if the blocks are already mapped.
1150 * 1210 *
@@ -1175,6 +1235,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1175 clear_buffer_mapped(bh); 1235 clear_buffer_mapped(bh);
1176 clear_buffer_unwritten(bh); 1236 clear_buffer_unwritten(bh);
1177 1237
1238 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
1239 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1240 (unsigned long)block);
1178 /* 1241 /*
1179 * Try to see if we can get the block without requesting a new 1242 * Try to see if we can get the block without requesting a new
1180 * file system block. 1243 * file system block.
@@ -1796,11 +1859,11 @@ repeat:
1796 1859
1797 if (ext4_claim_free_blocks(sbi, total)) { 1860 if (ext4_claim_free_blocks(sbi, total)) {
1798 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1861 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1862 vfs_dq_release_reservation_block(inode, total);
1799 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1863 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1800 yield(); 1864 yield();
1801 goto repeat; 1865 goto repeat;
1802 } 1866 }
1803 vfs_dq_release_reservation_block(inode, total);
1804 return -ENOSPC; 1867 return -ENOSPC;
1805 } 1868 }
1806 EXT4_I(inode)->i_reserved_data_blocks += nrblocks; 1869 EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -2092,18 +2155,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2092static void ext4_print_free_blocks(struct inode *inode) 2155static void ext4_print_free_blocks(struct inode *inode)
2093{ 2156{
2094 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2157 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2095 printk(KERN_EMERG "Total free blocks count %lld\n", 2158 printk(KERN_CRIT "Total free blocks count %lld\n",
2096 ext4_count_free_blocks(inode->i_sb)); 2159 ext4_count_free_blocks(inode->i_sb));
2097 printk(KERN_EMERG "Free/Dirty block details\n"); 2160 printk(KERN_CRIT "Free/Dirty block details\n");
2098 printk(KERN_EMERG "free_blocks=%lld\n", 2161 printk(KERN_CRIT "free_blocks=%lld\n",
2099 (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); 2162 (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
2100 printk(KERN_EMERG "dirty_blocks=%lld\n", 2163 printk(KERN_CRIT "dirty_blocks=%lld\n",
2101 (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); 2164 (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
2102 printk(KERN_EMERG "Block reservation details\n"); 2165 printk(KERN_CRIT "Block reservation details\n");
2103 printk(KERN_EMERG "i_reserved_data_blocks=%u\n", 2166 printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
2104 EXT4_I(inode)->i_reserved_data_blocks); 2167 EXT4_I(inode)->i_reserved_data_blocks);
2105 printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", 2168 printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
2106 EXT4_I(inode)->i_reserved_meta_blocks); 2169 EXT4_I(inode)->i_reserved_meta_blocks);
2107 return; 2170 return;
2108} 2171}
2109 2172
@@ -2189,14 +2252,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2189 * writepage and writepages will again try to write 2252 * writepage and writepages will again try to write
2190 * the same. 2253 * the same.
2191 */ 2254 */
2192 printk(KERN_EMERG "%s block allocation failed for inode %lu " 2255 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2193 "at logical offset %llu with max blocks " 2256 "delayed block allocation failed for inode %lu at "
2194 "%zd with error %d\n", 2257 "logical offset %llu with max blocks %zd with "
2195 __func__, mpd->inode->i_ino, 2258 "error %d\n", mpd->inode->i_ino,
2196 (unsigned long long)next, 2259 (unsigned long long) next,
2197 mpd->b_size >> mpd->inode->i_blkbits, err); 2260 mpd->b_size >> mpd->inode->i_blkbits, err);
2198 printk(KERN_EMERG "This should not happen.!! " 2261 printk(KERN_CRIT "This should not happen!! "
2199 "Data will be lost\n"); 2262 "Data will be lost\n");
2200 if (err == -ENOSPC) { 2263 if (err == -ENOSPC) {
2201 ext4_print_free_blocks(mpd->inode); 2264 ext4_print_free_blocks(mpd->inode);
2202 } 2265 }
@@ -2743,8 +2806,10 @@ static int ext4_da_writepages(struct address_space *mapping,
2743 int no_nrwrite_index_update; 2806 int no_nrwrite_index_update;
2744 int pages_written = 0; 2807 int pages_written = 0;
2745 long pages_skipped; 2808 long pages_skipped;
2809 unsigned int max_pages;
2746 int range_cyclic, cycled = 1, io_done = 0; 2810 int range_cyclic, cycled = 1, io_done = 0;
2747 int needed_blocks, ret = 0, nr_to_writebump = 0; 2811 int needed_blocks, ret = 0;
2812 long desired_nr_to_write, nr_to_writebump = 0;
2748 loff_t range_start = wbc->range_start; 2813 loff_t range_start = wbc->range_start;
2749 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2814 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2750 2815
@@ -2771,16 +2836,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2771 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2836 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2772 return -EROFS; 2837 return -EROFS;
2773 2838
2774 /*
2775 * Make sure nr_to_write is >= sbi->s_mb_stream_request
2776 * This make sure small files blocks are allocated in
2777 * single attempt. This ensure that small files
2778 * get less fragmented.
2779 */
2780 if (wbc->nr_to_write < sbi->s_mb_stream_request) {
2781 nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
2782 wbc->nr_to_write = sbi->s_mb_stream_request;
2783 }
2784 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2839 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2785 range_whole = 1; 2840 range_whole = 1;
2786 2841
@@ -2795,6 +2850,36 @@ static int ext4_da_writepages(struct address_space *mapping,
2795 } else 2850 } else
2796 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2851 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2797 2852
2853 /*
2854 * This works around two forms of stupidity. The first is in
2855 * the writeback code, which caps the maximum number of pages
2856 * written to be 1024 pages. This is wrong on multiple
2857	 * levels; different architectures have a different page size,
2858 * which changes the maximum amount of data which gets
2859 * written. Secondly, 4 megabytes is way too small. XFS
2860 * forces this value to be 16 megabytes by multiplying
2861 * nr_to_write parameter by four, and then relies on its
2862 * allocator to allocate larger extents to make them
2863 * contiguous. Unfortunately this brings us to the second
2864 * stupidity, which is that ext4's mballoc code only allocates
2865 * at most 2048 blocks. So we force contiguous writes up to
2866 * the number of dirty blocks in the inode, or
2867	 * sbi->s_max_writeback_mb_bump, whichever is smaller.
2868 */
2869 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2870 if (!range_cyclic && range_whole)
2871 desired_nr_to_write = wbc->nr_to_write * 8;
2872 else
2873 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2874 max_pages);
2875 if (desired_nr_to_write > max_pages)
2876 desired_nr_to_write = max_pages;
2877
2878 if (wbc->nr_to_write < desired_nr_to_write) {
2879 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2880 wbc->nr_to_write = desired_nr_to_write;
2881 }
2882
2798 mpd.wbc = wbc; 2883 mpd.wbc = wbc;
2799 mpd.inode = mapping->host; 2884 mpd.inode = mapping->host;
2800 2885
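
For scale on the cap above: max_pages converts megabytes to pages with mb << (20 - PAGE_CACHE_SHIFT). With 4 KiB pages and a hypothetical s_max_writeback_mb_bump of 128 MB, the cap is 32768 pages; a whole-file writeback call arriving with nr_to_write = 1024 is bumped to 8192 (1024 * 8), well under that cap. A quick check of the arithmetic:

    #include <stdio.h>

    int main(void)
    {
        unsigned page_shift = 12;   /* PAGE_CACHE_SHIFT for 4 KiB pages */
        unsigned mb_bump = 128;     /* hypothetical s_max_writeback_mb_bump */
        long nr_to_write = 1024;    /* writeback's usual per-call cap */

        long max_pages = (long)mb_bump << (20 - page_shift);
        long desired = nr_to_write * 8;   /* whole-file, non-cyclic case */
        if (desired > max_pages)
            desired = max_pages;

        printf("max_pages=%ld desired=%ld bump=%ld\n",
               max_pages, desired, desired - nr_to_write);
        return 0;
    }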
@@ -2822,10 +2907,9 @@ retry:
2822 handle = ext4_journal_start(inode, needed_blocks); 2907 handle = ext4_journal_start(inode, needed_blocks);
2823 if (IS_ERR(handle)) { 2908 if (IS_ERR(handle)) {
2824 ret = PTR_ERR(handle); 2909 ret = PTR_ERR(handle);
2825 printk(KERN_CRIT "%s: jbd2_start: " 2910 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2826 "%ld pages, ino %lu; err %d\n", __func__, 2911 "%ld pages, ino %lu; err %d\n", __func__,
2827 wbc->nr_to_write, inode->i_ino, ret); 2912 wbc->nr_to_write, inode->i_ino, ret);
2828 dump_stack();
2829 goto out_writepages; 2913 goto out_writepages;
2830 } 2914 }
2831 2915
@@ -2897,9 +2981,10 @@ retry:
2897 goto retry; 2981 goto retry;
2898 } 2982 }
2899 if (pages_skipped != wbc->pages_skipped) 2983 if (pages_skipped != wbc->pages_skipped)
2900 printk(KERN_EMERG "This should not happen leaving %s " 2984 ext4_msg(inode->i_sb, KERN_CRIT,
2901 "with nr_to_write = %ld ret = %d\n", 2985 "This should not happen leaving %s "
2902 __func__, wbc->nr_to_write, ret); 2986 "with nr_to_write = %ld ret = %d\n",
2987 __func__, wbc->nr_to_write, ret);
2903 2988
2904 /* Update index */ 2989 /* Update index */
2905 index += pages_written; 2990 index += pages_written;
@@ -2914,7 +2999,8 @@ retry:
2914out_writepages: 2999out_writepages:
2915 if (!no_nrwrite_index_update) 3000 if (!no_nrwrite_index_update)
2916 wbc->no_nrwrite_index_update = 0; 3001 wbc->no_nrwrite_index_update = 0;
2917 wbc->nr_to_write -= nr_to_writebump; 3002 if (wbc->nr_to_write > nr_to_writebump)
3003 wbc->nr_to_write -= nr_to_writebump;
2918 wbc->range_start = range_start; 3004 wbc->range_start = range_start;
2919 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3005 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2920 return ret; 3006 return ret;
@@ -3272,6 +3358,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3272} 3358}
3273 3359
3274/* 3360/*
3361 * O_DIRECT for ext3 (or indirect map) based files
3362 *
3275 * If the O_DIRECT write will extend the file then add this inode to the 3363 * If the O_DIRECT write will extend the file then add this inode to the
3276 * orphan list. So recovery will truncate it back to the original size 3364 * orphan list. So recovery will truncate it back to the original size
3277 * if the machine crashes during the write. 3365 * if the machine crashes during the write.
@@ -3280,7 +3368,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3280 * crashes then stale disk data _may_ be exposed inside the file. But current 3368 * crashes then stale disk data _may_ be exposed inside the file. But current
3281 * VFS code falls back into buffered path in that case so we are safe. 3369 * VFS code falls back into buffered path in that case so we are safe.
3282 */ 3370 */
3283static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, 3371static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3284 const struct iovec *iov, loff_t offset, 3372 const struct iovec *iov, loff_t offset,
3285 unsigned long nr_segs) 3373 unsigned long nr_segs)
3286{ 3374{
@@ -3291,6 +3379,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3291 ssize_t ret; 3379 ssize_t ret;
3292 int orphan = 0; 3380 int orphan = 0;
3293 size_t count = iov_length(iov, nr_segs); 3381 size_t count = iov_length(iov, nr_segs);
3382 int retries = 0;
3294 3383
3295 if (rw == WRITE) { 3384 if (rw == WRITE) {
3296 loff_t final_size = offset + count; 3385 loff_t final_size = offset + count;
@@ -3313,9 +3402,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3313 } 3402 }
3314 } 3403 }
3315 3404
3405retry:
3316 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 3406 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
3317 offset, nr_segs, 3407 offset, nr_segs,
3318 ext4_get_block, NULL); 3408 ext4_get_block, NULL);
3409 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3410 goto retry;
3319 3411
3320 if (orphan) { 3412 if (orphan) {
3321 int err; 3413 int err;
@@ -3354,6 +3446,364 @@ out:
3354 return ret; 3446 return ret;
3355} 3447}
3356 3448
3449static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
3450 struct buffer_head *bh_result, int create)
3451{
3452 handle_t *handle = NULL;
3453 int ret = 0;
3454 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3455 int dio_credits;
3456
3457 ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
3458 inode->i_ino, create);
3459 /*
3460	 * The DIO VFS code passes the create = 0 flag for writes to
3461	 * the middle of the file. It does this to avoid block
3462	 * allocation for holes, so as not to expose stale data
3463	 * when there is a parallel buffered read (which does
3464	 * not hold the i_mutex lock) while the direct IO write has
3465	 * not completed. A DIO request on a hole ultimately falls back
3466	 * to buffered IO for this reason.
3467	 *
3468	 * For ext4 extent-based files, since we support fallocate and
3469	 * newly allocated extents are marked uninitialized, we can
3470	 * fallocate blocks for holes; a parallel buffered
3471	 * read will then zero out the page when it reads
3472	 * a hole whose parallel DIO write has not completed.
3473	 *
3474	 * When we come here, we know it's a direct IO write
3475	 * to the middle of the file (< i_size),
3476	 * so it's safe to override the create flag from the VFS.
3477 */
3478 create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
3479
3480 if (max_blocks > DIO_MAX_BLOCKS)
3481 max_blocks = DIO_MAX_BLOCKS;
3482 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3483 handle = ext4_journal_start(inode, dio_credits);
3484 if (IS_ERR(handle)) {
3485 ret = PTR_ERR(handle);
3486 goto out;
3487 }
3488 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3489 create);
3490 if (ret > 0) {
3491 bh_result->b_size = (ret << inode->i_blkbits);
3492 ret = 0;
3493 }
3494 ext4_journal_stop(handle);
3495out:
3496 return ret;
3497}
3498
3499static void ext4_free_io_end(ext4_io_end_t *io)
3500{
3501 BUG_ON(!io);
3502 iput(io->inode);
3503 kfree(io);
3504}
3505static void dump_aio_dio_list(struct inode *inode)
3506{
3507#ifdef EXT4_DEBUG
3508 struct list_head *cur, *before, *after;
3509 ext4_io_end_t *io, *io0, *io1;
3510
3511 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
3512 ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
3513 return;
3514 }
3515
3516 ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
3517 list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
3518 cur = &io->list;
3519 before = cur->prev;
3520 io0 = container_of(before, ext4_io_end_t, list);
3521 after = cur->next;
3522 io1 = container_of(after, ext4_io_end_t, list);
3523
3524 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
3525 io, inode->i_ino, io0, io1);
3526 }
3527#endif
3528}
3529
3530/*
3531 * check a range of space and convert unwritten extents to written.
3532 */
3533static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
3534{
3535 struct inode *inode = io->inode;
3536 loff_t offset = io->offset;
3537 size_t size = io->size;
3538 int ret = 0;
3539
3540	ext4_debug("end_aio_dio_nolock: io 0x%p from inode %lu, list->next 0x%p,"
3541 "list->prev 0x%p\n",
3542 io, inode->i_ino, io->list.next, io->list.prev);
3543
3544 if (list_empty(&io->list))
3545 return ret;
3546
3547 if (io->flag != DIO_AIO_UNWRITTEN)
3548 return ret;
3549
3550 if (offset + size <= i_size_read(inode))
3551 ret = ext4_convert_unwritten_extents(inode, offset, size);
3552
3553 if (ret < 0) {
 3554		printk(KERN_EMERG "%s: failed to convert unwritten "
 3555			"extents to written extents, error is %d, "
 3556			"io is still on inode %lu aio dio list\n",
 3557			__func__, ret, inode->i_ino);
3558 return ret;
3559 }
3560
3561 /* clear the DIO AIO unwritten flag */
3562 io->flag = 0;
3563 return ret;
3564}
3565/*
 3566 * work on completed aio dio IO, to convert unwritten extents to written
3567 */
3568static void ext4_end_aio_dio_work(struct work_struct *work)
3569{
3570 ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
3571 struct inode *inode = io->inode;
3572 int ret = 0;
3573
3574 mutex_lock(&inode->i_mutex);
3575 ret = ext4_end_aio_dio_nolock(io);
3576 if (ret >= 0) {
3577 if (!list_empty(&io->list))
3578 list_del_init(&io->list);
3579 ext4_free_io_end(io);
3580 }
3581 mutex_unlock(&inode->i_mutex);
3582}
3583/*
3584 * This function is called from ext4_sync_file().
3585 *
 3586 * When an AIO DIO IO is completed, the work to convert unwritten
 3587 * extents to written is queued on a workqueue but may not get
 3588 * scheduled immediately. When fsync is called, we need to ensure
 3589 * the conversion is complete before fsync returns.
 3590 * The inode keeps track of a list of completed AIO from the DIO
 3591 * path that might need conversion. This function walks through
 3592 * the list and converts the related unwritten extents to written.
3593 */
3594int flush_aio_dio_completed_IO(struct inode *inode)
3595{
3596 ext4_io_end_t *io;
3597 int ret = 0;
3598 int ret2 = 0;
3599
3600 if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
3601 return ret;
3602
3603 dump_aio_dio_list(inode);
 3604	while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) {
3605 io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
3606 ext4_io_end_t, list);
 3607		/*
 3608		 * Call ext4_end_aio_dio_nolock() to convert the completed
 3609		 * IO to written.
 3610		 *
 3611		 * When ext4_sync_file() is called, the workqueue may
 3612		 * already be about to run the work for this io structure.
 3613		 * It would break if it found that the io structure behind
 3614		 * the work it is about to run had been freed.
 3615		 *
 3616		 * Thus we need to keep the io structure valid here even
 3617		 * after the conversion has finished. The structure carries
 3618		 * a flag to avoid converting twice, from both fsync and
 3619		 * the background workqueue.
 3620		 */
3621 ret = ext4_end_aio_dio_nolock(io);
3622 if (ret < 0)
3623 ret2 = ret;
3624 else
3625 list_del_init(&io->list);
3626 }
3627 return (ret2 < 0) ? ret2 : 0;
3628}
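
For context, the ext4_sync_file() call site this function is written for looks roughly like the following; a hedged sketch, since the fsync code lives in fs/ext4/fsync.c and is not part of this hunk:

int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;
	int ret;

	/* force out any pending unwritten-extent conversions first */
	ret = flush_aio_dio_completed_IO(inode);
	if (ret < 0)
		return ret;
	/* ... journal commit / barrier handling continues as before ... */
	return 0;
}
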
3629
 3630static ext4_io_end_t *ext4_init_io_end(struct inode *inode)
3631{
3632 ext4_io_end_t *io = NULL;
3633
3634 io = kmalloc(sizeof(*io), GFP_NOFS);
3635
3636 if (io) {
3637 igrab(inode);
3638 io->inode = inode;
3639 io->flag = 0;
3640 io->offset = 0;
3641 io->size = 0;
3642 io->error = 0;
3643 INIT_WORK(&io->work, ext4_end_aio_dio_work);
3644 INIT_LIST_HEAD(&io->list);
3645 }
3646
3647 return io;
3648}
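
The ext4_io_end_t type initialized here is declared in ext4.h by another part of this series; a hedged reconstruction from the fields used in this file (member order and the exact type of offset are assumptions):

/* Hedged reconstruction -- the real declaration lives in ext4.h. */
typedef struct ext4_io_end {
	struct list_head	list;	/* per-inode completed-AIO list */
	struct inode		*inode;	/* file the IO is against */
	unsigned int		flag;	/* DIO_AIO_UNWRITTEN or 0 */
	int			error;	/* IO error code */
	loff_t			offset;	/* file offset of the write */
	size_t			size;	/* size of the write */
	struct work_struct	work;	/* unwritten-extent conversion */
} ext4_io_end_t;
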
3649
3650static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3651 ssize_t size, void *private)
3652{
3653 ext4_io_end_t *io_end = iocb->private;
3654 struct workqueue_struct *wq;
3655
 3656	/* if not async direct IO or a dio with a 0-byte write, just return */
3657 if (!io_end || !size)
3658 return;
3659
 3660	ext_debug("ext4_end_io_dio(): io_end 0x%p "
3661 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
3662 iocb->private, io_end->inode->i_ino, iocb, offset,
3663 size);
3664
3665 /* if not aio dio with unwritten extents, just free io and return */
 3666	if (io_end->flag != DIO_AIO_UNWRITTEN) {
3667 ext4_free_io_end(io_end);
3668 iocb->private = NULL;
3669 return;
3670 }
3671
3672 io_end->offset = offset;
3673 io_end->size = size;
3674 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
3675
3676 /* queue the work to convert unwritten extents to written */
3677 queue_work(wq, &io_end->work);
3678
 3679	/* Add the io_end to the per-inode completed aio dio list */
3680 list_add_tail(&io_end->list,
3681 &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
3682 iocb->private = NULL;
3683}
 3684/*
 3685 * For ext4 extent files, ext4 will do direct-io writes to holes,
 3686 * preallocated extents, and writes that extend the file, without
 3687 * needing to fall back to buffered IO.
 3688 *
 3689 * For holes, we fallocate those blocks and mark them as uninitialized.
 3690 * If those blocks were preallocated, we make sure they are split, but
 3691 * still keep the range to write as uninitialized.
 3692 *
 3693 * The unwritten extents will be converted to written when the DIO is
 3694 * completed. For async direct IO, since the IO may still be pending
 3695 * when we return, we set up an end_io callback function, which will
 3696 * do the conversion when the async direct IO completes.
 3697 *
 3698 * If the O_DIRECT write will extend the file then add this inode to the
 3699 * orphan list, so recovery will truncate it back to the original size
 3700 * if the machine crashes during the write.
 3701 *
 3702 */
3703static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3704 const struct iovec *iov, loff_t offset,
3705 unsigned long nr_segs)
3706{
3707 struct file *file = iocb->ki_filp;
3708 struct inode *inode = file->f_mapping->host;
3709 ssize_t ret;
3710 size_t count = iov_length(iov, nr_segs);
3711
3712 loff_t final_size = offset + count;
3713 if (rw == WRITE && final_size <= inode->i_size) {
 3714		/*
 3715		 * We can write directly to holes and to fallocated extents.
 3716		 *
 3717		 * Blocks allocated to fill a hole are marked as uninitialized
 3718		 * to prevent a parallel buffered read from exposing stale
 3719		 * data before the DIO completes the data IO.
 3720		 *
 3721		 * As for previously fallocated extents, ext4 get_block will
 3722		 * simply mark the buffer mapped but still keep the extents
 3723		 * uninitialized.
 3724		 *
 3725		 * In the non-AIO case, we will convert those unwritten extents
 3726		 * to written after returning from blockdev_direct_IO.
 3727		 *
 3728		 * For async DIO, the conversion needs to be deferred until
 3729		 * the IO is completed. The ext4 end_io callback function
 3730		 * will be called to take care of the conversion work.
 3731		 * So in the async case, we allocate an io_end structure
 3732		 * here and hook it to the iocb.
 3733		 */
3734 iocb->private = NULL;
3735 EXT4_I(inode)->cur_aio_dio = NULL;
3736 if (!is_sync_kiocb(iocb)) {
3737 iocb->private = ext4_init_io_end(inode);
3738 if (!iocb->private)
3739 return -ENOMEM;
 3740			/*
 3741			 * We save the io structure for the current async
 3742			 * direct IO, so that later ext4_get_blocks()
 3743			 * can flag the io structure if there are
 3744			 * unwritten extents that need to be converted
 3745			 * when the IO is completed.
 3746			 */
3747 EXT4_I(inode)->cur_aio_dio = iocb->private;
3748 }
3749
3750 ret = blockdev_direct_IO(rw, iocb, inode,
3751 inode->i_sb->s_bdev, iov,
3752 offset, nr_segs,
3753 ext4_get_block_dio_write,
3754 ext4_end_io_dio);
3755 if (iocb->private)
3756 EXT4_I(inode)->cur_aio_dio = NULL;
 3757		/*
 3758		 * The io_end structure takes a reference to the inode;
 3759		 * that structure needs to be destroyed and the
 3760		 * reference to the inode dropped when the IO is
 3761		 * complete, even for a 0-byte write or on failure.
 3762		 *
 3763		 * In the successful AIO DIO case, the io_end structure will
 3764		 * be destroyed and the reference to the inode will be
 3765		 * dropped after the end_io callback function is called.
 3766		 *
 3767		 * In the 0-byte write or error case, since the VFS direct
 3768		 * IO code won't invoke the end_io callback function, we
 3769		 * need to free the io_end structure here.
 3770		 */
3771 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
3772 ext4_free_io_end(iocb->private);
3773 iocb->private = NULL;
3774 } else if (ret > 0 && (EXT4_I(inode)->i_state &
3775 EXT4_STATE_DIO_UNWRITTEN)) {
3776 int err;
 3777			/*
 3778			 * In the non-AIO case, since the IO has already
 3779			 * completed, we can do the conversion right here.
 3780			 */
3781 err = ext4_convert_unwritten_extents(inode,
3782 offset, ret);
3783 if (err < 0)
3784 ret = err;
3785 EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
3786 }
3787 return ret;
3788 }
3789
 3790	/* for the write-to-end-of-file case, we fall back to the old way */
3791 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3792}
3793
3794static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3795 const struct iovec *iov, loff_t offset,
3796 unsigned long nr_segs)
3797{
3798 struct file *file = iocb->ki_filp;
3799 struct inode *inode = file->f_mapping->host;
3800
3801 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
3802 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3803
3804 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3805}
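
This dispatcher is reached through the address_space_operations tables defined elsewhere in inode.c; a minimal hedged sketch of the wiring (the table name here is illustrative, and the other hooks are elided):

static const struct address_space_operations ext4_aops_sketch = {
	/* ... readpage/writepage hooks elided ... */
	.direct_IO	= ext4_direct_IO,	/* the dispatcher above */
};
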
3806
3357/* 3807/*
3358 * Pages can be marked dirty completely asynchronously from ext4's journalling 3808 * Pages can be marked dirty completely asynchronously from ext4's journalling
3359 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do 3809 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -4551,8 +5001,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
4551 */ 5001 */
4552static int ext4_do_update_inode(handle_t *handle, 5002static int ext4_do_update_inode(handle_t *handle,
4553 struct inode *inode, 5003 struct inode *inode,
4554 struct ext4_iloc *iloc, 5004 struct ext4_iloc *iloc)
4555 int do_sync)
4556{ 5005{
4557 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 5006 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4558 struct ext4_inode_info *ei = EXT4_I(inode); 5007 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4653,22 +5102,10 @@ static int ext4_do_update_inode(handle_t *handle,
4653 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 5102 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4654 } 5103 }
4655 5104
4656 /* 5105 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4657 * If we're not using a journal and we were called from 5106 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4658 * ext4_write_inode() to sync the inode (making do_sync true), 5107 if (!err)
4659 * we can just use sync_dirty_buffer() directly to do our dirty 5108 err = rc;
4660 * work. Testing s_journal here is a bit redundant but it's
4661 * worth it to avoid potential future trouble.
4662 */
4663 if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
4664 BUFFER_TRACE(bh, "call sync_dirty_buffer");
4665 sync_dirty_buffer(bh);
4666 } else {
4667 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4668 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4669 if (!err)
4670 err = rc;
4671 }
4672 ei->i_state &= ~EXT4_STATE_NEW; 5109 ei->i_state &= ~EXT4_STATE_NEW;
4673 5110
4674out_brelse: 5111out_brelse:
@@ -4736,8 +5173,16 @@ int ext4_write_inode(struct inode *inode, int wait)
4736 err = ext4_get_inode_loc(inode, &iloc); 5173 err = ext4_get_inode_loc(inode, &iloc);
4737 if (err) 5174 if (err)
4738 return err; 5175 return err;
4739 err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE, 5176 if (wait)
4740 inode, &iloc, wait); 5177 sync_dirty_buffer(iloc.bh);
5178 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5179 ext4_error(inode->i_sb, __func__,
5180 "IO error syncing inode, "
5181 "inode=%lu, block=%llu",
5182 inode->i_ino,
5183 (unsigned long long)iloc.bh->b_blocknr);
5184 err = -EIO;
5185 }
4741 } 5186 }
4742 return err; 5187 return err;
4743} 5188}
@@ -5033,7 +5478,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
5033 get_bh(iloc->bh); 5478 get_bh(iloc->bh);
5034 5479
5035 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5480 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
5036 err = ext4_do_update_inode(handle, inode, iloc, 0); 5481 err = ext4_do_update_inode(handle, inode, iloc);
5037 put_bh(iloc->bh); 5482 put_bh(iloc->bh);
5038 return err; 5483 return err;
5039} 5484}
@@ -5177,27 +5622,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5177 */ 5622 */
5178void ext4_dirty_inode(struct inode *inode) 5623void ext4_dirty_inode(struct inode *inode)
5179{ 5624{
5180 handle_t *current_handle = ext4_journal_current_handle();
5181 handle_t *handle; 5625 handle_t *handle;
5182 5626
5183 if (!ext4_handle_valid(current_handle)) {
5184 ext4_mark_inode_dirty(current_handle, inode);
5185 return;
5186 }
5187
5188 handle = ext4_journal_start(inode, 2); 5627 handle = ext4_journal_start(inode, 2);
5189 if (IS_ERR(handle)) 5628 if (IS_ERR(handle))
5190 goto out; 5629 goto out;
5191 if (current_handle && 5630
5192 current_handle->h_transaction != handle->h_transaction) { 5631 ext4_mark_inode_dirty(handle, inode);
5193 /* This task has a transaction open against a different fs */ 5632
5194 printk(KERN_EMERG "%s: transactions do not match!\n",
5195 __func__);
5196 } else {
5197 jbd_debug(5, "marking dirty. outer handle=%p\n",
5198 current_handle);
5199 ext4_mark_inode_dirty(handle, inode);
5200 }
5201 ext4_journal_stop(handle); 5633 ext4_journal_stop(handle);
5202out: 5634out:
5203 return; 5635 return;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e9c61896d605..bba12824defa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2096,207 +2096,6 @@ out:
2096 return err; 2096 return err;
2097} 2097}
2098 2098
2099#ifdef EXT4_MB_HISTORY
2100struct ext4_mb_proc_session {
2101 struct ext4_mb_history *history;
2102 struct super_block *sb;
2103 int start;
2104 int max;
2105};
2106
2107static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
2108 struct ext4_mb_history *hs,
2109 int first)
2110{
2111 if (hs == s->history + s->max)
2112 hs = s->history;
2113 if (!first && hs == s->history + s->start)
2114 return NULL;
2115 while (hs->orig.fe_len == 0) {
2116 hs++;
2117 if (hs == s->history + s->max)
2118 hs = s->history;
2119 if (hs == s->history + s->start)
2120 return NULL;
2121 }
2122 return hs;
2123}
2124
2125static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
2126{
2127 struct ext4_mb_proc_session *s = seq->private;
2128 struct ext4_mb_history *hs;
2129 int l = *pos;
2130
2131 if (l == 0)
2132 return SEQ_START_TOKEN;
2133 hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2134 if (!hs)
2135 return NULL;
2136 while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
2137 return hs;
2138}
2139
2140static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
2141 loff_t *pos)
2142{
2143 struct ext4_mb_proc_session *s = seq->private;
2144 struct ext4_mb_history *hs = v;
2145
2146 ++*pos;
2147 if (v == SEQ_START_TOKEN)
2148 return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
2149 else
2150 return ext4_mb_history_skip_empty(s, ++hs, 0);
2151}
2152
2153static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2154{
2155 char buf[25], buf2[25], buf3[25], *fmt;
2156 struct ext4_mb_history *hs = v;
2157
2158 if (v == SEQ_START_TOKEN) {
2159 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2160 "%-5s %-2s %-6s %-5s %-5s %-6s\n",
2161 "pid", "inode", "original", "goal", "result", "found",
2162 "grps", "cr", "flags", "merge", "tail", "broken");
2163 return 0;
2164 }
2165
2166 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2167 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2168 "0x%04x %-5s %-5u %-6u\n";
2169 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2170 hs->result.fe_start, hs->result.fe_len,
2171 hs->result.fe_logical);
2172 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2173 hs->orig.fe_start, hs->orig.fe_len,
2174 hs->orig.fe_logical);
2175 sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
2176 hs->goal.fe_start, hs->goal.fe_len,
2177 hs->goal.fe_logical);
2178 seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
2179 hs->found, hs->groups, hs->cr, hs->flags,
2180 hs->merged ? "M" : "", hs->tail,
2181 hs->buddy ? 1 << hs->buddy : 0);
2182 } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
2183 fmt = "%-5u %-8u %-23s %-23s %-23s\n";
2184 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2185 hs->result.fe_start, hs->result.fe_len,
2186 hs->result.fe_logical);
2187 sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
2188 hs->orig.fe_start, hs->orig.fe_len,
2189 hs->orig.fe_logical);
2190 seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
2191 } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
2192 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2193 hs->result.fe_start, hs->result.fe_len);
2194 seq_printf(seq, "%-5u %-8u %-23s discard\n",
2195 hs->pid, hs->ino, buf2);
2196 } else if (hs->op == EXT4_MB_HISTORY_FREE) {
2197 sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
2198 hs->result.fe_start, hs->result.fe_len);
2199 seq_printf(seq, "%-5u %-8u %-23s free\n",
2200 hs->pid, hs->ino, buf2);
2201 }
2202 return 0;
2203}
2204
2205static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
2206{
2207}
2208
2209static const struct seq_operations ext4_mb_seq_history_ops = {
2210 .start = ext4_mb_seq_history_start,
2211 .next = ext4_mb_seq_history_next,
2212 .stop = ext4_mb_seq_history_stop,
2213 .show = ext4_mb_seq_history_show,
2214};
2215
2216static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
2217{
2218 struct super_block *sb = PDE(inode)->data;
2219 struct ext4_sb_info *sbi = EXT4_SB(sb);
2220 struct ext4_mb_proc_session *s;
2221 int rc;
2222 int size;
2223
2224 if (unlikely(sbi->s_mb_history == NULL))
2225 return -ENOMEM;
2226 s = kmalloc(sizeof(*s), GFP_KERNEL);
2227 if (s == NULL)
2228 return -ENOMEM;
2229 s->sb = sb;
2230 size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
2231 s->history = kmalloc(size, GFP_KERNEL);
2232 if (s->history == NULL) {
2233 kfree(s);
2234 return -ENOMEM;
2235 }
2236
2237 spin_lock(&sbi->s_mb_history_lock);
2238 memcpy(s->history, sbi->s_mb_history, size);
2239 s->max = sbi->s_mb_history_max;
2240 s->start = sbi->s_mb_history_cur % s->max;
2241 spin_unlock(&sbi->s_mb_history_lock);
2242
2243 rc = seq_open(file, &ext4_mb_seq_history_ops);
2244 if (rc == 0) {
2245 struct seq_file *m = (struct seq_file *)file->private_data;
2246 m->private = s;
2247 } else {
2248 kfree(s->history);
2249 kfree(s);
2250 }
2251 return rc;
2252
2253}
2254
2255static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
2256{
2257 struct seq_file *seq = (struct seq_file *)file->private_data;
2258 struct ext4_mb_proc_session *s = seq->private;
2259 kfree(s->history);
2260 kfree(s);
2261 return seq_release(inode, file);
2262}
2263
2264static ssize_t ext4_mb_seq_history_write(struct file *file,
2265 const char __user *buffer,
2266 size_t count, loff_t *ppos)
2267{
2268 struct seq_file *seq = (struct seq_file *)file->private_data;
2269 struct ext4_mb_proc_session *s = seq->private;
2270 struct super_block *sb = s->sb;
2271 char str[32];
2272 int value;
2273
2274 if (count >= sizeof(str)) {
2275 printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
2276 "mb_history", (int)sizeof(str));
2277 return -EOVERFLOW;
2278 }
2279
2280 if (copy_from_user(str, buffer, count))
2281 return -EFAULT;
2282
2283 value = simple_strtol(str, NULL, 0);
2284 if (value < 0)
2285 return -ERANGE;
2286 EXT4_SB(sb)->s_mb_history_filter = value;
2287
2288 return count;
2289}
2290
2291static const struct file_operations ext4_mb_seq_history_fops = {
2292 .owner = THIS_MODULE,
2293 .open = ext4_mb_seq_history_open,
2294 .read = seq_read,
2295 .write = ext4_mb_seq_history_write,
2296 .llseek = seq_lseek,
2297 .release = ext4_mb_seq_history_release,
2298};
2299
2300static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2099static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2301{ 2100{
2302 struct super_block *sb = seq->private; 2101 struct super_block *sb = seq->private;
@@ -2396,82 +2195,6 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
2396 .release = seq_release, 2195 .release = seq_release,
2397}; 2196};
2398 2197
2399static void ext4_mb_history_release(struct super_block *sb)
2400{
2401 struct ext4_sb_info *sbi = EXT4_SB(sb);
2402
2403 if (sbi->s_proc != NULL) {
2404 remove_proc_entry("mb_groups", sbi->s_proc);
2405 if (sbi->s_mb_history_max)
2406 remove_proc_entry("mb_history", sbi->s_proc);
2407 }
2408 kfree(sbi->s_mb_history);
2409}
2410
2411static void ext4_mb_history_init(struct super_block *sb)
2412{
2413 struct ext4_sb_info *sbi = EXT4_SB(sb);
2414 int i;
2415
2416 if (sbi->s_proc != NULL) {
2417 if (sbi->s_mb_history_max)
2418 proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
2419 &ext4_mb_seq_history_fops, sb);
2420 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2421 &ext4_mb_seq_groups_fops, sb);
2422 }
2423
2424 sbi->s_mb_history_cur = 0;
2425 spin_lock_init(&sbi->s_mb_history_lock);
2426 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
2427 sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
2428 /* if we can't allocate history, then we simple won't use it */
2429}
2430
2431static noinline_for_stack void
2432ext4_mb_store_history(struct ext4_allocation_context *ac)
2433{
2434 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2435 struct ext4_mb_history h;
2436
2437 if (sbi->s_mb_history == NULL)
2438 return;
2439
2440 if (!(ac->ac_op & sbi->s_mb_history_filter))
2441 return;
2442
2443 h.op = ac->ac_op;
2444 h.pid = current->pid;
2445 h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
2446 h.orig = ac->ac_o_ex;
2447 h.result = ac->ac_b_ex;
2448 h.flags = ac->ac_flags;
2449 h.found = ac->ac_found;
2450 h.groups = ac->ac_groups_scanned;
2451 h.cr = ac->ac_criteria;
2452 h.tail = ac->ac_tail;
2453 h.buddy = ac->ac_buddy;
2454 h.merged = 0;
2455 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
2456 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
2457 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
2458 h.merged = 1;
2459 h.goal = ac->ac_g_ex;
2460 h.result = ac->ac_f_ex;
2461 }
2462
2463 spin_lock(&sbi->s_mb_history_lock);
2464 memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
2465 if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
2466 sbi->s_mb_history_cur = 0;
2467 spin_unlock(&sbi->s_mb_history_lock);
2468}
2469
2470#else
2471#define ext4_mb_history_release(sb)
2472#define ext4_mb_history_init(sb)
2473#endif
2474
2475 2198
2476/* Create and initialize ext4_group_info data for the given group. */ 2199/* Create and initialize ext4_group_info data for the given group. */
2477int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2200int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
@@ -2690,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2690 sbi->s_mb_stats = MB_DEFAULT_STATS; 2413 sbi->s_mb_stats = MB_DEFAULT_STATS;
2691 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2414 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2692 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2415 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2693 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
2694 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2416 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2695 2417
2696 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2418 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -2708,12 +2430,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2708 spin_lock_init(&lg->lg_prealloc_lock); 2430 spin_lock_init(&lg->lg_prealloc_lock);
2709 } 2431 }
2710 2432
2711 ext4_mb_history_init(sb); 2433 if (sbi->s_proc)
2434 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2435 &ext4_mb_seq_groups_fops, sb);
2712 2436
2713 if (sbi->s_journal) 2437 if (sbi->s_journal)
2714 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2438 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2715
2716 printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
2717 return 0; 2439 return 0;
2718} 2440}
2719 2441
@@ -2790,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb)
2790 } 2512 }
2791 2513
2792 free_percpu(sbi->s_locality_groups); 2514 free_percpu(sbi->s_locality_groups);
2793 ext4_mb_history_release(sb); 2515 if (sbi->s_proc)
2516 remove_proc_entry("mb_groups", sbi->s_proc);
2794 2517
2795 return 0; 2518 return 0;
2796} 2519}
@@ -3276,7 +2999,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3276 atomic_inc(&sbi->s_bal_breaks); 2999 atomic_inc(&sbi->s_bal_breaks);
3277 } 3000 }
3278 3001
3279 ext4_mb_store_history(ac); 3002 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3003 trace_ext4_mballoc_alloc(ac);
3004 else
3005 trace_ext4_mballoc_prealloc(ac);
3280} 3006}
3281 3007
3282/* 3008/*
@@ -3776,7 +3502,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3776 if (ac) { 3502 if (ac) {
3777 ac->ac_sb = sb; 3503 ac->ac_sb = sb;
3778 ac->ac_inode = pa->pa_inode; 3504 ac->ac_inode = pa->pa_inode;
3779 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3780 } 3505 }
3781 3506
3782 while (bit < end) { 3507 while (bit < end) {
@@ -3796,7 +3521,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3796 ac->ac_b_ex.fe_start = bit; 3521 ac->ac_b_ex.fe_start = bit;
3797 ac->ac_b_ex.fe_len = next - bit; 3522 ac->ac_b_ex.fe_len = next - bit;
3798 ac->ac_b_ex.fe_logical = 0; 3523 ac->ac_b_ex.fe_logical = 0;
3799 ext4_mb_store_history(ac); 3524 trace_ext4_mballoc_discard(ac);
3800 } 3525 }
3801 3526
3802 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, 3527 trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
@@ -3831,9 +3556,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3831 ext4_group_t group; 3556 ext4_group_t group;
3832 ext4_grpblk_t bit; 3557 ext4_grpblk_t bit;
3833 3558
3834 if (ac)
3835 ac->ac_op = EXT4_MB_HISTORY_DISCARD;
3836
3837 trace_ext4_mb_release_group_pa(ac, pa); 3559 trace_ext4_mb_release_group_pa(ac, pa);
3838 BUG_ON(pa->pa_deleted == 0); 3560 BUG_ON(pa->pa_deleted == 0);
3839 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3561 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
@@ -3848,7 +3570,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3848 ac->ac_b_ex.fe_start = bit; 3570 ac->ac_b_ex.fe_start = bit;
3849 ac->ac_b_ex.fe_len = pa->pa_len; 3571 ac->ac_b_ex.fe_len = pa->pa_len;
3850 ac->ac_b_ex.fe_logical = 0; 3572 ac->ac_b_ex.fe_logical = 0;
3851 ext4_mb_store_history(ac); 3573 trace_ext4_mballoc_discard(ac);
3852 } 3574 }
3853 3575
3854 return 0; 3576 return 0;
@@ -4189,7 +3911,6 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4189 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 3911 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4190 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 3912 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4191 >> bsbits; 3913 >> bsbits;
4192 size = max(size, isize);
4193 3914
4194 if ((size == isize) && 3915 if ((size == isize) &&
4195 !ext4_fs_is_busy(sbi) && 3916 !ext4_fs_is_busy(sbi) &&
@@ -4199,6 +3920,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4199 } 3920 }
4200 3921
4201 /* don't use group allocation for large files */ 3922 /* don't use group allocation for large files */
3923 size = max(size, isize);
4202 if (size >= sbi->s_mb_stream_request) { 3924 if (size >= sbi->s_mb_stream_request) {
4203 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 3925 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4204 return; 3926 return;
@@ -4739,7 +4461,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4739 4461
4740 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4462 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4741 if (ac) { 4463 if (ac) {
4742 ac->ac_op = EXT4_MB_HISTORY_FREE;
4743 ac->ac_inode = inode; 4464 ac->ac_inode = inode;
4744 ac->ac_sb = sb; 4465 ac->ac_sb = sb;
4745 } 4466 }
@@ -4806,7 +4527,7 @@ do_more:
4806 ac->ac_b_ex.fe_group = block_group; 4527 ac->ac_b_ex.fe_group = block_group;
4807 ac->ac_b_ex.fe_start = bit; 4528 ac->ac_b_ex.fe_start = bit;
4808 ac->ac_b_ex.fe_len = count; 4529 ac->ac_b_ex.fe_len = count;
4809 ext4_mb_store_history(ac); 4530 trace_ext4_mballoc_free(ac);
4810 } 4531 }
4811 4532
4812 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4533 err = ext4_mb_load_buddy(sb, block_group, &e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 188d3d709b24..0ca811061bc7 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -52,18 +52,8 @@ extern u8 mb_enable_debug;
52#define mb_debug(n, fmt, a...) 52#define mb_debug(n, fmt, a...)
53#endif 53#endif
54 54
55/*
56 * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
57 * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
58 */
59#define EXT4_MB_HISTORY
60#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ 55#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
61#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ 56#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
62#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
63#define EXT4_MB_HISTORY_FREE 8 /* free */
64
65#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
66 EXT4_MB_HISTORY_PREALLOC)
67 57
68/* 58/*
69 * How long mballoc can look for a best extent (in found extents) 59 * How long mballoc can look for a best extent (in found extents)
@@ -84,7 +74,7 @@ extern u8 mb_enable_debug;
84 * with 'ext4_mb_stats' allocator will collect stats that will be 74 * with 'ext4_mb_stats' allocator will collect stats that will be
85 * shown at umount. The collecting costs though! 75 * shown at umount. The collecting costs though!
86 */ 76 */
87#define MB_DEFAULT_STATS 1 77#define MB_DEFAULT_STATS 0
88 78
89/* 79/*
90 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served 80 * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
@@ -217,22 +207,6 @@ struct ext4_allocation_context {
217#define AC_STATUS_FOUND 2 207#define AC_STATUS_FOUND 2
218#define AC_STATUS_BREAK 3 208#define AC_STATUS_BREAK 3
219 209
220struct ext4_mb_history {
221 struct ext4_free_extent orig; /* orig allocation */
222 struct ext4_free_extent goal; /* goal allocation */
223 struct ext4_free_extent result; /* result allocation */
224 unsigned pid;
225 unsigned ino;
226 __u16 found; /* how many extents have been found */
227 __u16 groups; /* how many groups have been scanned */
228 __u16 tail; /* what tail broke some buddy */
229 __u16 buddy; /* buddy the tail ^^^ broke */
230 __u16 flags;
231 __u8 cr:3; /* which phase the result extent was found at */
232 __u8 op:4;
233 __u8 merged:1;
234};
235
236struct ext4_buddy { 210struct ext4_buddy {
237 struct page *bd_buddy_page; 211 struct page *bd_buddy_page;
238 void *bd_buddy; 212 void *bd_buddy;
@@ -247,13 +221,6 @@ struct ext4_buddy {
247#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 221#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
248#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 222#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
249 223
250#ifndef EXT4_MB_HISTORY
251static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
252{
253 return;
254}
255#endif
256
257#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 224#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
258 225
259static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, 226static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index bf519f239ae6..a93d5b80f3e2 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
75 goto err_out; 75 goto err_out;
76 } 76 }
77 } 77 }
78 retval = ext4_ext_insert_extent(handle, inode, path, &newext); 78 retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
79err_out: 79err_out:
80 if (path) { 80 if (path) {
81 ext4_ext_drop_refs(path); 81 ext4_ext_drop_refs(path);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c07a2915e40b..25b6b1457360 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -322,7 +322,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
322 goto out; 322 goto out;
323 323
324 if (ext4_ext_insert_extent(handle, orig_inode, 324 if (ext4_ext_insert_extent(handle, orig_inode,
325 orig_path, new_ext)) 325 orig_path, new_ext, 0))
326 goto out; 326 goto out;
327 } 327 }
328 328
@@ -333,7 +333,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
333 goto out; 333 goto out;
334 334
335 if (ext4_ext_insert_extent(handle, orig_inode, 335 if (ext4_ext_insert_extent(handle, orig_inode,
336 orig_path, end_ext)) 336 orig_path, end_ext, 0))
337 goto out; 337 goto out;
338 } 338 }
339out: 339out:
@@ -1001,14 +1001,6 @@ mext_check_arguments(struct inode *orig_inode,
1001 return -EINVAL; 1001 return -EINVAL;
1002 } 1002 }
1003 1003
1004 /* orig and donor should be different file */
1005 if (orig_inode->i_ino == donor_inode->i_ino) {
1006 ext4_debug("ext4 move extent: The argument files should not "
1007 "be same file [ino:orig %lu, donor %lu]\n",
1008 orig_inode->i_ino, donor_inode->i_ino);
1009 return -EINVAL;
1010 }
1011
1012 /* Ext4 move extent supports only extent based file */ 1004 /* Ext4 move extent supports only extent based file */
1013 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 1005 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
1014 ext4_debug("ext4 move extent: orig file is not extents " 1006 ext4_debug("ext4 move extent: orig file is not extents "
@@ -1232,6 +1224,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1232 int block_len_in_page; 1224 int block_len_in_page;
1233 int uninit; 1225 int uninit;
1234 1226
 1227	/* orig and donor should be different files */
 1228	if (orig_inode->i_ino == donor_inode->i_ino) {
 1229		ext4_debug("ext4 move extent: The argument files should not "
 1230			"be the same file [ino:orig %lu, donor %lu]\n",
1231 orig_inode->i_ino, donor_inode->i_ino);
1232 return -EINVAL;
1233 }
1234
1235 /* protect orig and donor against a truncate */ 1235 /* protect orig and donor against a truncate */
1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1237 if (ret1 < 0) 1237 if (ret1 < 0)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 42f81d285cd5..6d2c1b897fc7 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,12 +1518,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1518 return retval; 1518 return retval;
1519 1519
1520 if (blocks == 1 && !dx_fallback && 1520 if (blocks == 1 && !dx_fallback &&
1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { 1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
1522 retval = make_indexed_dir(handle, dentry, inode, bh); 1522 return make_indexed_dir(handle, dentry, inode, bh);
1523 if (retval == -ENOSPC)
1524 brelse(bh);
1525 return retval;
1526 }
1527 brelse(bh); 1523 brelse(bh);
1528 } 1524 }
1529 bh = ext4_append(handle, dir, &block, &retval); 1525 bh = ext4_append(handle, dir, &block, &retval);
@@ -1532,10 +1528,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1532 de = (struct ext4_dir_entry_2 *) bh->b_data; 1528 de = (struct ext4_dir_entry_2 *) bh->b_data;
1533 de->inode = 0; 1529 de->inode = 0;
1534 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1535 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1531 return add_dirent_to_buf(handle, dentry, inode, de, bh);
1536 if (retval == -ENOSPC)
1537 brelse(bh);
1538 return retval;
1539} 1532}
1540 1533
1541/* 1534/*
@@ -1664,8 +1657,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1664 if (!de) 1657 if (!de)
1665 goto cleanup; 1658 goto cleanup;
1666 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 1659 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1667 if (err != -ENOSPC) 1660 bh = NULL;
1668 bh = NULL;
1669 goto cleanup; 1661 goto cleanup;
1670 1662
1671journal_error: 1663journal_error:
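
Taken together, these hunks read as an ownership change: add_dirent_to_buf() is presumably now responsible for releasing the buffer head itself when it fails (including -ENOSPC), in a part of the patch not shown here, so callers no longer need the conditional brelse(). A hedged restatement of the implied contract:

/*
 * Hedged contract implied by these hunks (illustrative only):
 *
 *	err = add_dirent_to_buf(handle, dentry, inode, de, bh);
 *
 * On any failure, @bh has already been released by the callee;
 * the caller just propagates err and must not touch bh again.
 */
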
@@ -2076,7 +2068,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
2076 struct ext4_iloc iloc; 2068 struct ext4_iloc iloc;
2077 int err = 0; 2069 int err = 0;
2078 2070
2079 if (!ext4_handle_valid(handle)) 2071 /* ext4_handle_valid() assumes a valid handle_t pointer */
2072 if (handle && !ext4_handle_valid(handle))
2080 return 0; 2073 return 0;
2081 2074
2082 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); 2075 mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index df539ba27779..d4ca92aab514 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -50,13 +50,6 @@
50#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
51#include <trace/events/ext4.h> 51#include <trace/events/ext4.h>
52 52
53static int default_mb_history_length = 1000;
54
55module_param_named(default_mb_history_length, default_mb_history_length,
56 int, 0644);
57MODULE_PARM_DESC(default_mb_history_length,
58 "Default number of entries saved for mb_history");
59
60struct proc_dir_entry *ext4_proc_root; 53struct proc_dir_entry *ext4_proc_root;
61static struct kset *ext4_kset; 54static struct kset *ext4_kset;
62 55
@@ -189,6 +182,36 @@ void ext4_itable_unused_set(struct super_block *sb,
189 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); 182 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
190} 183}
191 184
185
186/* Just increment the non-pointer handle value */
187static handle_t *ext4_get_nojournal(void)
188{
189 handle_t *handle = current->journal_info;
190 unsigned long ref_cnt = (unsigned long)handle;
191
192 BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
193
194 ref_cnt++;
195 handle = (handle_t *)ref_cnt;
196
197 current->journal_info = handle;
198 return handle;
199}
200
201
202/* Decrement the non-pointer handle value */
203static void ext4_put_nojournal(handle_t *handle)
204{
205 unsigned long ref_cnt = (unsigned long)handle;
206
207 BUG_ON(ref_cnt == 0);
208
209 ref_cnt--;
210 handle = (handle_t *)ref_cnt;
211
212 current->journal_info = handle;
213}
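
For reference, the consumer of this counting trick is ext4_handle_valid(); a hedged sketch of how it distinguishes a fake nojournal handle from a real jbd2 handle by magnitude (the real definition lives in ext4_jbd2.h, alongside the EXT4_NOJOURNAL_MAX_REF_COUNT cap asserted above):

static inline int ext4_handle_valid(handle_t *handle)
{
	/* small integer values are nojournal ref counts, not pointers */
	if ((unsigned long)handle <= EXT4_NOJOURNAL_MAX_REF_COUNT)
		return 0;
	return 1;
}
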
214
192/* 215/*
193 * Wrappers for jbd2_journal_start/end. 216 * Wrappers for jbd2_journal_start/end.
194 * 217 *
@@ -215,11 +238,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
215 } 238 }
216 return jbd2_journal_start(journal, nblocks); 239 return jbd2_journal_start(journal, nblocks);
217 } 240 }
218 /* 241 return ext4_get_nojournal();
219 * We're not journaling, return the appropriate indication.
220 */
221 current->journal_info = EXT4_NOJOURNAL_HANDLE;
222 return current->journal_info;
223} 242}
224 243
225/* 244/*
@@ -235,11 +254,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
235 int rc; 254 int rc;
236 255
237 if (!ext4_handle_valid(handle)) { 256 if (!ext4_handle_valid(handle)) {
238 /* 257 ext4_put_nojournal(handle);
239 * Do this here since we don't call jbd2_journal_stop() in
240 * no-journal mode.
241 */
242 current->journal_info = NULL;
243 return 0; 258 return 0;
244 } 259 }
245 sb = handle->h_transaction->t_journal->j_private; 260 sb = handle->h_transaction->t_journal->j_private;
@@ -580,6 +595,9 @@ static void ext4_put_super(struct super_block *sb)
580 struct ext4_super_block *es = sbi->s_es; 595 struct ext4_super_block *es = sbi->s_es;
581 int i, err; 596 int i, err;
582 597
598 flush_workqueue(sbi->dio_unwritten_wq);
599 destroy_workqueue(sbi->dio_unwritten_wq);
600
583 lock_super(sb); 601 lock_super(sb);
584 lock_kernel(); 602 lock_kernel();
585 if (sb->s_dirt) 603 if (sb->s_dirt)
@@ -684,6 +702,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
684 ei->i_allocated_meta_blocks = 0; 702 ei->i_allocated_meta_blocks = 0;
685 ei->i_delalloc_reserved_flag = 0; 703 ei->i_delalloc_reserved_flag = 0;
686 spin_lock_init(&(ei->i_block_reservation_lock)); 704 spin_lock_init(&(ei->i_block_reservation_lock));
705 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
706 ei->cur_aio_dio = NULL;
687 707
688 return &ei->vfs_inode; 708 return &ei->vfs_inode;
689} 709}
@@ -1052,7 +1072,7 @@ enum {
1052 Opt_journal_update, Opt_journal_dev, 1072 Opt_journal_update, Opt_journal_dev,
1053 Opt_journal_checksum, Opt_journal_async_commit, 1073 Opt_journal_checksum, Opt_journal_async_commit,
1054 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1074 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1055 Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, 1075 Opt_data_err_abort, Opt_data_err_ignore,
1056 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1076 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1057 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, 1077 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
1058 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, 1078 Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
@@ -1099,7 +1119,6 @@ static const match_table_t tokens = {
1099 {Opt_data_writeback, "data=writeback"}, 1119 {Opt_data_writeback, "data=writeback"},
1100 {Opt_data_err_abort, "data_err=abort"}, 1120 {Opt_data_err_abort, "data_err=abort"},
1101 {Opt_data_err_ignore, "data_err=ignore"}, 1121 {Opt_data_err_ignore, "data_err=ignore"},
1102 {Opt_mb_history_length, "mb_history_length=%u"},
1103 {Opt_offusrjquota, "usrjquota="}, 1122 {Opt_offusrjquota, "usrjquota="},
1104 {Opt_usrjquota, "usrjquota=%s"}, 1123 {Opt_usrjquota, "usrjquota=%s"},
1105 {Opt_offgrpjquota, "grpjquota="}, 1124 {Opt_offgrpjquota, "grpjquota="},
@@ -1281,9 +1300,11 @@ static int parse_options(char *options, struct super_block *sb,
1281 *journal_devnum = option; 1300 *journal_devnum = option;
1282 break; 1301 break;
1283 case Opt_journal_checksum: 1302 case Opt_journal_checksum:
1284 break; /* Kept for backwards compatibility */ 1303 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1304 break;
1285 case Opt_journal_async_commit: 1305 case Opt_journal_async_commit:
1286 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); 1306 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
1307 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1287 break; 1308 break;
1288 case Opt_noload: 1309 case Opt_noload:
1289 set_opt(sbi->s_mount_opt, NOLOAD); 1310 set_opt(sbi->s_mount_opt, NOLOAD);
@@ -1340,13 +1361,6 @@ static int parse_options(char *options, struct super_block *sb,
1340 case Opt_data_err_ignore: 1361 case Opt_data_err_ignore:
1341 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); 1362 clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
1342 break; 1363 break;
1343 case Opt_mb_history_length:
1344 if (match_int(&args[0], &option))
1345 return 0;
1346 if (option < 0)
1347 return 0;
1348 sbi->s_mb_history_max = option;
1349 break;
1350#ifdef CONFIG_QUOTA 1364#ifdef CONFIG_QUOTA
1351 case Opt_usrjquota: 1365 case Opt_usrjquota:
1352 qtype = USRQUOTA; 1366 qtype = USRQUOTA;
@@ -1646,13 +1660,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1646 EXT4_INODES_PER_GROUP(sb), 1660 EXT4_INODES_PER_GROUP(sb),
1647 sbi->s_mount_opt); 1661 sbi->s_mount_opt);
1648 1662
1649 if (EXT4_SB(sb)->s_journal) {
1650 ext4_msg(sb, KERN_INFO, "%s journal on %s",
1651 EXT4_SB(sb)->s_journal->j_inode ? "internal" :
1652 "external", EXT4_SB(sb)->s_journal->j_devname);
1653 } else {
1654 ext4_msg(sb, KERN_INFO, "no journal");
1655 }
1656 return res; 1663 return res;
1657} 1664}
1658 1665
@@ -2197,6 +2204,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2197EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2204EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2198EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2205EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2199EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2206EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2207EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
2200 2208
2201static struct attribute *ext4_attrs[] = { 2209static struct attribute *ext4_attrs[] = {
2202 ATTR_LIST(delayed_allocation_blocks), 2210 ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2218,7 @@ static struct attribute *ext4_attrs[] = {
2210 ATTR_LIST(mb_order2_req), 2218 ATTR_LIST(mb_order2_req),
2211 ATTR_LIST(mb_stream_req), 2219 ATTR_LIST(mb_stream_req),
2212 ATTR_LIST(mb_group_prealloc), 2220 ATTR_LIST(mb_group_prealloc),
2221 ATTR_LIST(max_writeback_mb_bump),
2213 NULL, 2222 NULL,
2214}; 2223};
2215 2224
@@ -2413,7 +2422,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2413 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; 2422 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
2414 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 2423 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
2415 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 2424 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
2416 sbi->s_mb_history_max = default_mb_history_length;
2417 2425
2418 set_opt(sbi->s_mount_opt, BARRIER); 2426 set_opt(sbi->s_mount_opt, BARRIER);
2419 2427
@@ -2679,6 +2687,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2679 } 2687 }
2680 2688
2681 sbi->s_stripe = ext4_get_stripe_size(sbi); 2689 sbi->s_stripe = ext4_get_stripe_size(sbi);
2690 sbi->s_max_writeback_mb_bump = 128;
2682 2691
2683 /* 2692 /*
2684 * set up enough so that it can read an inode 2693 * set up enough so that it can read an inode
@@ -2752,14 +2761,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2752 goto failed_mount4; 2761 goto failed_mount4;
2753 } 2762 }
2754 2763
2755 jbd2_journal_set_features(sbi->s_journal, 2764 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
2756 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); 2765 jbd2_journal_set_features(sbi->s_journal,
2757 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) 2766 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2758 jbd2_journal_set_features(sbi->s_journal, 0, 0,
2759 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2767 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2760 else 2768 } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
2769 jbd2_journal_set_features(sbi->s_journal,
2770 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2761 jbd2_journal_clear_features(sbi->s_journal, 0, 0, 2771 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
2762 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2772 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2773 } else {
2774 jbd2_journal_clear_features(sbi->s_journal,
2775 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2776 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2777 }
2763 2778
2764 /* We have now updated the journal if required, so we can 2779 /* We have now updated the journal if required, so we can
2765 * validate the data journaling mode. */ 2780 * validate the data journaling mode. */
@@ -2798,6 +2813,12 @@ no_journal:
2798 clear_opt(sbi->s_mount_opt, NOBH); 2813 clear_opt(sbi->s_mount_opt, NOBH);
2799 } 2814 }
2800 } 2815 }
2816 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
2817 if (!EXT4_SB(sb)->dio_unwritten_wq) {
2818 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
2819 goto failed_mount_wq;
2820 }
2821
2801 /* 2822 /*
2802 * The jbd2_journal_load will have done any necessary log recovery, 2823 * The jbd2_journal_load will have done any necessary log recovery,
2803 * so we can safely mount the rest of the filesystem now. 2824 * so we can safely mount the rest of the filesystem now.
@@ -2849,12 +2870,12 @@ no_journal:
2849 "available"); 2870 "available");
2850 } 2871 }
2851 2872
2852 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 2873 if (test_opt(sb, DELALLOC) &&
2874 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
2853 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " 2875 ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
2854 "requested data journaling mode"); 2876 "requested data journaling mode");
2855 clear_opt(sbi->s_mount_opt, DELALLOC); 2877 clear_opt(sbi->s_mount_opt, DELALLOC);
2856 } else if (test_opt(sb, DELALLOC)) 2878 }
2857 ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
2858 2879
2859 err = ext4_setup_system_zone(sb); 2880 err = ext4_setup_system_zone(sb);
2860 if (err) { 2881 if (err) {
@@ -2910,6 +2931,8 @@ cantfind_ext4:
2910 2931
2911failed_mount4: 2932failed_mount4:
2912 ext4_msg(sb, KERN_ERR, "mount failed"); 2933 ext4_msg(sb, KERN_ERR, "mount failed");
2934 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
2935failed_mount_wq:
2913 ext4_release_system_zone(sb); 2936 ext4_release_system_zone(sb);
2914 if (sbi->s_journal) { 2937 if (sbi->s_journal) {
2915 jbd2_journal_destroy(sbi->s_journal); 2938 jbd2_journal_destroy(sbi->s_journal);
@@ -3164,9 +3187,7 @@ static int ext4_load_journal(struct super_block *sb,
3164 return -EINVAL; 3187 return -EINVAL;
3165 } 3188 }
3166 3189
3167 if (journal->j_flags & JBD2_BARRIER) 3190 if (!(journal->j_flags & JBD2_BARRIER))
3168 ext4_msg(sb, KERN_INFO, "barriers enabled");
3169 else
3170 ext4_msg(sb, KERN_INFO, "barriers disabled"); 3191 ext4_msg(sb, KERN_INFO, "barriers disabled");
3171 3192
3172 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { 3193 if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
@@ -3361,11 +3382,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
3361{ 3382{
3362 int ret = 0; 3383 int ret = 0;
3363 tid_t target; 3384 tid_t target;
3385 struct ext4_sb_info *sbi = EXT4_SB(sb);
3364 3386
3365 trace_ext4_sync_fs(sb, wait); 3387 trace_ext4_sync_fs(sb, wait);
3366 if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { 3388 flush_workqueue(sbi->dio_unwritten_wq);
3389 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
3367 if (wait) 3390 if (wait)
3368 jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); 3391 jbd2_log_wait_commit(sbi->s_journal, target);
3369 } 3392 }
3370 return ret; 3393 return ret;
3371} 3394}
@@ -3951,27 +3974,6 @@ static struct file_system_type ext4_fs_type = {
3951 .fs_flags = FS_REQUIRES_DEV, 3974 .fs_flags = FS_REQUIRES_DEV,
3952}; 3975};
3953 3976
3954#ifdef CONFIG_EXT4DEV_COMPAT
3955static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
3956 const char *dev_name, void *data,struct vfsmount *mnt)
3957{
3958 printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
3959 "to mount using ext4\n", dev_name);
3960 printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
3961 "will go away by 2.6.31\n", dev_name);
3962 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3963}
3964
3965static struct file_system_type ext4dev_fs_type = {
3966 .owner = THIS_MODULE,
3967 .name = "ext4dev",
3968 .get_sb = ext4dev_get_sb,
3969 .kill_sb = kill_block_super,
3970 .fs_flags = FS_REQUIRES_DEV,
3971};
3972MODULE_ALIAS("ext4dev");
3973#endif
3974
3975static int __init init_ext4_fs(void) 3977static int __init init_ext4_fs(void)
3976{ 3978{
3977 int err; 3979 int err;
@@ -3996,13 +3998,6 @@ static int __init init_ext4_fs(void)
3996 err = register_filesystem(&ext4_fs_type); 3998 err = register_filesystem(&ext4_fs_type);
3997 if (err) 3999 if (err)
3998 goto out; 4000 goto out;
3999#ifdef CONFIG_EXT4DEV_COMPAT
4000 err = register_filesystem(&ext4dev_fs_type);
4001 if (err) {
4002 unregister_filesystem(&ext4_fs_type);
4003 goto out;
4004 }
4005#endif
4006 return 0; 4001 return 0;
4007out: 4002out:
4008 destroy_inodecache(); 4003 destroy_inodecache();
@@ -4021,9 +4016,6 @@ out4:
4021static void __exit exit_ext4_fs(void) 4016static void __exit exit_ext4_fs(void)
4022{ 4017{
4023 unregister_filesystem(&ext4_fs_type); 4018 unregister_filesystem(&ext4_fs_type);
4024#ifdef CONFIG_EXT4DEV_COMPAT
4025 unregister_filesystem(&ext4dev_fs_type);
4026#endif
4027 destroy_inodecache(); 4019 destroy_inodecache();
4028 exit_ext4_xattr(); 4020 exit_ext4_xattr();
4029 exit_ext4_mballoc(); 4021 exit_ext4_mballoc();
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index adb0e72a176d..7db0979c6b72 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -323,7 +323,7 @@ extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
323/* fat/misc.c */ 323/* fat/misc.c */
324extern void fat_fs_error(struct super_block *s, const char *fmt, ...) 324extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
325 __attribute__ ((format (printf, 2, 3))) __cold; 325 __attribute__ ((format (printf, 2, 3))) __cold;
326extern void fat_clusters_flush(struct super_block *sb); 326extern int fat_clusters_flush(struct super_block *sb);
327extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 327extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
328extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 328extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
329 __le16 __time, __le16 __date, u8 time_cs); 329 __le16 __time, __le16 __date, u8 time_cs);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 04629d1302fc..76b7961ab663 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -451,12 +451,16 @@ static void fat_write_super(struct super_block *sb)
451 451
452static int fat_sync_fs(struct super_block *sb, int wait) 452static int fat_sync_fs(struct super_block *sb, int wait)
453{ 453{
454 lock_super(sb); 454 int err = 0;
455 fat_clusters_flush(sb);
456 sb->s_dirt = 0;
457 unlock_super(sb);
458 455
459 return 0; 456 if (sb->s_dirt) {
457 lock_super(sb);
458 sb->s_dirt = 0;
459 err = fat_clusters_flush(sb);
460 unlock_super(sb);
461 }
462
463 return err;
460} 464}
461 465
462static void fat_put_super(struct super_block *sb) 466static void fat_put_super(struct super_block *sb)
@@ -812,7 +816,7 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
812 seq_puts(m, ",shortname=mixed"); 816 seq_puts(m, ",shortname=mixed");
813 break; 817 break;
814 case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95: 818 case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95:
815 /* seq_puts(m, ",shortname=lower"); */ 819 seq_puts(m, ",shortname=lower");
816 break; 820 break;
817 default: 821 default:
818 seq_puts(m, ",shortname=unknown"); 822 seq_puts(m, ",shortname=unknown");
@@ -963,7 +967,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
963 opts->codepage = fat_default_codepage; 967 opts->codepage = fat_default_codepage;
964 opts->iocharset = fat_default_iocharset; 968 opts->iocharset = fat_default_iocharset;
965 if (is_vfat) { 969 if (is_vfat) {
966 opts->shortname = VFAT_SFN_DISPLAY_LOWER|VFAT_SFN_CREATE_WIN95; 970 opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95;
967 opts->rodir = 0; 971 opts->rodir = 0;
968 } else { 972 } else {
969 opts->shortname = 0; 973 opts->shortname = 0;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 4e35be873e09..0f55f5cb732f 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -43,19 +43,19 @@ EXPORT_SYMBOL_GPL(fat_fs_error);
43 43
44/* Flushes the number of free clusters on FAT32 */ 44/* Flushes the number of free clusters on FAT32 */
45/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 45/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
46void fat_clusters_flush(struct super_block *sb) 46int fat_clusters_flush(struct super_block *sb)
47{ 47{
48 struct msdos_sb_info *sbi = MSDOS_SB(sb); 48 struct msdos_sb_info *sbi = MSDOS_SB(sb);
49 struct buffer_head *bh; 49 struct buffer_head *bh;
50 struct fat_boot_fsinfo *fsinfo; 50 struct fat_boot_fsinfo *fsinfo;
51 51
52 if (sbi->fat_bits != 32) 52 if (sbi->fat_bits != 32)
53 return; 53 return 0;
54 54
55 bh = sb_bread(sb, sbi->fsinfo_sector); 55 bh = sb_bread(sb, sbi->fsinfo_sector);
56 if (bh == NULL) { 56 if (bh == NULL) {
57 printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n"); 57 printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n");
58 return; 58 return -EIO;
59 } 59 }
60 60
61 fsinfo = (struct fat_boot_fsinfo *)bh->b_data; 61 fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
@@ -74,6 +74,8 @@ void fat_clusters_flush(struct super_block *sb)
74 mark_buffer_dirty(bh); 74 mark_buffer_dirty(bh);
75 } 75 }
76 brelse(bh); 76 brelse(bh);
77
78 return 0;
77} 79}
78 80
79/* 81/*
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index cb6e83557112..f565f24019b5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -499,17 +499,10 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
499 int charlen; 499 int charlen;
500 500
501 if (utf8) { 501 if (utf8) {
502 int name_len = strlen(name); 502 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
503 503 if (*outlen < 0)
504 *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname); 504 return *outlen;
505 505 else if (*outlen > 255)
506 /*
507 * We stripped '.'s before and set len appropriately,
508 * but utf8s_to_utf16s doesn't care about len
509 */
510 *outlen -= (name_len - len);
511
512 if (*outlen > 255)
513 return -ENAMETOOLONG; 506 return -ENAMETOOLONG;
514 507
515 op = &outname[*outlen * sizeof(wchar_t)]; 508 op = &outname[*outlen * sizeof(wchar_t)];
diff --git a/fs/fcntl.c b/fs/fcntl.c
index fc089f2f7f56..2cf93ec40a67 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -284,7 +284,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
284 type = PIDTYPE_PID; 284 type = PIDTYPE_PID;
285 break; 285 break;
286 286
287 case F_OWNER_GID: 287 case F_OWNER_PGRP:
288 type = PIDTYPE_PGID; 288 type = PIDTYPE_PGID;
289 break; 289 break;
290 290
@@ -321,7 +321,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
321 break; 321 break;
322 322
323 case PIDTYPE_PGID: 323 case PIDTYPE_PGID:
324 owner.type = F_OWNER_GID; 324 owner.type = F_OWNER_PGRP;
325 break; 325 break;
326 326
327 default: 327 default:
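
[Annotation, not part of the patch] For reference, the interface these two hunks correct is driven from userspace as below; a hedged, self-contained example (glibc exposes the F_*OWN_EX constants behind _GNU_SOURCE):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		struct f_owner_ex owner = {
			.type = F_OWNER_PGRP,	/* the constant this patch renames from F_OWNER_GID */
			.pid  = getpgrp(),	/* deliver SIGIO/SIGURG to our process group */
		};

		if (fcntl(STDIN_FILENO, F_SETOWN_EX, &owner) == -1)
			perror("F_SETOWN_EX");
		return 0;
	}
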
diff --git a/fs/file.c b/fs/file.c
index f313314f996f..87e129030ab1 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -10,6 +10,7 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/sched.h>
13#include <linux/slab.h> 14#include <linux/slab.h>
14#include <linux/vmalloc.h> 15#include <linux/vmalloc.h>
15#include <linux/file.h> 16#include <linux/file.h>
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8e1e5e19d21e..9d5360c4c2af 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -41,8 +41,9 @@ struct wb_writeback_args {
41 long nr_pages; 41 long nr_pages;
42 struct super_block *sb; 42 struct super_block *sb;
43 enum writeback_sync_modes sync_mode; 43 enum writeback_sync_modes sync_mode;
44 int for_kupdate; 44 int for_kupdate:1;
45 int range_cyclic; 45 int range_cyclic:1;
46 int for_background:1;
46}; 47};
47 48
48/* 49/*
@@ -249,14 +250,25 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
249 * completion. Caller need not hold sb s_umount semaphore. 250 * completion. Caller need not hold sb s_umount semaphore.
250 * 251 *
251 */ 252 */
252void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) 253void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
254 long nr_pages)
253{ 255{
254 struct wb_writeback_args args = { 256 struct wb_writeback_args args = {
257 .sb = sb,
255 .sync_mode = WB_SYNC_NONE, 258 .sync_mode = WB_SYNC_NONE,
256 .nr_pages = nr_pages, 259 .nr_pages = nr_pages,
257 .range_cyclic = 1, 260 .range_cyclic = 1,
258 }; 261 };
259 262
263 /*
264 * We treat @nr_pages=0 as the special case to do background writeback,
265 * ie. to sync pages until the background dirty threshold is reached.
266 */
267 if (!nr_pages) {
268 args.nr_pages = LONG_MAX;
269 args.for_background = 1;
270 }
271
260 bdi_alloc_queue_work(bdi, &args); 272 bdi_alloc_queue_work(bdi, &args);
261} 273}
262 274
@@ -310,7 +322,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
310 * For inodes being constantly redirtied, dirtied_when can get stuck. 322 * For inodes being constantly redirtied, dirtied_when can get stuck.
311 * It _appears_ to be in the future, but is actually in distant past. 323 * It _appears_ to be in the future, but is actually in distant past.
312 * This test is necessary to prevent such wrapped-around relative times 324 * This test is necessary to prevent such wrapped-around relative times
313 * from permanently stopping the whole pdflush writeback. 325 * from permanently stopping the whole bdi writeback.
314 */ 326 */
315 ret = ret && time_before_eq(inode->dirtied_when, jiffies); 327 ret = ret && time_before_eq(inode->dirtied_when, jiffies);
316#endif 328#endif
@@ -324,13 +336,38 @@ static void move_expired_inodes(struct list_head *delaying_queue,
324 struct list_head *dispatch_queue, 336 struct list_head *dispatch_queue,
325 unsigned long *older_than_this) 337 unsigned long *older_than_this)
326{ 338{
339 LIST_HEAD(tmp);
340 struct list_head *pos, *node;
341 struct super_block *sb = NULL;
342 struct inode *inode;
343 int do_sb_sort = 0;
344
327 while (!list_empty(delaying_queue)) { 345 while (!list_empty(delaying_queue)) {
328 struct inode *inode = list_entry(delaying_queue->prev, 346 inode = list_entry(delaying_queue->prev, struct inode, i_list);
329 struct inode, i_list);
330 if (older_than_this && 347 if (older_than_this &&
331 inode_dirtied_after(inode, *older_than_this)) 348 inode_dirtied_after(inode, *older_than_this))
332 break; 349 break;
333 list_move(&inode->i_list, dispatch_queue); 350 if (sb && sb != inode->i_sb)
351 do_sb_sort = 1;
352 sb = inode->i_sb;
353 list_move(&inode->i_list, &tmp);
354 }
355
356 /* just one sb in list, splice to dispatch_queue and we're done */
357 if (!do_sb_sort) {
358 list_splice(&tmp, dispatch_queue);
359 return;
360 }
361
362 /* Move inodes from one superblock together */
363 while (!list_empty(&tmp)) {
364 inode = list_entry(tmp.prev, struct inode, i_list);
365 sb = inode->i_sb;
366 list_for_each_prev_safe(pos, node, &tmp) {
367 inode = list_entry(pos, struct inode, i_list);
368 if (inode->i_sb == sb)
369 list_move(&inode->i_list, dispatch_queue);
370 }
334 } 371 }
335} 372}
336 373
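
[Annotation, not part of the patch] The rewritten move_expired_inodes() works in two passes: the first drains expired inodes onto a temporary list and notes whether more than one superblock is represented; the second repeatedly takes the superblock at the tail of that list and splices all of its inodes onto dispatch_queue in one sweep. Inodes of the same superblock therefore come out adjacent, which lets the writeback loop below pin each superblock once per batch instead of once per inode, and the common single-sb case stays a single list_splice().
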
@@ -439,8 +476,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
439 spin_lock(&inode_lock); 476 spin_lock(&inode_lock);
440 inode->i_state &= ~I_SYNC; 477 inode->i_state &= ~I_SYNC;
441 if (!(inode->i_state & (I_FREEING | I_CLEAR))) { 478 if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
442 if (!(inode->i_state & I_DIRTY) && 479 if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
443 mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 480 /*
481 * More pages get dirtied by a fast dirtier.
482 */
483 goto select_queue;
484 } else if (inode->i_state & I_DIRTY) {
485 /*
486 * At least XFS will redirty the inode during the
487 * writeback (delalloc) and on io completion (isize).
488 */
489 redirty_tail(inode);
490 } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
444 /* 491 /*
445 * We didn't write back all the pages. nfs_writepages() 492 * We didn't write back all the pages. nfs_writepages()
446 * sometimes bales out without doing anything. Redirty 493 * sometimes bales out without doing anything. Redirty
@@ -462,6 +509,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
462 * soon as the queue becomes uncongested. 509 * soon as the queue becomes uncongested.
463 */ 510 */
464 inode->i_state |= I_DIRTY_PAGES; 511 inode->i_state |= I_DIRTY_PAGES;
512select_queue:
465 if (wbc->nr_to_write <= 0) { 513 if (wbc->nr_to_write <= 0) {
466 /* 514 /*
467 * slice used up: queue for next turn 515 * slice used up: queue for next turn
@@ -484,12 +532,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
484 inode->i_state |= I_DIRTY_PAGES; 532 inode->i_state |= I_DIRTY_PAGES;
485 redirty_tail(inode); 533 redirty_tail(inode);
486 } 534 }
487 } else if (inode->i_state & I_DIRTY) {
488 /*
489 * Someone redirtied the inode while were writing back
490 * the pages.
491 */
492 redirty_tail(inode);
493 } else if (atomic_read(&inode->i_count)) { 535 } else if (atomic_read(&inode->i_count)) {
494 /* 536 /*
495 * The inode is clean, inuse 537 * The inode is clean, inuse
@@ -506,6 +548,17 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
506 return ret; 548 return ret;
507} 549}
508 550
551static void unpin_sb_for_writeback(struct super_block **psb)
552{
553 struct super_block *sb = *psb;
554
555 if (sb) {
556 up_read(&sb->s_umount);
557 put_super(sb);
558 *psb = NULL;
559 }
560}
561
509/* 562/*
510 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned 563 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
511 * before calling writeback. So make sure that we do pin it, so it doesn't 564 * before calling writeback. So make sure that we do pin it, so it doesn't
@@ -515,11 +568,20 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
515 * 1 if we failed. 568 * 1 if we failed.
516 */ 569 */
517static int pin_sb_for_writeback(struct writeback_control *wbc, 570static int pin_sb_for_writeback(struct writeback_control *wbc,
518 struct inode *inode) 571 struct inode *inode, struct super_block **psb)
519{ 572{
520 struct super_block *sb = inode->i_sb; 573 struct super_block *sb = inode->i_sb;
521 574
522 /* 575 /*
576 * If this sb is already pinned, nothing more to do. If not and
577 * *psb is non-NULL, unpin the old one first
578 */
579 if (sb == *psb)
580 return 0;
581 else if (*psb)
582 unpin_sb_for_writeback(psb);
583
584 /*
523 * Caller must already hold the ref for this 585 * Caller must already hold the ref for this
524 */ 586 */
525 if (wbc->sync_mode == WB_SYNC_ALL) { 587 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -532,7 +594,7 @@ static int pin_sb_for_writeback(struct writeback_control *wbc,
532 if (down_read_trylock(&sb->s_umount)) { 594 if (down_read_trylock(&sb->s_umount)) {
533 if (sb->s_root) { 595 if (sb->s_root) {
534 spin_unlock(&sb_lock); 596 spin_unlock(&sb_lock);
535 return 0; 597 goto pinned;
536 } 598 }
537 /* 599 /*
538 * umounted, drop rwsem again and fall through to failure 600 * umounted, drop rwsem again and fall through to failure
@@ -543,24 +605,15 @@ static int pin_sb_for_writeback(struct writeback_control *wbc,
543 sb->s_count--; 605 sb->s_count--;
544 spin_unlock(&sb_lock); 606 spin_unlock(&sb_lock);
545 return 1; 607 return 1;
546} 608pinned:
547 609 *psb = sb;
548static void unpin_sb_for_writeback(struct writeback_control *wbc, 610 return 0;
549 struct inode *inode)
550{
551 struct super_block *sb = inode->i_sb;
552
553 if (wbc->sync_mode == WB_SYNC_ALL)
554 return;
555
556 up_read(&sb->s_umount);
557 put_super(sb);
558} 611}
559 612
560static void writeback_inodes_wb(struct bdi_writeback *wb, 613static void writeback_inodes_wb(struct bdi_writeback *wb,
561 struct writeback_control *wbc) 614 struct writeback_control *wbc)
562{ 615{
563 struct super_block *sb = wbc->sb; 616 struct super_block *sb = wbc->sb, *pin_sb = NULL;
564 const int is_blkdev_sb = sb_is_blkdev_sb(sb); 617 const int is_blkdev_sb = sb_is_blkdev_sb(sb);
565 const unsigned long start = jiffies; /* livelock avoidance */ 618 const unsigned long start = jiffies; /* livelock avoidance */
566 619
@@ -619,7 +672,7 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
619 if (inode_dirtied_after(inode, start)) 672 if (inode_dirtied_after(inode, start))
620 break; 673 break;
621 674
622 if (pin_sb_for_writeback(wbc, inode)) { 675 if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
623 requeue_io(inode); 676 requeue_io(inode);
624 continue; 677 continue;
625 } 678 }
@@ -628,7 +681,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
628 __iget(inode); 681 __iget(inode);
629 pages_skipped = wbc->pages_skipped; 682 pages_skipped = wbc->pages_skipped;
630 writeback_single_inode(inode, wbc); 683 writeback_single_inode(inode, wbc);
631 unpin_sb_for_writeback(wbc, inode);
632 if (wbc->pages_skipped != pages_skipped) { 684 if (wbc->pages_skipped != pages_skipped) {
633 /* 685 /*
634 * writeback is not making progress due to locked 686 * writeback is not making progress due to locked
@@ -648,6 +700,8 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
648 wbc->more_io = 1; 700 wbc->more_io = 1;
649 } 701 }
650 702
703 unpin_sb_for_writeback(&pin_sb);
704
651 spin_unlock(&inode_lock); 705 spin_unlock(&inode_lock);
652 /* Leave any unwritten inodes on b_io */ 706 /* Leave any unwritten inodes on b_io */
653} 707}
@@ -706,6 +760,7 @@ static long wb_writeback(struct bdi_writeback *wb,
706 }; 760 };
707 unsigned long oldest_jif; 761 unsigned long oldest_jif;
708 long wrote = 0; 762 long wrote = 0;
763 struct inode *inode;
709 764
710 if (wbc.for_kupdate) { 765 if (wbc.for_kupdate) {
711 wbc.older_than_this = &oldest_jif; 766 wbc.older_than_this = &oldest_jif;
@@ -719,20 +774,16 @@ static long wb_writeback(struct bdi_writeback *wb,
719 774
720 for (;;) { 775 for (;;) {
721 /* 776 /*
722 * Don't flush anything for non-integrity writeback where 777 * Stop writeback when nr_pages has been consumed
723 * no nr_pages was given
724 */ 778 */
725 if (!args->for_kupdate && args->nr_pages <= 0 && 779 if (args->nr_pages <= 0)
726 args->sync_mode == WB_SYNC_NONE)
727 break; 780 break;
728 781
729 /* 782 /*
730 * If no specific pages were given and this is just a 783 * For background writeout, stop when we are below the
731 * periodic background writeout and we are below the 784 * background dirty threshold
732 * background dirty threshold, don't do anything
733 */ 785 */
734 if (args->for_kupdate && args->nr_pages <= 0 && 786 if (args->for_background && !over_bground_thresh())
735 !over_bground_thresh())
736 break; 787 break;
737 788
738 wbc.more_io = 0; 789 wbc.more_io = 0;
@@ -744,13 +795,32 @@ static long wb_writeback(struct bdi_writeback *wb,
744 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; 795 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
745 796
746 /* 797 /*
747 * If we ran out of stuff to write, bail unless more_io got set 798 * If we consumed everything, see if we have more
748 */ 799 */
749 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { 800 if (wbc.nr_to_write <= 0)
750 if (wbc.more_io && !wbc.for_kupdate) 801 continue;
751 continue; 802 /*
803 * Didn't write everything and we don't have more IO, bail
804 */
805 if (!wbc.more_io)
752 break; 806 break;
807 /*
808 * Did we write something? Try for more
809 */
810 if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
811 continue;
812 /*
813 * Nothing written. Wait for some inode to
814 * become available for writeback. Otherwise
815 * we'll just busyloop.
816 */
817 spin_lock(&inode_lock);
818 if (!list_empty(&wb->b_more_io)) {
819 inode = list_entry(wb->b_more_io.prev,
820 struct inode, i_list);
821 inode_wait_for_writeback(inode);
753 } 822 }
823 spin_unlock(&inode_lock);
754 } 824 }
755 825
756 return wrote; 826 return wrote;
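
[Annotation, not part of the patch] Condensed, the control flow of the rewritten loop is four rules around each writeback pass; same identifiers as the hunk, shown as a fragment rather than compilable code:

	for (;;) {
		if (args->nr_pages <= 0)
			break;				/* budget consumed */
		if (args->for_background && !over_bground_thresh())
			break;				/* background goal reached */
		/* ... one writeback_inodes_wb() pass ... */
		if (wbc.nr_to_write <= 0)
			continue;			/* wrote a full slice, go again */
		if (!wbc.more_io)
			break;				/* nothing left to write */
		if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
			continue;			/* partial progress, retry */
		/* no progress at all: sleep on the oldest b_more_io inode
		 * via inode_wait_for_writeback() instead of busylooping */
	}
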
@@ -1060,9 +1130,6 @@ EXPORT_SYMBOL(__mark_inode_dirty);
1060 * If older_than_this is non-NULL, then only write out inodes which 1130 * If older_than_this is non-NULL, then only write out inodes which
1061 * had their first dirtying at a time earlier than *older_than_this. 1131 * had their first dirtying at a time earlier than *older_than_this.
1062 * 1132 *
1063 * If we're a pdlfush thread, then implement pdflush collision avoidance
1064 * against the entire list.
1065 *
1066 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 1133 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
1067 * This function assumes that the blockdev superblock's inodes are backed by 1134 * This function assumes that the blockdev superblock's inodes are backed by
1068 * a variety of queues, so all inodes are searched. For other superblocks, 1135 * a variety of queues, so all inodes are searched. For other superblocks,
@@ -1141,7 +1208,7 @@ void writeback_inodes_sb(struct super_block *sb)
1141 nr_to_write = nr_dirty + nr_unstable + 1208 nr_to_write = nr_dirty + nr_unstable +
1142 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1209 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1143 1210
1144 bdi_writeback_all(sb, nr_to_write); 1211 bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
1145} 1212}
1146EXPORT_SYMBOL(writeback_inodes_sb); 1213EXPORT_SYMBOL(writeback_inodes_sb);
1147 1214
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 992f6c9410bb..8ada78aade58 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -712,8 +712,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
712 fuse_invalidate_attr(newdir); 712 fuse_invalidate_attr(newdir);
713 713
714 /* newent will end up negative */ 714 /* newent will end up negative */
715 if (newent->d_inode) 715 if (newent->d_inode) {
716 fuse_invalidate_attr(newent->d_inode);
716 fuse_invalidate_entry_cache(newent); 717 fuse_invalidate_entry_cache(newent);
718 }
717 } else if (err == -EINTR) { 719 } else if (err == -EINTR) {
718 /* If request was interrupted, DEITY only knows if the 720 /* If request was interrupted, DEITY only knows if the
719 rename actually took place. If the invalidation 721 rename actually took place. If the invalidation
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index cbc464043b6f..c18913a777ae 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1063,7 +1063,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
1063 break; 1063 break;
1064 } 1064 }
1065 } 1065 }
1066 fuse_put_request(fc, req); 1066 if (!IS_ERR(req))
1067 fuse_put_request(fc, req);
1067 if (res > 0) 1068 if (res > 0)
1068 *ppos = pos; 1069 *ppos = pos;
1069 1070
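
[Annotation, not part of the patch] The bug here is the classic error-pointer slip: fuse_get_req() returns an ERR_PTR() value on failure, and the old common cleanup path handed that straight to fuse_put_request(). Roughly the structure involved, as a sketch of the surrounding loop rather than the full function:

	while (count) {
		req = fuse_get_req(fc);		/* may return ERR_PTR(-ERRNO) */
		if (IS_ERR(req)) {
			res = PTR_ERR(req);
			break;			/* falls through to the cleanup below */
		}
		/* ... issue the read/write ... */
	}
	if (!IS_ERR(req))			/* never put an error pointer */
		fuse_put_request(fc, req);
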
@@ -1313,7 +1314,7 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1313 return 0; 1314 return 0;
1314} 1315}
1315 1316
1316static struct vm_operations_struct fuse_file_vm_ops = { 1317static const struct vm_operations_struct fuse_file_vm_ops = {
1317 .close = fuse_vma_close, 1318 .close = fuse_vma_close,
1318 .fault = filemap_fault, 1319 .fault = filemap_fault,
1319 .page_mkwrite = fuse_page_mkwrite, 1320 .page_mkwrite = fuse_page_mkwrite,
@@ -1599,7 +1600,7 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1599 kaddr += copy; 1600 kaddr += copy;
1600 } 1601 }
1601 1602
1602 kunmap(map); 1603 kunmap(page);
1603 } 1604 }
1604 1605
1605 return 0; 1606 return 0;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 166f38fbd246..4eb308aa3234 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -418,7 +418,7 @@ out:
418 return ret; 418 return ret;
419} 419}
420 420
421static struct vm_operations_struct gfs2_vm_ops = { 421static const struct vm_operations_struct gfs2_vm_ops = {
422 .fault = filemap_fault, 422 .fault = filemap_fault,
423 .page_mkwrite = gfs2_page_mkwrite, 423 .page_mkwrite = gfs2_page_mkwrite,
424}; 424};
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 9b9d6395bad3..052f214ea6f0 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -58,6 +58,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
58 } 58 }
59 unlock_new_inode(tree->inode); 59 unlock_new_inode(tree->inode);
60 60
61 if (!HFS_I(tree->inode)->first_blocks) {
62 printk(KERN_ERR "hfs: invalid btree extent records (0 size).\n");
63 goto free_inode;
64 }
65
61 mapping = tree->inode->i_mapping; 66 mapping = tree->inode->i_mapping;
62 page = read_mapping_page(mapping, 0, NULL); 67 page = read_mapping_page(mapping, 0, NULL);
63 if (IS_ERR(page)) 68 if (IS_ERR(page))
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 175d08eacc86..bed78ac8f6d1 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -99,6 +99,10 @@ int hfsplus_read_wrapper(struct super_block *sb)
99 99
100 if (hfsplus_get_last_session(sb, &part_start, &part_size)) 100 if (hfsplus_get_last_session(sb, &part_start, &part_size))
101 return -EINVAL; 101 return -EINVAL;
102 if ((u64)part_start + part_size > 0x100000000ULL) {
103 pr_err("hfs: volumes larger than 2TB are not supported yet\n");
104 return -EINVAL;
105 }
102 while (1) { 106 while (1) {
103 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); 107 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
104 if (!bh) 108 if (!bh)
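
[Annotation, not part of the patch] The cutoff is exact arithmetic: the wrapper addresses the device in 512-byte sectors, and 0x100000000 of them is 2^32 * 512 bytes = 2 TiB. Anything at or past that boundary no longer fits 32-bit sector numbers (a plausible reading of the "not supported yet" wording), so the mount is refused up front rather than corrupting addresses later.
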
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7b17a14396ff..6c751106c2e5 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -254,7 +254,7 @@ int __generic_block_fiemap(struct inode *inode,
254 u64 len, get_block_t *get_block) 254 u64 len, get_block_t *get_block)
255{ 255{
256 struct buffer_head tmp; 256 struct buffer_head tmp;
257 unsigned int start_blk; 257 unsigned long long start_blk;
258 long long length = 0, map_len = 0; 258 long long length = 0, map_len = 0;
259 u64 logical = 0, phys = 0, size = 0; 259 u64 logical = 0, phys = 0, size = 0;
260 u32 flags = FIEMAP_EXTENT_MERGED; 260 u32 flags = FIEMAP_EXTENT_MERGED;
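
[Annotation, not part of the patch] Why widening start_blk matters: the block number is a byte offset shifted right by the block-size bits, and a 32-bit holder wraps once offsets pass blocksize * 2^32 (16 TiB with 4 KiB blocks). A standalone illustration with hypothetical values, not kernel code:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t logical = 32ULL << 40;			/* byte offset at 32 TiB */
		unsigned int blkbits = 12;			/* assume 4 KiB blocks */

		unsigned int narrow = logical >> blkbits;	/* old type: wraps to 0 */
		unsigned long long wide = logical >> blkbits;	/* new type: 2^33 */

		printf("narrow=%u wide=%llu\n", narrow, wide);	/* narrow=0 wide=8589934592 */
		return 0;
	}
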
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index bd3c073b485d..4160afad6d00 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -73,6 +73,7 @@ EXPORT_SYMBOL(journal_errno);
73EXPORT_SYMBOL(journal_ack_err); 73EXPORT_SYMBOL(journal_ack_err);
74EXPORT_SYMBOL(journal_clear_err); 74EXPORT_SYMBOL(journal_clear_err);
75EXPORT_SYMBOL(log_wait_commit); 75EXPORT_SYMBOL(log_wait_commit);
76EXPORT_SYMBOL(log_start_commit);
76EXPORT_SYMBOL(journal_start_commit); 77EXPORT_SYMBOL(journal_start_commit);
77EXPORT_SYMBOL(journal_force_commit_nested); 78EXPORT_SYMBOL(journal_force_commit_nested);
78EXPORT_SYMBOL(journal_wipe); 79EXPORT_SYMBOL(journal_wipe);
@@ -756,6 +757,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
756 757
757 return journal; 758 return journal;
758out_err: 759out_err:
760 kfree(journal->j_wbuf);
759 kfree(journal); 761 kfree(journal);
760 return NULL; 762 return NULL;
761} 763}
@@ -820,6 +822,7 @@ journal_t * journal_init_inode (struct inode *inode)
820 822
821 return journal; 823 return journal;
822out_err: 824out_err:
825 kfree(journal->j_wbuf);
823 kfree(journal); 826 kfree(journal);
824 return NULL; 827 return NULL;
825} 828}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5d70b3e6d49b..ca0f5eb62b20 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -643,6 +643,7 @@ out:
643 643
644int __jbd2_journal_remove_checkpoint(struct journal_head *jh) 644int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
645{ 645{
646 struct transaction_chp_stats_s *stats;
646 transaction_t *transaction; 647 transaction_t *transaction;
647 journal_t *journal; 648 journal_t *journal;
648 int ret = 0; 649 int ret = 0;
@@ -679,6 +680,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
679 680
680 /* OK, that was the last buffer for the transaction: we can now 681 /* OK, that was the last buffer for the transaction: we can now
681 safely remove this transaction from the log */ 682 safely remove this transaction from the log */
683 stats = &transaction->t_chp_stats;
684 if (stats->cs_chp_time)
685 stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
686 jiffies);
687 trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev,
688 transaction->t_tid, stats);
682 689
683 __jbd2_journal_drop_transaction(journal, transaction); 690 __jbd2_journal_drop_transaction(journal, transaction);
684 kfree(transaction); 691 kfree(transaction);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 26d991ddc1e6..d4cfd6d2779e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -410,10 +410,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
410 if (commit_transaction->t_synchronous_commit) 410 if (commit_transaction->t_synchronous_commit)
411 write_op = WRITE_SYNC_PLUG; 411 write_op = WRITE_SYNC_PLUG;
412 trace_jbd2_commit_locking(journal, commit_transaction); 412 trace_jbd2_commit_locking(journal, commit_transaction);
413 stats.u.run.rs_wait = commit_transaction->t_max_wait; 413 stats.run.rs_wait = commit_transaction->t_max_wait;
414 stats.u.run.rs_locked = jiffies; 414 stats.run.rs_locked = jiffies;
415 stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, 415 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
416 stats.u.run.rs_locked); 416 stats.run.rs_locked);
417 417
418 spin_lock(&commit_transaction->t_handle_lock); 418 spin_lock(&commit_transaction->t_handle_lock);
419 while (commit_transaction->t_updates) { 419 while (commit_transaction->t_updates) {
@@ -486,9 +486,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
486 jbd2_journal_switch_revoke_table(journal); 486 jbd2_journal_switch_revoke_table(journal);
487 487
488 trace_jbd2_commit_flushing(journal, commit_transaction); 488 trace_jbd2_commit_flushing(journal, commit_transaction);
489 stats.u.run.rs_flushing = jiffies; 489 stats.run.rs_flushing = jiffies;
490 stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, 490 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
491 stats.u.run.rs_flushing); 491 stats.run.rs_flushing);
492 492
493 commit_transaction->t_state = T_FLUSH; 493 commit_transaction->t_state = T_FLUSH;
494 journal->j_committing_transaction = commit_transaction; 494 journal->j_committing_transaction = commit_transaction;
@@ -523,11 +523,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
523 spin_unlock(&journal->j_state_lock); 523 spin_unlock(&journal->j_state_lock);
524 524
525 trace_jbd2_commit_logging(journal, commit_transaction); 525 trace_jbd2_commit_logging(journal, commit_transaction);
526 stats.u.run.rs_logging = jiffies; 526 stats.run.rs_logging = jiffies;
527 stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, 527 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
528 stats.u.run.rs_logging); 528 stats.run.rs_logging);
529 stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; 529 stats.run.rs_blocks = commit_transaction->t_outstanding_credits;
530 stats.u.run.rs_blocks_logged = 0; 530 stats.run.rs_blocks_logged = 0;
531 531
532 J_ASSERT(commit_transaction->t_nr_buffers <= 532 J_ASSERT(commit_transaction->t_nr_buffers <=
533 commit_transaction->t_outstanding_credits); 533 commit_transaction->t_outstanding_credits);
@@ -695,7 +695,7 @@ start_journal_io:
695 submit_bh(write_op, bh); 695 submit_bh(write_op, bh);
696 } 696 }
697 cond_resched(); 697 cond_resched();
698 stats.u.run.rs_blocks_logged += bufs; 698 stats.run.rs_blocks_logged += bufs;
699 699
700 /* Force a new descriptor to be generated next 700 /* Force a new descriptor to be generated next
701 time round the loop. */ 701 time round the loop. */
@@ -988,33 +988,30 @@ restart_loop:
988 J_ASSERT(commit_transaction->t_state == T_COMMIT); 988 J_ASSERT(commit_transaction->t_state == T_COMMIT);
989 989
990 commit_transaction->t_start = jiffies; 990 commit_transaction->t_start = jiffies;
991 stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, 991 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
992 commit_transaction->t_start); 992 commit_transaction->t_start);
993 993
994 /* 994 /*
995 * File the transaction for history 995 * File the transaction statistics
996 */ 996 */
997 stats.ts_type = JBD2_STATS_RUN;
998 stats.ts_tid = commit_transaction->t_tid; 997 stats.ts_tid = commit_transaction->t_tid;
999 stats.u.run.rs_handle_count = commit_transaction->t_handle_count; 998 stats.run.rs_handle_count = commit_transaction->t_handle_count;
1000 spin_lock(&journal->j_history_lock); 999 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1001 memcpy(journal->j_history + journal->j_history_cur, &stats, 1000 commit_transaction->t_tid, &stats.run);
1002 sizeof(stats));
1003 if (++journal->j_history_cur == journal->j_history_max)
1004 journal->j_history_cur = 0;
1005 1001
1006 /* 1002 /*
1007 * Calculate overall stats 1003 * Calculate overall stats
1008 */ 1004 */
1005 spin_lock(&journal->j_history_lock);
1009 journal->j_stats.ts_tid++; 1006 journal->j_stats.ts_tid++;
1010 journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; 1007 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1011 journal->j_stats.u.run.rs_running += stats.u.run.rs_running; 1008 journal->j_stats.run.rs_running += stats.run.rs_running;
1012 journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; 1009 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1013 journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; 1010 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1014 journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; 1011 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1015 journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; 1012 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1016 journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; 1013 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1017 journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; 1014 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1018 spin_unlock(&journal->j_history_lock); 1015 spin_unlock(&journal->j_history_lock);
1019 1016
1020 commit_transaction->t_state = T_FINISHED; 1017 commit_transaction->t_state = T_FINISHED;
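
[Annotation, not part of the patch] Two coupled changes land in this hunk: the per-transaction history that used to be copied into journal->j_history (and read back via the /proc history file) is replaced by the trace_jbd2_run_stats() and trace_jbd2_checkpoint_stats() tracepoints, and with the JBD2_STATS_RUN type tag gone, transaction_stats_s no longer needs a tagged union. That is what the mechanical s/stats.u.run/stats.run/ rename is about, and it is also why the seq_file history code disappears from fs/jbd2/journal.c below.
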
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 53b86e16e5fe..fed85388ee86 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -136,10 +136,6 @@ static int kjournald2(void *arg)
136 journal->j_task = current; 136 journal->j_task = current;
137 wake_up(&journal->j_wait_done_commit); 137 wake_up(&journal->j_wait_done_commit);
138 138
139 printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
140 "commit interval %ld seconds\n", current->pid,
141 journal->j_devname, journal->j_commit_interval / HZ);
142
143 /* 139 /*
144 * And now, wait forever for commit wakeup events. 140 * And now, wait forever for commit wakeup events.
145 */ 141 */
@@ -223,7 +219,8 @@ static int jbd2_journal_start_thread(journal_t *journal)
223{ 219{
224 struct task_struct *t; 220 struct task_struct *t;
225 221
226 t = kthread_run(kjournald2, journal, "kjournald2"); 222 t = kthread_run(kjournald2, journal, "jbd2/%s",
223 journal->j_devname);
227 if (IS_ERR(t)) 224 if (IS_ERR(t))
228 return PTR_ERR(t); 225 return PTR_ERR(t);
229 226
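
[Annotation, not part of the patch] kthread_run() accepts a printf-style name, so each journal's commit thread is now labelled with the device it serves. Because inode-backed journals carry a "-<ino>" suffix in j_devname (see the sprintf() change further down), the thread shows up in ps as, for example, jbd2/sda1-8 rather than an anonymous kjournald2; the boot-time printk that used to carry this information is removed in the hunk above.
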
@@ -679,153 +676,6 @@ struct jbd2_stats_proc_session {
679 int max; 676 int max;
680}; 677};
681 678
682static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
683 struct transaction_stats_s *ts,
684 int first)
685{
686 if (ts == s->stats + s->max)
687 ts = s->stats;
688 if (!first && ts == s->stats + s->start)
689 return NULL;
690 while (ts->ts_type == 0) {
691 ts++;
692 if (ts == s->stats + s->max)
693 ts = s->stats;
694 if (ts == s->stats + s->start)
695 return NULL;
696 }
697 return ts;
698
699}
700
701static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
702{
703 struct jbd2_stats_proc_session *s = seq->private;
704 struct transaction_stats_s *ts;
705 int l = *pos;
706
707 if (l == 0)
708 return SEQ_START_TOKEN;
709 ts = jbd2_history_skip_empty(s, s->stats + s->start, 1);
710 if (!ts)
711 return NULL;
712 l--;
713 while (l) {
714 ts = jbd2_history_skip_empty(s, ++ts, 0);
715 if (!ts)
716 break;
717 l--;
718 }
719 return ts;
720}
721
722static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
723{
724 struct jbd2_stats_proc_session *s = seq->private;
725 struct transaction_stats_s *ts = v;
726
727 ++*pos;
728 if (v == SEQ_START_TOKEN)
729 return jbd2_history_skip_empty(s, s->stats + s->start, 1);
730 else
731 return jbd2_history_skip_empty(s, ++ts, 0);
732}
733
734static int jbd2_seq_history_show(struct seq_file *seq, void *v)
735{
736 struct transaction_stats_s *ts = v;
737 if (v == SEQ_START_TOKEN) {
738 seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
739 "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
740 "wait", "run", "lock", "flush", "log", "hndls",
741 "block", "inlog", "ctime", "write", "drop",
742 "close");
743 return 0;
744 }
745 if (ts->ts_type == JBD2_STATS_RUN)
746 seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u "
747 "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
748 jiffies_to_msecs(ts->u.run.rs_wait),
749 jiffies_to_msecs(ts->u.run.rs_running),
750 jiffies_to_msecs(ts->u.run.rs_locked),
751 jiffies_to_msecs(ts->u.run.rs_flushing),
752 jiffies_to_msecs(ts->u.run.rs_logging),
753 ts->u.run.rs_handle_count,
754 ts->u.run.rs_blocks,
755 ts->u.run.rs_blocks_logged);
756 else if (ts->ts_type == JBD2_STATS_CHECKPOINT)
757 seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n",
758 "C", ts->ts_tid, " ",
759 jiffies_to_msecs(ts->u.chp.cs_chp_time),
760 ts->u.chp.cs_written, ts->u.chp.cs_dropped,
761 ts->u.chp.cs_forced_to_close);
762 else
763 J_ASSERT(0);
764 return 0;
765}
766
767static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
768{
769}
770
771static const struct seq_operations jbd2_seq_history_ops = {
772 .start = jbd2_seq_history_start,
773 .next = jbd2_seq_history_next,
774 .stop = jbd2_seq_history_stop,
775 .show = jbd2_seq_history_show,
776};
777
778static int jbd2_seq_history_open(struct inode *inode, struct file *file)
779{
780 journal_t *journal = PDE(inode)->data;
781 struct jbd2_stats_proc_session *s;
782 int rc, size;
783
784 s = kmalloc(sizeof(*s), GFP_KERNEL);
785 if (s == NULL)
786 return -ENOMEM;
787 size = sizeof(struct transaction_stats_s) * journal->j_history_max;
788 s->stats = kmalloc(size, GFP_KERNEL);
789 if (s->stats == NULL) {
790 kfree(s);
791 return -ENOMEM;
792 }
793 spin_lock(&journal->j_history_lock);
794 memcpy(s->stats, journal->j_history, size);
795 s->max = journal->j_history_max;
796 s->start = journal->j_history_cur % s->max;
797 spin_unlock(&journal->j_history_lock);
798
799 rc = seq_open(file, &jbd2_seq_history_ops);
800 if (rc == 0) {
801 struct seq_file *m = file->private_data;
802 m->private = s;
803 } else {
804 kfree(s->stats);
805 kfree(s);
806 }
807 return rc;
808
809}
810
811static int jbd2_seq_history_release(struct inode *inode, struct file *file)
812{
813 struct seq_file *seq = file->private_data;
814 struct jbd2_stats_proc_session *s = seq->private;
815
816 kfree(s->stats);
817 kfree(s);
818 return seq_release(inode, file);
819}
820
821static struct file_operations jbd2_seq_history_fops = {
822 .owner = THIS_MODULE,
823 .open = jbd2_seq_history_open,
824 .read = seq_read,
825 .llseek = seq_lseek,
826 .release = jbd2_seq_history_release,
827};
828
829static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) 679static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
830{ 680{
831 return *pos ? NULL : SEQ_START_TOKEN; 681 return *pos ? NULL : SEQ_START_TOKEN;
@@ -842,29 +692,29 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
842 692
843 if (v != SEQ_START_TOKEN) 693 if (v != SEQ_START_TOKEN)
844 return 0; 694 return 0;
845 seq_printf(seq, "%lu transaction, each upto %u blocks\n", 695 seq_printf(seq, "%lu transaction, each up to %u blocks\n",
846 s->stats->ts_tid, 696 s->stats->ts_tid,
847 s->journal->j_max_transaction_buffers); 697 s->journal->j_max_transaction_buffers);
848 if (s->stats->ts_tid == 0) 698 if (s->stats->ts_tid == 0)
849 return 0; 699 return 0;
850 seq_printf(seq, "average: \n %ums waiting for transaction\n", 700 seq_printf(seq, "average: \n %ums waiting for transaction\n",
851 jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid)); 701 jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
852 seq_printf(seq, " %ums running transaction\n", 702 seq_printf(seq, " %ums running transaction\n",
853 jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid)); 703 jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
854 seq_printf(seq, " %ums transaction was being locked\n", 704 seq_printf(seq, " %ums transaction was being locked\n",
855 jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid)); 705 jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
856 seq_printf(seq, " %ums flushing data (in ordered mode)\n", 706 seq_printf(seq, " %ums flushing data (in ordered mode)\n",
857 jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); 707 jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
858 seq_printf(seq, " %ums logging transaction\n", 708 seq_printf(seq, " %ums logging transaction\n",
859 jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); 709 jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
860 seq_printf(seq, " %lluus average transaction commit time\n", 710 seq_printf(seq, " %lluus average transaction commit time\n",
861 div_u64(s->journal->j_average_commit_time, 1000)); 711 div_u64(s->journal->j_average_commit_time, 1000));
862 seq_printf(seq, " %lu handles per transaction\n", 712 seq_printf(seq, " %lu handles per transaction\n",
863 s->stats->u.run.rs_handle_count / s->stats->ts_tid); 713 s->stats->run.rs_handle_count / s->stats->ts_tid);
864 seq_printf(seq, " %lu blocks per transaction\n", 714 seq_printf(seq, " %lu blocks per transaction\n",
865 s->stats->u.run.rs_blocks / s->stats->ts_tid); 715 s->stats->run.rs_blocks / s->stats->ts_tid);
866 seq_printf(seq, " %lu logged blocks per transaction\n", 716 seq_printf(seq, " %lu logged blocks per transaction\n",
867 s->stats->u.run.rs_blocks_logged / s->stats->ts_tid); 717 s->stats->run.rs_blocks_logged / s->stats->ts_tid);
868 return 0; 718 return 0;
869} 719}
870 720
@@ -920,7 +770,7 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file)
920 return seq_release(inode, file); 770 return seq_release(inode, file);
921} 771}
922 772
923static struct file_operations jbd2_seq_info_fops = { 773static const struct file_operations jbd2_seq_info_fops = {
924 .owner = THIS_MODULE, 774 .owner = THIS_MODULE,
925 .open = jbd2_seq_info_open, 775 .open = jbd2_seq_info_open,
926 .read = seq_read, 776 .read = seq_read,
@@ -934,8 +784,6 @@ static void jbd2_stats_proc_init(journal_t *journal)
934{ 784{
935 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); 785 journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
936 if (journal->j_proc_entry) { 786 if (journal->j_proc_entry) {
937 proc_create_data("history", S_IRUGO, journal->j_proc_entry,
938 &jbd2_seq_history_fops, journal);
939 proc_create_data("info", S_IRUGO, journal->j_proc_entry, 787 proc_create_data("info", S_IRUGO, journal->j_proc_entry,
940 &jbd2_seq_info_fops, journal); 788 &jbd2_seq_info_fops, journal);
941 } 789 }
@@ -944,27 +792,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
944static void jbd2_stats_proc_exit(journal_t *journal) 792static void jbd2_stats_proc_exit(journal_t *journal)
945{ 793{
946 remove_proc_entry("info", journal->j_proc_entry); 794 remove_proc_entry("info", journal->j_proc_entry);
947 remove_proc_entry("history", journal->j_proc_entry);
948 remove_proc_entry(journal->j_devname, proc_jbd2_stats); 795 remove_proc_entry(journal->j_devname, proc_jbd2_stats);
949} 796}
950 797
951static void journal_init_stats(journal_t *journal)
952{
953 int size;
954
955 if (!proc_jbd2_stats)
956 return;
957
958 journal->j_history_max = 100;
959 size = sizeof(struct transaction_stats_s) * journal->j_history_max;
960 journal->j_history = kzalloc(size, GFP_KERNEL);
961 if (!journal->j_history) {
962 journal->j_history_max = 0;
963 return;
964 }
965 spin_lock_init(&journal->j_history_lock);
966}
967
968/* 798/*
969 * Management for journal control blocks: functions to create and 799 * Management for journal control blocks: functions to create and
970 * destroy journal_t structures, and to initialise and read existing 800 * destroy journal_t structures, and to initialise and read existing
@@ -1009,7 +839,7 @@ static journal_t * journal_init_common (void)
1009 goto fail; 839 goto fail;
1010 } 840 }
1011 841
1012 journal_init_stats(journal); 842 spin_lock_init(&journal->j_history_lock);
1013 843
1014 return journal; 844 return journal;
1015fail: 845fail:
@@ -1083,6 +913,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
1083 913
1084 return journal; 914 return journal;
1085out_err: 915out_err:
916 kfree(journal->j_wbuf);
1086 jbd2_stats_proc_exit(journal); 917 jbd2_stats_proc_exit(journal);
1087 kfree(journal); 918 kfree(journal);
1088 return NULL; 919 return NULL;
@@ -1115,7 +946,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1115 while ((p = strchr(p, '/'))) 946 while ((p = strchr(p, '/')))
1116 *p = '!'; 947 *p = '!';
1117 p = journal->j_devname + strlen(journal->j_devname); 948 p = journal->j_devname + strlen(journal->j_devname);
1118 sprintf(p, ":%lu", journal->j_inode->i_ino); 949 sprintf(p, "-%lu", journal->j_inode->i_ino);
1119 jbd_debug(1, 950 jbd_debug(1,
1120 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", 951 "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
1121 journal, inode->i_sb->s_id, inode->i_ino, 952 journal, inode->i_sb->s_id, inode->i_ino,
@@ -1156,6 +987,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
1156 987
1157 return journal; 988 return journal;
1158out_err: 989out_err:
990 kfree(journal->j_wbuf);
1159 jbd2_stats_proc_exit(journal); 991 jbd2_stats_proc_exit(journal);
1160 kfree(journal); 992 kfree(journal);
1161 return NULL; 993 return NULL;
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 5d8dcb9ee326..15458decdb8a 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -95,7 +95,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area,
95 return VM_FAULT_MAJOR; 95 return VM_FAULT_MAJOR;
96} 96}
97 97
98static struct vm_operations_struct ncp_file_mmap = 98static const struct vm_operations_struct ncp_file_mmap =
99{ 99{
100 .fault = ncp_file_mmap_fault, 100 .fault = ncp_file_mmap_fault,
101}; 101};
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 63976c0ccc25..99ea196f071f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1180,7 +1180,7 @@ static int nfs4_init_client(struct nfs_client *clp,
1180 1, flags & NFS_MOUNT_NORESVPORT); 1180 1, flags & NFS_MOUNT_NORESVPORT);
1181 if (error < 0) 1181 if (error < 0)
1182 goto error; 1182 goto error;
1183 memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); 1183 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
1184 1184
1185 error = nfs_idmap_new(clp); 1185 error = nfs_idmap_new(clp);
1186 if (error < 0) { 1186 if (error < 0) {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 32062c33c859..7cb298525eef 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1536,6 +1536,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1536 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1536 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1537 dentry->d_parent->d_name.name, dentry->d_name.name); 1537 dentry->d_parent->d_name.name, dentry->d_name.name);
1538 1538
1539 nfs_inode_return_delegation(inode);
1540
1539 d_drop(dentry); 1541 d_drop(dentry);
1540 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); 1542 error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
1541 if (error == 0) { 1543 if (error == 0) {
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 6c3210099d51..e1d415e97849 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -457,6 +457,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
457 }; 457 };
458 struct rpc_task_setup task_setup_data = { 458 struct rpc_task_setup task_setup_data = {
459 .rpc_client = NFS_CLIENT(inode), 459 .rpc_client = NFS_CLIENT(inode),
460 .rpc_message = &msg,
460 .callback_ops = &nfs_write_direct_ops, 461 .callback_ops = &nfs_write_direct_ops,
461 .workqueue = nfsiod_workqueue, 462 .workqueue = nfsiod_workqueue,
462 .flags = RPC_TASK_ASYNC, 463 .flags = RPC_TASK_ASYNC,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 86d6b4db1096..f5fdd39e037a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -59,7 +59,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
60static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); 60static int nfs_setlease(struct file *file, long arg, struct file_lock **fl);
61 61
62static struct vm_operations_struct nfs_file_vm_ops; 62static const struct vm_operations_struct nfs_file_vm_ops;
63 63
64const struct file_operations nfs_file_operations = { 64const struct file_operations nfs_file_operations = {
65 .llseek = nfs_file_llseek, 65 .llseek = nfs_file_llseek,
@@ -572,7 +572,7 @@ out_unlock:
572 return VM_FAULT_SIGBUS; 572 return VM_FAULT_SIGBUS;
573} 573}
574 574
575static struct vm_operations_struct nfs_file_vm_ops = { 575static const struct vm_operations_struct nfs_file_vm_ops = {
576 .fault = filemap_fault, 576 .fault = filemap_fault,
577 .page_mkwrite = nfs_vm_page_mkwrite, 577 .page_mkwrite = nfs_vm_page_mkwrite,
578}; 578};
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2636c26d56fa..fa3408f20112 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -121,7 +121,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
121 121
122 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); 122 mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
123 if (IS_ERR(mnt_path)) 123 if (IS_ERR(mnt_path))
124 return mnt; 124 return ERR_CAST(mnt_path);
125 mountdata->mnt_path = mnt_path; 125 mountdata->mnt_path = mnt_path;
126 maxbuflen = mnt_path - 1 - page2; 126 maxbuflen = mnt_path - 1 - page2;
127 127
@@ -132,15 +132,15 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
132 if (buf->len <= 0 || buf->len >= maxbuflen) 132 if (buf->len <= 0 || buf->len >= maxbuflen)
133 continue; 133 continue;
134 134
135 mountdata->addr = (struct sockaddr *)&addr;
136
137 if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) 135 if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
138 continue; 136 continue;
139 mountdata->addrlen = nfs_parse_server_name(buf->data, 137
140 buf->len, 138 mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
141 mountdata->addr, mountdata->addrlen); 139 (struct sockaddr *)&addr, sizeof(addr));
142 if (mountdata->addrlen == 0) 140 if (mountdata->addrlen == 0)
143 continue; 141 continue;
142
143 mountdata->addr = (struct sockaddr *)&addr;
144 rpc_set_port(mountdata->addr, NFS_PORT); 144 rpc_set_port(mountdata->addr, NFS_PORT);
145 145
146 memcpy(page2, buf->data, buf->len); 146 memcpy(page2, buf->data, buf->len);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ed7c269e2514..ff37454fa783 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -72,12 +72,17 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
72/* Prevent leaks of NFSv4 errors into userland */ 72/* Prevent leaks of NFSv4 errors into userland */
73static int nfs4_map_errors(int err) 73static int nfs4_map_errors(int err)
74{ 74{
75 if (err < -1000) { 75 if (err >= -1000)
76 return err;
77 switch (err) {
78 case -NFS4ERR_RESOURCE:
79 return -EREMOTEIO;
80 default:
76 dprintk("%s could not handle NFSv4 error %d\n", 81 dprintk("%s could not handle NFSv4 error %d\n",
77 __func__, -err); 82 __func__, -err);
78 return -EIO; 83 break;
79 } 84 }
80 return err; 85 return -EIO;
81} 86}
82 87
83/* 88/*
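
[Annotation, not part of the patch] NFS4ERR_RESOURCE keeps its -EREMOTEIO translation, but the mapping moves from the generic XDR status table (the entry is deleted from fs/nfs/nfs4xdr.c further down) into nfs4_map_errors(). Per the comment above, this function runs only at the userland boundary, so the visible effect is that in-kernel callers once again see the raw -NFS4ERR_RESOURCE and can handle it as a protocol error, while nothing unexpected leaks to userspace.
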
@@ -3060,9 +3065,6 @@ static void nfs4_renew_done(struct rpc_task *task, void *data)
3060 if (time_before(clp->cl_last_renewal,timestamp)) 3065 if (time_before(clp->cl_last_renewal,timestamp))
3061 clp->cl_last_renewal = timestamp; 3066 clp->cl_last_renewal = timestamp;
3062 spin_unlock(&clp->cl_lock); 3067 spin_unlock(&clp->cl_lock);
3063 dprintk("%s calling put_rpccred on rpc_cred %p\n", __func__,
3064 task->tk_msg.rpc_cred);
3065 put_rpccred(task->tk_msg.rpc_cred);
3066} 3068}
3067 3069
3068static const struct rpc_call_ops nfs4_renew_ops = { 3070static const struct rpc_call_ops nfs4_renew_ops = {
@@ -4877,7 +4879,6 @@ void nfs41_sequence_call_done(struct rpc_task *task, void *data)
4877 nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp); 4879 nfs41_sequence_free_slot(clp, task->tk_msg.rpc_resp);
4878 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); 4880 dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred);
4879 4881
4880 put_rpccred(task->tk_msg.rpc_cred);
4881 kfree(task->tk_msg.rpc_argp); 4882 kfree(task->tk_msg.rpc_argp);
4882 kfree(task->tk_msg.rpc_resp); 4883 kfree(task->tk_msg.rpc_resp);
4883 4884
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index e27c6cef18f2..0156c01c212c 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -127,12 +127,6 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
127} 127}
128 128
129void 129void
130nfs4_renewd_prepare_shutdown(struct nfs_server *server)
131{
132 cancel_delayed_work(&server->nfs_client->cl_renewd);
133}
134
135void
136nfs4_kill_renewd(struct nfs_client *clp) 130nfs4_kill_renewd(struct nfs_client *clp)
137{ 131{
138 cancel_delayed_work_sync(&clp->cl_renewd); 132 cancel_delayed_work_sync(&clp->cl_renewd);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 83ad47cbdd8a..20b4e30e6c82 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5681,7 +5681,6 @@ static struct {
5681 { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, 5681 { NFS4ERR_SERVERFAULT, -ESERVERFAULT },
5682 { NFS4ERR_BADTYPE, -EBADTYPE }, 5682 { NFS4ERR_BADTYPE, -EBADTYPE },
5683 { NFS4ERR_LOCKED, -EAGAIN }, 5683 { NFS4ERR_LOCKED, -EAGAIN },
5684 { NFS4ERR_RESOURCE, -EREMOTEIO },
5685 { NFS4ERR_SYMLINK, -ELOOP }, 5684 { NFS4ERR_SYMLINK, -ELOOP },
5686 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, 5685 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
5687 { NFS4ERR_DEADLOCK, -EDEADLK }, 5686 { NFS4ERR_DEADLOCK, -EDEADLK },
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 810770f96816..90be551b80c1 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -728,22 +728,24 @@ static void nfs_umount_begin(struct super_block *sb)
728 unlock_kernel(); 728 unlock_kernel();
729} 729}
730 730
731static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(int flags) 731static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version)
732{ 732{
733 struct nfs_parsed_mount_data *data; 733 struct nfs_parsed_mount_data *data;
734 734
735 data = kzalloc(sizeof(*data), GFP_KERNEL); 735 data = kzalloc(sizeof(*data), GFP_KERNEL);
736 if (data) { 736 if (data) {
737 data->flags = flags;
738 data->rsize = NFS_MAX_FILE_IO_SIZE; 737 data->rsize = NFS_MAX_FILE_IO_SIZE;
739 data->wsize = NFS_MAX_FILE_IO_SIZE; 738 data->wsize = NFS_MAX_FILE_IO_SIZE;
740 data->acregmin = NFS_DEF_ACREGMIN; 739 data->acregmin = NFS_DEF_ACREGMIN;
741 data->acregmax = NFS_DEF_ACREGMAX; 740 data->acregmax = NFS_DEF_ACREGMAX;
742 data->acdirmin = NFS_DEF_ACDIRMIN; 741 data->acdirmin = NFS_DEF_ACDIRMIN;
743 data->acdirmax = NFS_DEF_ACDIRMAX; 742 data->acdirmax = NFS_DEF_ACDIRMAX;
743 data->mount_server.port = NFS_UNSPEC_PORT;
744 data->nfs_server.port = NFS_UNSPEC_PORT; 744 data->nfs_server.port = NFS_UNSPEC_PORT;
745 data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
745 data->auth_flavors[0] = RPC_AUTH_UNIX; 746 data->auth_flavors[0] = RPC_AUTH_UNIX;
746 data->auth_flavor_len = 1; 747 data->auth_flavor_len = 1;
748 data->version = version;
747 data->minorversion = 0; 749 data->minorversion = 0;
748 } 750 }
749 return data; 751 return data;
@@ -776,15 +778,13 @@ static int nfs_verify_server_address(struct sockaddr *addr)
776 * Select between a default port value and a user-specified port value. 778 * Select between a default port value and a user-specified port value.
777 * If a zero value is set, then autobind will be used. 779 * If a zero value is set, then autobind will be used.
778 */ 780 */
779static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port, 781static void nfs_set_port(struct sockaddr *sap, int *port,
780 const unsigned short default_port) 782 const unsigned short default_port)
781{ 783{
782 unsigned short port = default_port; 784 if (*port == NFS_UNSPEC_PORT)
785 *port = default_port;
783 786
784 if (parsed_port != NFS_UNSPEC_PORT) 787 rpc_set_port(sap, *port);
785 port = parsed_port;
786
787 rpc_set_port(sap, port);
788} 788}
789 789
790/* 790/*
@@ -1253,6 +1253,7 @@ static int nfs_parse_mount_options(char *raw,
1253 default: 1253 default:
1254 dfprintk(MOUNT, "NFS: unrecognized " 1254 dfprintk(MOUNT, "NFS: unrecognized "
1255 "transport protocol\n"); 1255 "transport protocol\n");
1256 kfree(string);
1256 return 0; 1257 return 0;
1257 } 1258 }
1258 break; 1259 break;
@@ -1475,7 +1476,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1475 args->mount_server.addrlen = args->nfs_server.addrlen; 1476 args->mount_server.addrlen = args->nfs_server.addrlen;
1476 } 1477 }
1477 request.salen = args->mount_server.addrlen; 1478 request.salen = args->mount_server.addrlen;
1478 nfs_set_default_port(request.sap, args->mount_server.port, 0); 1479 nfs_set_port(request.sap, &args->mount_server.port, 0);
1479 1480
1480 /* 1481 /*
1481 * Now ask the mount server to map our export path 1482 * Now ask the mount server to map our export path
@@ -1765,7 +1766,7 @@ static int nfs_validate_mount_data(void *options,
1765 goto out_v4_not_compiled; 1766 goto out_v4_not_compiled;
1766#endif 1767#endif
1767 1768
1768 nfs_set_default_port(sap, args->nfs_server.port, 0); 1769 nfs_set_port(sap, &args->nfs_server.port, 0);
1769 1770
1770 nfs_set_mount_transport_protocol(args); 1771 nfs_set_mount_transport_protocol(args);
1771 1772
@@ -1846,9 +1847,10 @@ nfs_compare_remount_data(struct nfs_server *nfss,
1846 data->acdirmin != nfss->acdirmin / HZ || 1847 data->acdirmin != nfss->acdirmin / HZ ||
1847 data->acdirmax != nfss->acdirmax / HZ || 1848 data->acdirmax != nfss->acdirmax / HZ ||
1848 data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) || 1849 data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
1850 data->nfs_server.port != nfss->port ||
1849 data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen || 1851 data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
1850 memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr, 1852 !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address,
1851 data->nfs_server.addrlen) != 0) 1853 (struct sockaddr *)&nfss->nfs_client->cl_addr))
1852 return -EINVAL; 1854 return -EINVAL;
1853 1855
1854 return 0; 1856 return 0;
@@ -1891,6 +1893,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
1891 data->acdirmin = nfss->acdirmin / HZ; 1893 data->acdirmin = nfss->acdirmin / HZ;
1892 data->acdirmax = nfss->acdirmax / HZ; 1894 data->acdirmax = nfss->acdirmax / HZ;
1893 data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; 1895 data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
1896 data->nfs_server.port = nfss->port;
1894 data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; 1897 data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
1895 memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr, 1898 memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
1896 data->nfs_server.addrlen); 1899 data->nfs_server.addrlen);
@@ -2104,7 +2107,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2104 }; 2107 };
2105 int error = -ENOMEM; 2108 int error = -ENOMEM;
2106 2109
2107 data = nfs_alloc_parsed_mount_data(NFS_MOUNT_VER3 | NFS_MOUNT_TCP); 2110 data = nfs_alloc_parsed_mount_data(3);
2108 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL); 2111 mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
2109 if (data == NULL || mntfh == NULL) 2112 if (data == NULL || mntfh == NULL)
2110 goto out_free_fh; 2113 goto out_free_fh;
@@ -2329,7 +2332,7 @@ static int nfs4_validate_text_mount_data(void *options,
2329{ 2332{
2330 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; 2333 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
2331 2334
2332 nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT); 2335 nfs_set_port(sap, &args->nfs_server.port, NFS_PORT);
2333 2336
2334 nfs_validate_transport_protocol(args); 2337 nfs_validate_transport_protocol(args);
2335 2338
@@ -2374,7 +2377,6 @@ static int nfs4_validate_mount_data(void *options,
2374 if (data == NULL) 2377 if (data == NULL)
2375 goto out_no_data; 2378 goto out_no_data;
2376 2379
2377 args->version = 4;
2378 switch (data->version) { 2380 switch (data->version) {
2379 case 1: 2381 case 1:
2380 if (data->host_addrlen > sizeof(args->nfs_server.address)) 2382 if (data->host_addrlen > sizeof(args->nfs_server.address))
@@ -2658,7 +2660,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2658 struct nfs_parsed_mount_data *data; 2660 struct nfs_parsed_mount_data *data;
2659 int error = -ENOMEM; 2661 int error = -ENOMEM;
2660 2662
2661 data = nfs_alloc_parsed_mount_data(0); 2663 data = nfs_alloc_parsed_mount_data(4);
2662 if (data == NULL) 2664 if (data == NULL)
2663 goto out_free_data; 2665 goto out_free_data;
2664 2666
@@ -2688,7 +2690,6 @@ static void nfs4_kill_super(struct super_block *sb)
2688 dprintk("--> %s\n", __func__); 2690 dprintk("--> %s\n", __func__);
2689 nfs_super_return_all_delegations(sb); 2691 nfs_super_return_all_delegations(sb);
2690 kill_anon_super(sb); 2692 kill_anon_super(sb);
2691 nfs4_renewd_prepare_shutdown(server);
2692 nfs_fscache_release_super_cookie(sb); 2693 nfs_fscache_release_super_cookie(sb);
2693 nfs_free_server(server); 2694 nfs_free_server(server);
2694 dprintk("<-- %s\n", __func__); 2695 dprintk("<-- %s\n", __func__);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index edf926e1062f..d0a2ce1b4324 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -958,7 +958,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
958 p1 = encode_entry_baggage(cd, p1, name, namlen, ino); 958 p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
959 959
960 if (plus) 960 if (plus)
961 p = encode_entryplus_baggage(cd, p1, name, namlen); 961 p1 = encode_entryplus_baggage(cd, p1, name, namlen);
962 962
963 /* determine entry word length and lengths to go in pages */ 963 /* determine entry word length and lengths to go in pages */
964 num_entry_words = p1 - tmp; 964 num_entry_words = p1 - tmp;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 00388d2a3c99..5c01fc148ce8 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -176,7 +176,7 @@ static const struct file_operations exports_operations = {
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); 176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); 177extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
178 178
179static struct file_operations pool_stats_operations = { 179static const struct file_operations pool_stats_operations = {
180 .open = nfsd_pool_stats_open, 180 .open = nfsd_pool_stats_open,
181 .read = seq_read, 181 .read = seq_read,
182 .llseek = seq_lseek, 182 .llseek = seq_lseek,
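This hunk and the many like it below (nilfs2, ocfs2, omfs, ubifs, xfs) make the same change: an operations table that is only ever read should be const so the linker can place it in read-only memory. A generic userspace sketch of the pattern:

    #include <stdio.h>

    struct ops {
        int (*open)(const char *name);
        int (*read)(char *buf, int len);
    };

    static int my_open(const char *name) { printf("open %s\n", name); return 0; }
    static int my_read(char *buf, int len) { (void)buf; return len; }

    /* 'const' moves the table to .rodata and makes the compiler
     * reject accidental writes such as my_ops.open = NULL; */
    static const struct ops my_ops = {
        .open = my_open,
        .read = my_read,
    };

    int main(void)
    {
        char buf[16];

        my_ops.open("demo");
        return my_ops.read(buf, sizeof(buf)) == sizeof(buf) ? 0 : 1;
    }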
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 6a2711f4c321..84c25382f8e3 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -36,6 +36,7 @@
36 36
37void nilfs_btnode_cache_init_once(struct address_space *btnc) 37void nilfs_btnode_cache_init_once(struct address_space *btnc)
38{ 38{
39 memset(btnc, 0, sizeof(*btnc));
39 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC); 40 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
40 spin_lock_init(&btnc->tree_lock); 41 spin_lock_init(&btnc->tree_lock);
41 INIT_LIST_HEAD(&btnc->private_list); 42 INIT_LIST_HEAD(&btnc->private_list);
@@ -86,6 +87,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
86 brelse(bh); 87 brelse(bh);
87 BUG(); 88 BUG();
88 } 89 }
90 memset(bh->b_data, 0, 1 << inode->i_blkbits);
89 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; 91 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
90 bh->b_blocknr = blocknr; 92 bh->b_blocknr = blocknr;
91 set_buffer_mapped(bh); 93 set_buffer_mapped(bh);
@@ -275,8 +277,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
275 "invalid oldkey %lld (newkey=%lld)", 277 "invalid oldkey %lld (newkey=%lld)",
276 (unsigned long long)oldkey, 278 (unsigned long long)oldkey,
277 (unsigned long long)newkey); 279 (unsigned long long)newkey);
278 if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage))
279 BUG();
280 nilfs_btnode_mark_dirty(obh);
280 281
281 spin_lock_irq(&btnc->tree_lock); 282 spin_lock_irq(&btnc->tree_lock);
282 radix_tree_delete(&btnc->page_tree, oldkey); 283 radix_tree_delete(&btnc->page_tree, oldkey);
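The two btnode hunks harden initialization: the init-once constructor now clears the whole address_space, and freshly mapped node buffers are zero-filled. A small sketch of the constructor idea, with a pthread spinlock standing in for the kernel lock:

    #include <pthread.h>
    #include <string.h>

    struct cache_obj {
        pthread_spinlock_t lock;
        unsigned long      flags;
        void              *private_data;
    };

    /* Zero the whole object first so no field escapes initialization,
     * then construct the members that need real setup. */
    static void cache_obj_init_once(struct cache_obj *o)
    {
        memset(o, 0, sizeof(*o));
        pthread_spin_init(&o->lock, PTHREAD_PROCESS_PRIVATE);
    }

    int main(void)
    {
        struct cache_obj o;

        cache_obj_init_once(&o);
        return (int)o.flags;    /* 0: everything starts zeroed */
    }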
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 1c6cfb59128d..3f5d5d06f53c 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -871,7 +871,6 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
871 * exclusive with a new mount job. Though it doesn't cover 871 * exclusive with a new mount job. Though it doesn't cover
872 * umount, it's enough for the purpose. 872 * umount, it's enough for the purpose.
873 */ 873 */
874 mutex_lock(&nilfs->ns_mount_mutex);
875 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { 874 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
876 /* Current implementation does not have to protect 875 /* Current implementation does not have to protect
877 plain read-only mounts since they are exclusive 876 plain read-only mounts since they are exclusive
@@ -880,7 +879,6 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
880 ret = -EBUSY; 879 ret = -EBUSY;
881 } else 880 } else
882 ret = nilfs_cpfile_clear_snapshot(cpfile, cno); 881 ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
883 mutex_unlock(&nilfs->ns_mount_mutex);
884 return ret; 882 return ret;
885 case NILFS_SNAPSHOT: 883 case NILFS_SNAPSHOT:
886 return nilfs_cpfile_set_snapshot(cpfile, cno); 884 return nilfs_cpfile_set_snapshot(cpfile, cno);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 1a4fa04cf071..e097099bfc8f 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -697,7 +697,7 @@ not_empty:
697 return 0; 697 return 0;
698} 698}
699 699
700struct file_operations nilfs_dir_operations = { 700const struct file_operations nilfs_dir_operations = {
701 .llseek = generic_file_llseek, 701 .llseek = generic_file_llseek,
702 .read = generic_read_dir, 702 .read = generic_read_dir,
703 .readdir = nilfs_readdir, 703 .readdir = nilfs_readdir,
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index fc8278c77cdd..30292df443ce 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -117,7 +117,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
117 return 0; 117 return 0;
118} 118}
119 119
120struct vm_operations_struct nilfs_file_vm_ops = { 120static const struct vm_operations_struct nilfs_file_vm_ops = {
121 .fault = filemap_fault, 121 .fault = filemap_fault,
122 .page_mkwrite = nilfs_page_mkwrite, 122 .page_mkwrite = nilfs_page_mkwrite,
123}; 123};
@@ -134,7 +134,7 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
134 * We have mostly NULL's here: the current defaults are ok for 134 * We have mostly NULL's here: the current defaults are ok for
135 * the nilfs filesystem. 135 * the nilfs filesystem.
136 */ 136 */
137struct file_operations nilfs_file_operations = { 137const struct file_operations nilfs_file_operations = {
138 .llseek = generic_file_llseek, 138 .llseek = generic_file_llseek,
139 .read = do_sync_read, 139 .read = do_sync_read,
140 .write = do_sync_write, 140 .write = do_sync_write,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2d2c501deb54..2a0a5a3ac134 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -400,6 +400,7 @@ int nilfs_read_inode_common(struct inode *inode,
400 ii->i_dir_acl = S_ISREG(inode->i_mode) ? 400 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
401 0 : le32_to_cpu(raw_inode->i_dir_acl); 401 0 : le32_to_cpu(raw_inode->i_dir_acl);
402#endif 402#endif
403 ii->i_dir_start_lookup = 0;
403 ii->i_cno = 0; 404 ii->i_cno = 0;
404 inode->i_generation = le32_to_cpu(raw_inode->i_generation); 405 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
405 406
@@ -663,7 +664,6 @@ int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
663 int err; 664 int err;
664 665
665 spin_lock(&sbi->s_inode_lock); 666 spin_lock(&sbi->s_inode_lock);
666 /* Caller of this function MUST lock s_inode_lock */
667 if (ii->i_bh == NULL) { 667 if (ii->i_bh == NULL) {
668 spin_unlock(&sbi->s_inode_lock); 668 spin_unlock(&sbi->s_inode_lock);
669 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino, 669 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino,
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 6572ea4bc4df..f6af76042d80 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -99,7 +99,8 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
99static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, 99static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
100 unsigned int cmd, void __user *argp) 100 unsigned int cmd, void __user *argp)
101{ 101{
102 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; 102 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
103 struct inode *cpfile = nilfs->ns_cpfile;
103 struct nilfs_transaction_info ti; 104 struct nilfs_transaction_info ti;
104 struct nilfs_cpmode cpmode; 105 struct nilfs_cpmode cpmode;
105 int ret; 106 int ret;
@@ -109,14 +110,17 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
109 if (copy_from_user(&cpmode, argp, sizeof(cpmode))) 110 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
110 return -EFAULT; 111 return -EFAULT;
111 112
113 mutex_lock(&nilfs->ns_mount_mutex);
112 nilfs_transaction_begin(inode->i_sb, &ti, 0); 114 nilfs_transaction_begin(inode->i_sb, &ti, 0);
113 ret = nilfs_cpfile_change_cpmode( 115 ret = nilfs_cpfile_change_cpmode(
114 cpfile, cpmode.cm_cno, cpmode.cm_mode); 116 cpfile, cpmode.cm_cno, cpmode.cm_mode);
115 if (unlikely(ret < 0)) { 117 if (unlikely(ret < 0)) {
116 nilfs_transaction_abort(inode->i_sb); 118 nilfs_transaction_abort(inode->i_sb);
119 mutex_unlock(&nilfs->ns_mount_mutex);
117 return ret; 120 return ret;
118 } 121 }
119 nilfs_transaction_commit(inode->i_sb); /* never fails */ 122 nilfs_transaction_commit(inode->i_sb); /* never fails */
123 mutex_unlock(&nilfs->ns_mount_mutex);
120 return ret; 124 return ret;
121} 125}
122 126
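Paired with the cpfile.c hunk above, this moves ns_mount_mutex out to the ioctl so it brackets the whole transaction rather than only the mounted-snapshot check. A sketch of widening a lock's scope to the caller, using a pthread mutex and hypothetical helper names:

    #include <pthread.h>

    static pthread_mutex_t mount_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* callee: assumes the caller already holds mount_mutex */
    static int change_cpmode_locked(long long cno, int mode)
    {
        (void)cno; (void)mode;
        return 0;
    }

    static int ioctl_change_cpmode(long long cno, int mode)
    {
        int ret;

        pthread_mutex_lock(&mount_mutex);   /* now spans begin..commit */
        /* transaction_begin(...); */
        ret = change_cpmode_locked(cno, mode);
        /* on failure the transaction is aborted, still under the lock */
        pthread_mutex_unlock(&mount_mutex);
        return ret;
    }

    int main(void)
    {
        return ioctl_change_cpmode(1, 0);
    }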
@@ -297,7 +301,18 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
297 (unsigned long long)vdesc->vd_vblocknr); 301 (unsigned long long)vdesc->vd_vblocknr);
298 return ret; 302 return ret;
299 } 303 }
300 bh->b_private = vdesc; 304 if (unlikely(!list_empty(&bh->b_assoc_buffers))) {
305 printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, "
306 "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n",
307 __func__, vdesc->vd_flags ? "node" : "data",
308 (unsigned long long)vdesc->vd_ino,
309 (unsigned long long)vdesc->vd_cno,
310 (unsigned long long)vdesc->vd_offset,
311 (unsigned long long)vdesc->vd_blocknr,
312 (unsigned long long)vdesc->vd_vblocknr);
313 brelse(bh);
314 return -EEXIST;
315 }
301 list_add_tail(&bh->b_assoc_buffers, buffers); 316 list_add_tail(&bh->b_assoc_buffers, buffers);
302 return 0; 317 return 0;
303} 318}
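The new list_empty() check refuses a buffer that is already queued elsewhere, failing fast with -EEXIST instead of corrupting the list (the follow-up hunk below then only WARNs when that bubbles up). A runnable sketch of the same guard on a minimal intrusive list:

    #include <errno.h>
    #include <stdio.h>

    struct list_node { struct list_node *prev, *next; };

    static void list_init(struct list_node *n) { n->prev = n->next = n; }
    static int  list_empty(const struct list_node *n) { return n->next == n; }

    static void list_add_tail(struct list_node *n, struct list_node *head)
    {
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
    }

    static int submit_buffer(struct list_node *bh, struct list_node *queue)
    {
        if (!list_empty(bh)) {          /* already queued elsewhere */
            fprintf(stderr, "conflicting buffer\n");
            return -EEXIST;             /* caller drops its reference */
        }
        list_add_tail(bh, queue);
        return 0;
    }

    int main(void)
    {
        struct list_node q, bh;

        list_init(&q);
        list_init(&bh);
        submit_buffer(&bh, &q);                             /* queued */
        return submit_buffer(&bh, &q) == -EEXIST ? 0 : 1;   /* refused */
    }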
@@ -335,24 +350,10 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
335 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { 350 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
336 ret = nilfs_gccache_wait_and_mark_dirty(bh); 351 ret = nilfs_gccache_wait_and_mark_dirty(bh);
337 if (unlikely(ret < 0)) { 352 if (unlikely(ret < 0)) {
338 if (ret == -EEXIST) {
339 vdesc = bh->b_private;
340 printk(KERN_CRIT
341 "%s: conflicting %s buffer: "
342 "ino=%llu, cno=%llu, offset=%llu, "
343 "blocknr=%llu, vblocknr=%llu\n",
344 __func__,
345 vdesc->vd_flags ? "node" : "data",
346 (unsigned long long)vdesc->vd_ino,
347 (unsigned long long)vdesc->vd_cno,
348 (unsigned long long)vdesc->vd_offset,
349 (unsigned long long)vdesc->vd_blocknr,
350 (unsigned long long)vdesc->vd_vblocknr);
351 }
353 WARN_ON(ret == -EEXIST);
352 goto failed; 354 goto failed;
353 } 355 }
354 list_del_init(&bh->b_assoc_buffers); 356 list_del_init(&bh->b_assoc_buffers);
355 bh->b_private = NULL;
356 brelse(bh); 357 brelse(bh);
357 } 358 }
358 return nmembs; 359 return nmembs;
@@ -360,7 +361,6 @@ static int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
360 failed: 361 failed:
361 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) { 362 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
362 list_del_init(&bh->b_assoc_buffers); 363 list_del_init(&bh->b_assoc_buffers);
363 bh->b_private = NULL;
364 brelse(bh); 364 brelse(bh);
365 } 365 }
366 return ret; 366 return ret;
@@ -471,7 +471,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
471 return 0; 471 return 0;
472 472
473 failed: 473 failed:
474 nilfs_remove_all_gcinode(nilfs);
475 printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n", 474 printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
476 msg, ret); 475 msg, ret);
477 return ret; 476 return ret;
@@ -560,6 +559,8 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
560 else 559 else
561 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); 560 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
562 561
562 if (ret < 0)
563 nilfs_remove_all_gcinode(nilfs);
563 clear_nilfs_gc_running(nilfs); 564 clear_nilfs_gc_running(nilfs);
564 565
565 out_free: 566 out_free:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index b18c4998f8d0..f6326112d647 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -433,7 +433,7 @@ static const struct address_space_operations def_mdt_aops = {
433}; 433};
434 434
435static const struct inode_operations def_mdt_iops; 435static const struct inode_operations def_mdt_iops;
436static struct file_operations def_mdt_fops; 436static const struct file_operations def_mdt_fops;
437 437
438/* 438/*
439 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile, 439 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index bad7368782d0..4da6f67e9a91 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -294,9 +294,9 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *);
294/* 294/*
295 * Inodes and files operations 295 * Inodes and files operations
296 */ 296 */
297extern struct file_operations nilfs_dir_operations; 297extern const struct file_operations nilfs_dir_operations;
298extern const struct inode_operations nilfs_file_inode_operations; 298extern const struct inode_operations nilfs_file_inode_operations;
299extern struct file_operations nilfs_file_operations; 299extern const struct file_operations nilfs_file_operations;
300extern const struct address_space_operations nilfs_aops; 300extern const struct address_space_operations nilfs_aops;
301extern const struct inode_operations nilfs_dir_inode_operations; 301extern const struct inode_operations nilfs_dir_inode_operations;
302extern const struct inode_operations nilfs_special_inode_operations; 302extern const struct inode_operations nilfs_special_inode_operations;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 683df89dbae5..6eff66a070d5 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2468,17 +2468,22 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
2468 /* Clear requests (even when the construction failed) */ 2468 /* Clear requests (even when the construction failed) */
2469 spin_lock(&sci->sc_state_lock); 2469 spin_lock(&sci->sc_state_lock);
2470 2470
2471 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2472
2473 if (req->mode == SC_LSEG_SR) { 2471 if (req->mode == SC_LSEG_SR) {
2472 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2474 sci->sc_seq_done = req->seq_accepted; 2473 sci->sc_seq_done = req->seq_accepted;
2475 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err); 2474 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err);
2476 sci->sc_flush_request = 0; 2475 sci->sc_flush_request = 0;
2477 } else if (req->mode == SC_FLUSH_FILE) 2476 } else {
2478 sci->sc_flush_request &= ~FLUSH_FILE_BIT; 2477 if (req->mode == SC_FLUSH_FILE)
2479 else if (req->mode == SC_FLUSH_DAT) 2478 sci->sc_flush_request &= ~FLUSH_FILE_BIT;
2480 sci->sc_flush_request &= ~FLUSH_DAT_BIT; 2479 else if (req->mode == SC_FLUSH_DAT)
2480 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2481 2481
2482 /* re-enable timer if checkpoint creation was not done */
2483 if (sci->sc_timer && (sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2484 time_before(jiffies, sci->sc_timer->expires))
2485 add_timer(sci->sc_timer);
2486 }
2482 spin_unlock(&sci->sc_state_lock); 2487 spin_unlock(&sci->sc_state_lock);
2483} 2488}
2484 2489
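The restructured branch also re-arms the timer when a commit request is still pending and its deadline has not passed, so a flush no longer cancels a queued checkpoint. A compact sketch of that decision, with hypothetical field names:

    #include <stdbool.h>
    #include <time.h>

    struct segctor {
        unsigned int state;        /* COMMIT bit still set? */
        time_t       expires;      /* pending timer deadline */
        bool         timer_armed;
    };

    #define SEGCTOR_COMMIT 0x1

    static void notify_done(struct segctor *sc, bool superblock_written)
    {
        if (superblock_written)
            sc->state &= ~SEGCTOR_COMMIT;          /* request satisfied */
        else if ((sc->state & SEGCTOR_COMMIT) && time(NULL) < sc->expires)
            sc->timer_armed = true;                /* re-enable the timer */
    }

    int main(void)
    {
        struct segctor sc = { SEGCTOR_COMMIT, time(NULL) + 10, false };

        notify_done(&sc, false);       /* commit still pending: re-arm */
        return sc.timer_armed ? 0 : 1;
    }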
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 2224b4d07bf0..44a88a9fa2c8 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -124,10 +124,10 @@ int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
124 while (*s && len > 0) { 124 while (*s && len > 0) {
125 if (*s & 0x80) { 125 if (*s & 0x80) {
126 size = utf8_to_utf32(s, len, &u); 126 size = utf8_to_utf32(s, len, &u);
127 if (size < 0) {
128 /* Ignore character and move on */
129 size = 1;
130 } else if (u >= PLANE_SIZE) {
127 if (size < 0)
128 return -EINVAL;
129
130 if (u >= PLANE_SIZE) {
131 u -= PLANE_SIZE; 131 u -= PLANE_SIZE;
132 *op++ = (wchar_t) (SURROGATE_PAIR | 132 *op++ = (wchar_t) (SURROGATE_PAIR |
133 ((u >> 10) & SURROGATE_BITS)); 133 ((u >> 10) & SURROGATE_BITS));
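utf8s_to_utf16s() now rejects malformed input outright instead of silently skipping a byte. A standalone sketch of the stricter loop; decode_one() is a deliberately crude stand-in for utf8_to_utf32() that only accepts ASCII:

    #include <errno.h>
    #include <stdio.h>

    /* Decode one code point or refuse; a stand-in for utf8_to_utf32(). */
    static int decode_one(const unsigned char *s, unsigned int *out)
    {
        if (*s < 0x80) { *out = *s; return 1; }
        return -1;                      /* malformed (or multi-byte here) */
    }

    static int to_utf16(const unsigned char *s, int len, unsigned short *out)
    {
        int n = 0;

        while (len > 0 && *s) {
            unsigned int u;
            int size = decode_one(s, &u);

            if (size < 0)
                return -EINVAL;         /* was: size = 1; and carry on */
            out[n++] = (unsigned short)u;
            s += size;
            len -= size;
        }
        return n;
    }

    int main(void)
    {
        unsigned short buf[8];

        return to_utf16((const unsigned char *)"ok", 2, buf) == 2 ? 0 : 1;
    }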
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 828a889be909..7e54e52964dd 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -91,6 +91,7 @@ static int dnotify_handle_event(struct fsnotify_group *group,
91 struct dnotify_struct *dn; 91 struct dnotify_struct *dn;
92 struct dnotify_struct **prev; 92 struct dnotify_struct **prev;
93 struct fown_struct *fown; 93 struct fown_struct *fown;
94 __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD;
94 95
95 to_tell = event->to_tell; 96 to_tell = event->to_tell;
96 97
@@ -106,7 +107,7 @@ static int dnotify_handle_event(struct fsnotify_group *group,
106 spin_lock(&entry->lock); 107 spin_lock(&entry->lock);
107 prev = &dnentry->dn; 108 prev = &dnentry->dn;
108 while ((dn = *prev) != NULL) { 109 while ((dn = *prev) != NULL) {
109 if ((dn->dn_mask & event->mask) == 0) {
110 if ((dn->dn_mask & test_mask) == 0) {
110 prev = &dn->dn_next; 111 prev = &dn->dn_next;
111 continue; 112 continue;
112 } 113 }
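FS_EVENT_ON_CHILD describes where an event happened, not what it was, so it should arguably not participate in the interest test; masking it off keeps the marker bit from matching on its own. A demo of the presumed failure mode (the patch itself does not spell it out):

    #include <stdio.h>

    #define EV_CREATE    0x01
    #define EV_DELETE    0x02
    #define EV_ON_CHILD  0x80000000u   /* location marker, not an event type */

    static int wants_event(unsigned int dn_mask, unsigned int event_mask)
    {
        unsigned int test_mask = event_mask & ~EV_ON_CHILD;

        return (dn_mask & test_mask) != 0;
    }

    int main(void)
    {
        unsigned int watcher = EV_ON_CHILD | EV_CREATE;
        unsigned int event   = EV_ON_CHILD | EV_DELETE;

        /* The unmasked test matches on the marker bit alone (a spurious
         * notification); the masked test correctly reports no interest. */
        printf("unmasked: %d  masked: %d\n",
               (watcher & event) != 0, wants_event(watcher, event));
        return 0;
    }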
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index c8a07c65482b..3165d85aada2 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -324,11 +324,11 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
324 spin_lock(&group->mark_lock); 324 spin_lock(&group->mark_lock);
325 spin_lock(&inode->i_lock); 325 spin_lock(&inode->i_lock);
326 326
327 entry->group = group;
328 entry->inode = inode;
329
330 lentry = fsnotify_find_mark_entry(group, inode); 327 lentry = fsnotify_find_mark_entry(group, inode);
331 if (!lentry) { 328 if (!lentry) {
329 entry->group = group;
330 entry->inode = inode;
331
332 hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); 332 hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries);
333 list_add(&entry->g_list, &group->mark_entries); 333 list_add(&entry->g_list, &group->mark_entries);
334 334
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 3816d5750dd5..b8bf53b4c108 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -143,7 +143,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
143 /* remember, after old was put on the wait_q we aren't 143 /* remember, after old was put on the wait_q we aren't
144 * allowed to look at the inode any more, only thing 144 * allowed to look at the inode any more, only thing
145 * left to check was if the file_name is the same */ 145 * left to check was if the file_name is the same */
146 if (old->name_len &&
146 if (!old->name_len ||
147 !strcmp(old->file_name, new->file_name)) 147 !strcmp(old->file_name, new->file_name))
148 return true; 148 return true;
149 break; 149 break;
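With the old test, two nameless events (name_len == 0) could never compare equal, so they were never merged; the fixed condition treats missing names as matching. A sketch mirroring the patched condition:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct event {
        const char  *file_name;   /* NULL when name_len == 0 */
        unsigned int name_len;
    };

    /* Nameless events coalesce; named events coalesce only when the
     * names match. Mirrors the patched condition above. */
    static bool same_event(const struct event *old, const struct event *new)
    {
        return !old->name_len || strcmp(old->file_name, new->file_name) == 0;
    }

    int main(void)
    {
        struct event a = { NULL, 0 }, b = { NULL, 0 };

        printf("%d\n", same_event(&a, &b));   /* 1: merged under the fix */
        return 0;
    }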
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 09cc25d04611..c452d116b892 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -966,7 +966,7 @@ static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
966} 966}
967#endif /* CONFIG_DEBUG_FS */ 967#endif /* CONFIG_DEBUG_FS */
968 968
969static struct file_operations o2hb_debug_fops = { 969static const struct file_operations o2hb_debug_fops = {
970 .open = o2hb_debug_open, 970 .open = o2hb_debug_open,
971 .release = o2hb_debug_release, 971 .release = o2hb_debug_release,
972 .read = o2hb_debug_read, 972 .read = o2hb_debug_read,
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index cfb2be708abe..da794bc07a6c 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -207,7 +207,7 @@ static int nst_fop_release(struct inode *inode, struct file *file)
207 return seq_release_private(inode, file); 207 return seq_release_private(inode, file);
208} 208}
209 209
210static struct file_operations nst_seq_fops = { 210static const struct file_operations nst_seq_fops = {
211 .open = nst_fop_open, 211 .open = nst_fop_open,
212 .read = seq_read, 212 .read = seq_read,
213 .llseek = seq_lseek, 213 .llseek = seq_lseek,
@@ -388,7 +388,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
388 return seq_release_private(inode, file); 388 return seq_release_private(inode, file);
389} 389}
390 390
391static struct file_operations sc_seq_fops = { 391static const struct file_operations sc_seq_fops = {
392 .open = sc_fop_open, 392 .open = sc_fop_open,
393 .read = seq_read, 393 .read = seq_read,
394 .llseek = seq_lseek, 394 .llseek = seq_lseek,
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index ca46002ec10e..42b0bad7a612 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -478,7 +478,7 @@ bail:
478 return -ENOMEM; 478 return -ENOMEM;
479} 479}
480 480
481static struct file_operations debug_purgelist_fops = { 481static const struct file_operations debug_purgelist_fops = {
482 .open = debug_purgelist_open, 482 .open = debug_purgelist_open,
483 .release = debug_buffer_release, 483 .release = debug_buffer_release,
484 .read = debug_buffer_read, 484 .read = debug_buffer_read,
@@ -538,7 +538,7 @@ bail:
538 return -ENOMEM; 538 return -ENOMEM;
539} 539}
540 540
541static struct file_operations debug_mle_fops = { 541static const struct file_operations debug_mle_fops = {
542 .open = debug_mle_open, 542 .open = debug_mle_open,
543 .release = debug_buffer_release, 543 .release = debug_buffer_release,
544 .read = debug_buffer_read, 544 .read = debug_buffer_read,
@@ -741,7 +741,7 @@ static int debug_lockres_release(struct inode *inode, struct file *file)
741 return seq_release_private(inode, file); 741 return seq_release_private(inode, file);
742} 742}
743 743
744static struct file_operations debug_lockres_fops = { 744static const struct file_operations debug_lockres_fops = {
745 .open = debug_lockres_open, 745 .open = debug_lockres_open,
746 .release = debug_lockres_release, 746 .release = debug_lockres_release,
747 .read = seq_read, 747 .read = seq_read,
@@ -925,7 +925,7 @@ bail:
925 return -ENOMEM; 925 return -ENOMEM;
926} 926}
927 927
928static struct file_operations debug_state_fops = { 928static const struct file_operations debug_state_fops = {
929 .open = debug_state_open, 929 .open = debug_state_open,
930 .release = debug_buffer_release, 930 .release = debug_buffer_release,
931 .read = debug_buffer_read, 931 .read = debug_buffer_read,
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index b606496b72ec..39737613424a 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -202,7 +202,7 @@ out:
202 return ret; 202 return ret;
203} 203}
204 204
205static struct vm_operations_struct ocfs2_file_vm_ops = { 205static const struct vm_operations_struct ocfs2_file_vm_ops = {
206 .fault = ocfs2_fault, 206 .fault = ocfs2_fault,
207 .page_mkwrite = ocfs2_page_mkwrite, 207 .page_mkwrite = ocfs2_page_mkwrite,
208}; 208};
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4cc3c890a2cd..c0e48aeebb1c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -373,7 +373,7 @@ static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
373} 373}
374#endif /* CONFIG_DEBUG_FS */ 374#endif /* CONFIG_DEBUG_FS */
375 375
376static struct file_operations ocfs2_osb_debug_fops = { 376static const struct file_operations ocfs2_osb_debug_fops = {
377 .open = ocfs2_osb_debug_open, 377 .open = ocfs2_osb_debug_open,
378 .release = ocfs2_debug_release, 378 .release = ocfs2_debug_release,
379 .read = ocfs2_debug_read, 379 .read = ocfs2_debug_read,
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 3680bae335b5..b42d62419034 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -498,7 +498,7 @@ const struct inode_operations omfs_dir_inops = {
498 .rmdir = omfs_rmdir, 498 .rmdir = omfs_rmdir,
499}; 499};
500 500
501struct file_operations omfs_dir_operations = { 501const struct file_operations omfs_dir_operations = {
502 .read = generic_read_dir, 502 .read = generic_read_dir,
503 .readdir = omfs_readdir, 503 .readdir = omfs_readdir,
504 .llseek = generic_file_llseek, 504 .llseek = generic_file_llseek,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 4845fbb18e6e..399487c09364 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -322,7 +322,7 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
322 return generic_block_bmap(mapping, block, omfs_get_block); 322 return generic_block_bmap(mapping, block, omfs_get_block);
323} 323}
324 324
325struct file_operations omfs_file_operations = { 325const struct file_operations omfs_file_operations = {
326 .llseek = generic_file_llseek, 326 .llseek = generic_file_llseek,
327 .read = do_sync_read, 327 .read = do_sync_read,
328 .write = do_sync_write, 328 .write = do_sync_write,
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index df71039945ac..ebe2fdbe535e 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -44,14 +44,14 @@ extern int omfs_allocate_range(struct super_block *sb, int min_request,
44extern int omfs_clear_range(struct super_block *sb, u64 block, int count); 44extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
45 45
46/* dir.c */ 46/* dir.c */
47extern struct file_operations omfs_dir_operations; 47extern const struct file_operations omfs_dir_operations;
48extern const struct inode_operations omfs_dir_inops; 48extern const struct inode_operations omfs_dir_inops;
49extern int omfs_make_empty(struct inode *inode, struct super_block *sb); 49extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
50extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header, 50extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
51 u64 fsblock); 51 u64 fsblock);
52 52
53/* file.c */ 53/* file.c */
54extern struct file_operations omfs_file_operations; 54extern const struct file_operations omfs_file_operations;
55extern const struct inode_operations omfs_file_inops; 55extern const struct inode_operations omfs_file_inops;
56extern const struct address_space_operations omfs_aops; 56extern const struct address_space_operations omfs_aops;
57extern void omfs_make_empty_table(struct buffer_head *bh, int offset); 57extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
diff --git a/fs/pipe.c b/fs/pipe.c
index 52c415114838..ae17d026aaa3 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -777,36 +777,55 @@ pipe_rdwr_release(struct inode *inode, struct file *filp)
777static int 777static int
778pipe_read_open(struct inode *inode, struct file *filp) 778pipe_read_open(struct inode *inode, struct file *filp)
779{ 779{
780 /* We could have perhaps used atomic_t, but this and friends 780 int ret = -ENOENT;
781 below are the only places. So it doesn't seem worthwhile. */ 781
782 mutex_lock(&inode->i_mutex); 782 mutex_lock(&inode->i_mutex);
783 inode->i_pipe->readers++; 783
784 if (inode->i_pipe) {
785 ret = 0;
786 inode->i_pipe->readers++;
787 }
788
784 mutex_unlock(&inode->i_mutex); 789 mutex_unlock(&inode->i_mutex);
785 790
786 return 0; 791 return ret;
787} 792}
788 793
789static int 794static int
790pipe_write_open(struct inode *inode, struct file *filp) 795pipe_write_open(struct inode *inode, struct file *filp)
791{ 796{
797 int ret = -ENOENT;
798
792 mutex_lock(&inode->i_mutex); 799 mutex_lock(&inode->i_mutex);
793 inode->i_pipe->writers++; 800
801 if (inode->i_pipe) {
802 ret = 0;
803 inode->i_pipe->writers++;
804 }
805
794 mutex_unlock(&inode->i_mutex); 806 mutex_unlock(&inode->i_mutex);
795 807
796 return 0; 808 return ret;
797} 809}
798 810
799static int 811static int
800pipe_rdwr_open(struct inode *inode, struct file *filp) 812pipe_rdwr_open(struct inode *inode, struct file *filp)
801{ 813{
814 int ret = -ENOENT;
815
802 mutex_lock(&inode->i_mutex); 816 mutex_lock(&inode->i_mutex);
803 if (filp->f_mode & FMODE_READ) 817
804 inode->i_pipe->readers++; 818 if (inode->i_pipe) {
805 if (filp->f_mode & FMODE_WRITE) 819 ret = 0;
806 inode->i_pipe->writers++; 820 if (filp->f_mode & FMODE_READ)
821 inode->i_pipe->readers++;
822 if (filp->f_mode & FMODE_WRITE)
823 inode->i_pipe->writers++;
824 }
825
807 mutex_unlock(&inode->i_mutex); 826 mutex_unlock(&inode->i_mutex);
808 827
809 return 0; 828 return ret;
810} 829}
811 830
812/* 831/*
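All three pipe open paths gain the same guard: the last release can free inode->i_pipe before a racing open takes i_mutex, so the pipe pointer must be re-checked under the lock and the open failed with -ENOENT when it is gone. A userspace sketch of the check-under-lock shape:

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>

    struct pipe_state { int readers, writers; };

    struct pipe_inode {
        pthread_mutex_t    mutex;
        struct pipe_state *pipe;   /* NULL once the last user released it */
    };

    static int pipe_read_open(struct pipe_inode *inode)
    {
        int ret = -ENOENT;

        pthread_mutex_lock(&inode->mutex);
        /* the pipe may have been torn down between lookup and lock */
        if (inode->pipe) {
            inode->pipe->readers++;
            ret = 0;
        }
        pthread_mutex_unlock(&inode->mutex);
        return ret;
    }

    int main(void)
    {
        struct pipe_inode inode = { PTHREAD_MUTEX_INITIALIZER, NULL };

        /* with the pipe already gone, open now fails instead of
         * dereferencing a freed pointer */
        printf("%d\n", pipe_read_open(&inode));
        return 0;
    }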
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 0c6bc602e6c4..822c2d506518 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -322,6 +322,8 @@ static inline void task_context_switch_counts(struct seq_file *m,
322 p->nivcsw); 322 p->nivcsw);
323} 323}
324 324
325#ifdef CONFIG_MMU
326
325struct stack_stats { 327struct stack_stats {
326 struct vm_area_struct *vma; 328 struct vm_area_struct *vma;
327 unsigned long startpage; 329 unsigned long startpage;
@@ -402,6 +404,11 @@ static inline void task_show_stack_usage(struct seq_file *m,
402 mmput(mm); 404 mmput(mm);
403 } 405 }
404} 406}
407#else
408static void task_show_stack_usage(struct seq_file *m, struct task_struct *task)
409{
410}
411#endif /* CONFIG_MMU */
405 412
406int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 413int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
407 struct pid *pid, struct task_struct *task) 414 struct pid *pid, struct task_struct *task)
@@ -564,7 +571,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
564 rsslim, 571 rsslim,
565 mm ? mm->start_code : 0, 572 mm ? mm->start_code : 0,
566 mm ? mm->end_code : 0, 573 mm ? mm->end_code : 0,
567 (permitted) ? task->stack_start : 0,
574 (permitted && mm) ? task->stack_start : 0,
568 esp, 575 esp,
569 eip, 576 eip,
570 /* The signal information here is obsolete. 577 /* The signal information here is obsolete.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 837469a96598..af643b5aefe8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2597,8 +2597,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
2597 name.len = snprintf(buf, sizeof(buf), "%d", pid); 2597 name.len = snprintf(buf, sizeof(buf), "%d", pid);
2598 dentry = d_hash_and_lookup(mnt->mnt_root, &name); 2598 dentry = d_hash_and_lookup(mnt->mnt_root, &name);
2599 if (dentry) { 2599 if (dentry) {
2600 if (!(current->flags & PF_EXITING)) 2600 shrink_dcache_parent(dentry);
2601 shrink_dcache_parent(dentry);
2602 d_drop(dentry); 2601 d_drop(dentry);
2603 dput(dentry); 2602 dput(dentry);
2604 } 2603 }
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 56013371f9f3..a44a7897fd4d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -23,7 +23,6 @@
23#include <asm/io.h> 23#include <asm/io.h>
24#include <linux/list.h> 24#include <linux/list.h>
25#include <linux/ioport.h> 25#include <linux/ioport.h>
26#include <linux/mm.h>
27#include <linux/memory.h> 26#include <linux/memory.h>
28#include <asm/sections.h> 27#include <asm/sections.h>
29 28
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index c7bff4f603ff..a65239cfd97e 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -99,7 +99,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
99 "VmallocUsed: %8lu kB\n" 99 "VmallocUsed: %8lu kB\n"
100 "VmallocChunk: %8lu kB\n" 100 "VmallocChunk: %8lu kB\n"
101#ifdef CONFIG_MEMORY_FAILURE 101#ifdef CONFIG_MEMORY_FAILURE
102 "HardwareCorrupted: %8lu kB\n" 102 "HardwareCorrupted: %5lu kB\n"
103#endif 103#endif
104 , 104 ,
105 K(i.totalram), 105 K(i.totalram),
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2281c2cbfe2b..5033ce0d254b 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -94,6 +94,7 @@ static const struct file_operations proc_kpagecount_operations = {
94#define KPF_COMPOUND_TAIL 16 94#define KPF_COMPOUND_TAIL 16
95#define KPF_HUGE 17 95#define KPF_HUGE 17
96#define KPF_UNEVICTABLE 18 96#define KPF_UNEVICTABLE 18
97#define KPF_HWPOISON 19
97#define KPF_NOPAGE 20 98#define KPF_NOPAGE 20
98 99
99#define KPF_KSM 21 100#define KPF_KSM 21
@@ -180,6 +181,10 @@ static u64 get_uflags(struct page *page)
180 u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); 181 u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable);
181 u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); 182 u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked);
182 183
184#ifdef CONFIG_MEMORY_FAILURE
185 u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
186#endif
187
183#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR 188#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
184 u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); 189 u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
185#endif 190#endif
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
index b3208adf8e71..71e2b4d50a0a 100644
--- a/fs/romfs/storage.c
+++ b/fs/romfs/storage.c
@@ -253,11 +253,11 @@ ssize_t romfs_dev_strnlen(struct super_block *sb,
253 253
254#ifdef CONFIG_ROMFS_ON_MTD 254#ifdef CONFIG_ROMFS_ON_MTD
255 if (sb->s_mtd) 255 if (sb->s_mtd)
256 return romfs_mtd_strnlen(sb, pos, limit);
256 return romfs_mtd_strnlen(sb, pos, maxlen);
257#endif 257#endif
258#ifdef CONFIG_ROMFS_ON_BLOCK 258#ifdef CONFIG_ROMFS_ON_BLOCK
259 if (sb->s_bdev) 259 if (sb->s_bdev)
260 return romfs_blk_strnlen(sb, pos, limit);
260 return romfs_blk_strnlen(sb, pos, maxlen);
261#endif 261#endif
262 return -EIO; 262 return -EIO;
263} 263}
diff --git a/fs/select.c b/fs/select.c
index a201fc370223..fd38ce2e32e3 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/sched.h>
18#include <linux/syscalls.h> 19#include <linux/syscalls.h>
19#include <linux/module.h> 20#include <linux/module.h>
20#include <linux/slab.h> 21#include <linux/slab.h>
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 2524714bece1..60c702bc10ae 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -40,7 +40,7 @@ struct bin_buffer {
40 struct mutex mutex; 40 struct mutex mutex;
41 void *buffer; 41 void *buffer;
42 int mmapped; 42 int mmapped;
43 struct vm_operations_struct *vm_ops; 43 const struct vm_operations_struct *vm_ops;
44 struct file *file; 44 struct file *file;
45 struct hlist_node list; 45 struct hlist_node list;
46}; 46};
@@ -331,7 +331,7 @@ static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
331} 331}
332#endif 332#endif
333 333
334static struct vm_operations_struct bin_vm_ops = { 334static const struct vm_operations_struct bin_vm_ops = {
335 .open = bin_vma_open, 335 .open = bin_vma_open,
336 .close = bin_vma_close, 336 .close = bin_vma_close,
337 .fault = bin_fault, 337 .fault = bin_fault,
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 0050fc40e8c9..e0201837d244 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -21,6 +21,7 @@
21#include <linux/completion.h> 21#include <linux/completion.h>
22#include <linux/mutex.h> 22#include <linux/mutex.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/security.h>
24#include "sysfs.h" 25#include "sysfs.h"
25 26
26DEFINE_MUTEX(sysfs_mutex); 27DEFINE_MUTEX(sysfs_mutex);
@@ -285,6 +286,9 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
285 sysfs_put(sd->s_symlink.target_sd); 286 sysfs_put(sd->s_symlink.target_sd);
286 if (sysfs_type(sd) & SYSFS_COPY_NAME) 287 if (sysfs_type(sd) & SYSFS_COPY_NAME)
287 kfree(sd->s_name); 288 kfree(sd->s_name);
289 if (sd->s_iattr && sd->s_iattr->ia_secdata)
290 security_release_secctx(sd->s_iattr->ia_secdata,
291 sd->s_iattr->ia_secdata_len);
288 kfree(sd->s_iattr); 292 kfree(sd->s_iattr);
289 sysfs_free_ino(sd->s_ino); 293 sysfs_free_ino(sd->s_ino);
290 kmem_cache_free(sysfs_dir_cachep, sd); 294 kmem_cache_free(sysfs_dir_cachep, sd);
@@ -894,7 +898,8 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
894 898
895 mutex_lock(&sysfs_rename_mutex); 899 mutex_lock(&sysfs_rename_mutex);
896 BUG_ON(!sd->s_parent); 900 BUG_ON(!sd->s_parent);
897 new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root; 901 new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ?
902 new_parent_kobj->sd : &sysfs_root;
898 903
899 error = 0; 904 error = 0;
900 if (sd->s_parent == new_parent_sd) 905 if (sd->s_parent == new_parent_sd)
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 561a9c050cef..f5ea4680f15f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -268,7 +268,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
268 struct sysfs_open_dirent *od, *new_od = NULL; 268 struct sysfs_open_dirent *od, *new_od = NULL;
269 269
270 retry: 270 retry:
271 spin_lock(&sysfs_open_dirent_lock); 271 spin_lock_irq(&sysfs_open_dirent_lock);
272 272
273 if (!sd->s_attr.open && new_od) { 273 if (!sd->s_attr.open && new_od) {
274 sd->s_attr.open = new_od; 274 sd->s_attr.open = new_od;
@@ -281,7 +281,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
281 list_add_tail(&buffer->list, &od->buffers); 281 list_add_tail(&buffer->list, &od->buffers);
282 } 282 }
283 283
284 spin_unlock(&sysfs_open_dirent_lock); 284 spin_unlock_irq(&sysfs_open_dirent_lock);
285 285
286 if (od) { 286 if (od) {
287 kfree(new_od); 287 kfree(new_od);
@@ -315,8 +315,9 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
315 struct sysfs_buffer *buffer) 315 struct sysfs_buffer *buffer)
316{ 316{
317 struct sysfs_open_dirent *od = sd->s_attr.open; 317 struct sysfs_open_dirent *od = sd->s_attr.open;
318 unsigned long flags;
318 319
319 spin_lock(&sysfs_open_dirent_lock); 320 spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
320 321
321 list_del(&buffer->list); 322 list_del(&buffer->list);
322 if (atomic_dec_and_test(&od->refcnt)) 323 if (atomic_dec_and_test(&od->refcnt))
@@ -324,7 +325,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
324 else 325 else
325 od = NULL; 326 od = NULL;
326 327
327 spin_unlock(&sysfs_open_dirent_lock); 328 spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
328 329
329 kfree(od); 330 kfree(od);
330} 331}
@@ -456,8 +457,9 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
456void sysfs_notify_dirent(struct sysfs_dirent *sd) 457void sysfs_notify_dirent(struct sysfs_dirent *sd)
457{ 458{
458 struct sysfs_open_dirent *od; 459 struct sysfs_open_dirent *od;
460 unsigned long flags;
459 461
460 spin_lock(&sysfs_open_dirent_lock); 462 spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
461 463
462 od = sd->s_attr.open; 464 od = sd->s_attr.open;
463 if (od) { 465 if (od) {
@@ -465,7 +467,7 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd)
465 wake_up_interruptible(&od->poll); 467 wake_up_interruptible(&od->poll);
466 } 468 }
467 469
468 spin_unlock(&sysfs_open_dirent_lock); 470 spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
469} 471}
470EXPORT_SYMBOL_GPL(sysfs_notify_dirent); 472EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
471 473
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2e6481a7701c..1009adc8d602 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1534,7 +1534,7 @@ out_unlock:
1534 return err; 1534 return err;
1535} 1535}
1536 1536
1537static struct vm_operations_struct ubifs_file_vm_ops = { 1537static const struct vm_operations_struct ubifs_file_vm_ops = {
1538 .fault = filemap_fault, 1538 .fault = filemap_fault,
1539 .page_mkwrite = ubifs_vm_page_mkwrite, 1539 .page_mkwrite = ubifs_vm_page_mkwrite,
1540}; 1540};
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 381854461b28..c2e30eea74dc 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -186,19 +186,37 @@ xfs_destroy_ioend(
186} 186}
187 187
188/* 188/*
189 * If the end of the current ioend is beyond the current EOF,
190 * return the new EOF value, otherwise zero.
191 */
192STATIC xfs_fsize_t
193xfs_ioend_new_eof(
194 xfs_ioend_t *ioend)
195{
196 xfs_inode_t *ip = XFS_I(ioend->io_inode);
197 xfs_fsize_t isize;
198 xfs_fsize_t bsize;
199
200 bsize = ioend->io_offset + ioend->io_size;
201 isize = MAX(ip->i_size, ip->i_new_size);
202 isize = MIN(isize, bsize);
203 return isize > ip->i_d.di_size ? isize : 0;
204}
205
206/*
189 * Update on-disk file size now that data has been written to disk. 207 * Update on-disk file size now that data has been written to disk.
190 * The current in-memory file size is i_size. If a write is beyond 208 * The current in-memory file size is i_size. If a write is beyond
191 * eof i_new_size will be the intended file size until i_size is 209 * eof i_new_size will be the intended file size until i_size is
192 * updated. If this write does not extend all the way to the valid 210 * updated. If this write does not extend all the way to the valid
193 * file size then restrict this update to the end of the write. 211 * file size then restrict this update to the end of the write.
194 */ 212 */
213
195STATIC void 214STATIC void
196xfs_setfilesize( 215xfs_setfilesize(
197 xfs_ioend_t *ioend) 216 xfs_ioend_t *ioend)
198{ 217{
199 xfs_inode_t *ip = XFS_I(ioend->io_inode); 218 xfs_inode_t *ip = XFS_I(ioend->io_inode);
200 xfs_fsize_t isize; 219 xfs_fsize_t isize;
201 xfs_fsize_t bsize;
202 220
203 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); 221 ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
204 ASSERT(ioend->io_type != IOMAP_READ); 222 ASSERT(ioend->io_type != IOMAP_READ);
@@ -206,16 +224,10 @@ xfs_setfilesize(
206 if (unlikely(ioend->io_error)) 224 if (unlikely(ioend->io_error))
207 return; 225 return;
208 226
209 bsize = ioend->io_offset + ioend->io_size;
210
211 xfs_ilock(ip, XFS_ILOCK_EXCL); 227 xfs_ilock(ip, XFS_ILOCK_EXCL);
212 228 isize = xfs_ioend_new_eof(ioend);
213 isize = MAX(ip->i_size, ip->i_new_size); 229 if (isize) {
214 isize = MIN(isize, bsize);
215
216 if (ip->i_d.di_size < isize) {
217 ip->i_d.di_size = isize; 230 ip->i_d.di_size = isize;
218 ip->i_update_core = 1;
219 xfs_mark_inode_dirty_sync(ip); 231 xfs_mark_inode_dirty_sync(ip);
220 } 232 }
221 233
@@ -404,10 +416,16 @@ xfs_submit_ioend_bio(
404 struct bio *bio) 416 struct bio *bio)
405{ 417{
406 atomic_inc(&ioend->io_remaining); 418 atomic_inc(&ioend->io_remaining);
407
408 bio->bi_private = ioend; 419 bio->bi_private = ioend;
409 bio->bi_end_io = xfs_end_bio; 420 bio->bi_end_io = xfs_end_bio;
410 421
422 /*
423 * If the I/O is beyond EOF we mark the inode dirty immediately
424 * but don't update the inode size until I/O completion.
425 */
426 if (xfs_ioend_new_eof(ioend))
427 xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode));
428
411 submit_bio(WRITE, bio); 429 submit_bio(WRITE, bio);
412 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); 430 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
413 bio_put(bio); 431 bio_put(bio);
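xfs_ioend_new_eof() factors out one computation: clamp the ioend's end offset to the larger of the in-memory sizes, and report it only when it extends the on-disk size. A direct transcription of that logic into standalone C (struct and names simplified):

    #include <stdio.h>

    typedef long long fsize_t;

    struct inode_sizes {
        fsize_t i_size;      /* current in-memory size */
        fsize_t i_new_size;  /* intended size of an in-flight extension */
        fsize_t di_size;     /* on-disk size */
    };

    #define MAX(a, b) ((a) > (b) ? (a) : (b))
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    static fsize_t ioend_new_eof(const struct inode_sizes *ip,
                                 fsize_t io_offset, fsize_t io_size)
    {
        fsize_t bsize = io_offset + io_size;
        fsize_t isize = MIN(MAX(ip->i_size, ip->i_new_size), bsize);

        return isize > ip->di_size ? isize : 0;   /* 0: no size update */
    }

    int main(void)
    {
        struct inode_sizes ip = { .i_size = 4096, .i_new_size = 8192,
                                  .di_size = 4096 };

        printf("%lld\n", ioend_new_eof(&ip, 4096, 4096));  /* 8192 */
        return 0;
    }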
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 988d8f87bc0f..eff61e2732af 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -42,7 +42,7 @@
42 42
43#include <linux/dcache.h> 43#include <linux/dcache.h>
44 44
45static struct vm_operations_struct xfs_file_vm_ops; 45static const struct vm_operations_struct xfs_file_vm_ops;
46 46
47STATIC ssize_t 47STATIC ssize_t
48xfs_file_aio_read( 48xfs_file_aio_read(
@@ -176,14 +176,7 @@ xfs_file_fsync(
176 struct dentry *dentry, 176 struct dentry *dentry,
177 int datasync) 177 int datasync)
178{ 178{
179 struct inode *inode = dentry->d_inode; 179 struct xfs_inode *ip = XFS_I(dentry->d_inode);
180 struct xfs_inode *ip = XFS_I(inode);
181 int error;
182
183 /* capture size updates in I/O completion before writing the inode. */
184 error = filemap_fdatawait(inode->i_mapping);
185 if (error)
186 return error;
187 180
188 xfs_iflags_clear(ip, XFS_ITRUNCATED); 181 xfs_iflags_clear(ip, XFS_ITRUNCATED);
189 return -xfs_fsync(ip); 182 return -xfs_fsync(ip);
@@ -280,7 +273,7 @@ const struct file_operations xfs_dir_file_operations = {
280 .fsync = xfs_file_fsync, 273 .fsync = xfs_file_fsync,
281}; 274};
282 275
283static struct vm_operations_struct xfs_file_vm_ops = { 276static const struct vm_operations_struct xfs_file_vm_ops = {
284 .fault = filemap_fault, 277 .fault = filemap_fault,
285 .page_mkwrite = xfs_vm_page_mkwrite, 278 .page_mkwrite = xfs_vm_page_mkwrite,
286}; 279};
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index da0159d99f82..cd42ef78f6b5 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -57,19 +57,22 @@
57#include <linux/fiemap.h> 57#include <linux/fiemap.h>
58 58
59/* 59/*
60 * Bring the atime in the XFS inode uptodate. 60 * Bring the timestamps in the XFS inode uptodate.
61 * Used before logging the inode to disk or when the Linux inode goes away. 61 *
62 * Used before writing the inode to disk.
62 */ 63 */
63void 64void
64xfs_synchronize_atime( 65xfs_synchronize_times(
65 xfs_inode_t *ip) 66 xfs_inode_t *ip)
66{ 67{
67 struct inode *inode = VFS_I(ip); 68 struct inode *inode = VFS_I(ip);
68 69
69 if (!(inode->i_state & I_CLEAR)) { 70 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
70 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; 71 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
71 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; 72 ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
72 } 73 ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
74 ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
75 ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
73} 76}
74 77
75/* 78/*
@@ -106,32 +109,20 @@ xfs_ichgtime(
106 if ((flags & XFS_ICHGTIME_MOD) && 109 if ((flags & XFS_ICHGTIME_MOD) &&
107 !timespec_equal(&inode->i_mtime, &tv)) { 110 !timespec_equal(&inode->i_mtime, &tv)) {
108 inode->i_mtime = tv; 111 inode->i_mtime = tv;
109 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
110 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
111 sync_it = 1; 112 sync_it = 1;
112 } 113 }
113 if ((flags & XFS_ICHGTIME_CHG) && 114 if ((flags & XFS_ICHGTIME_CHG) &&
114 !timespec_equal(&inode->i_ctime, &tv)) { 115 !timespec_equal(&inode->i_ctime, &tv)) {
115 inode->i_ctime = tv; 116 inode->i_ctime = tv;
116 ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
117 ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
118 sync_it = 1; 117 sync_it = 1;
119 } 118 }
120 119
121 /* 120 /*
122 * We update the i_update_core field _after_ changing 121 * Update complete - now make sure everyone knows that the inode
123 * the timestamps in order to coordinate properly with 122 * is dirty.
124 * xfs_iflush() so that we don't lose timestamp updates.
125 * This keeps us from having to hold the inode lock
126 * while doing this. We use the SYNCHRONIZE macro to
127 * ensure that the compiler does not reorder the update
128 * of i_update_core above the timestamp updates above.
129 */ 123 */
130 if (sync_it) { 124 if (sync_it)
131 SYNCHRONIZE();
132 ip->i_update_core = 1;
133 xfs_mark_inode_dirty_sync(ip); 125 xfs_mark_inode_dirty_sync(ip);
134 }
135} 126}
136 127
137/* 128/*
@@ -506,10 +497,8 @@ xfs_vn_getattr(
506 stat->gid = ip->i_d.di_gid; 497 stat->gid = ip->i_d.di_gid;
507 stat->ino = ip->i_ino; 498 stat->ino = ip->i_ino;
508 stat->atime = inode->i_atime; 499 stat->atime = inode->i_atime;
509 stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec; 500 stat->mtime = inode->i_mtime;
510 stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; 501 stat->ctime = inode->i_ctime;
511 stat->ctime.tv_sec = ip->i_d.di_ctime.t_sec;
512 stat->ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
513 stat->blocks = 502 stat->blocks =
514 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); 503 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
515 504
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 49e4a6aea73c..072050f8d346 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -667,7 +667,7 @@ start:
667 xip->i_new_size = new_size; 667 xip->i_new_size = new_size;
668 668
669 if (likely(!(ioflags & IO_INVIS))) 669 if (likely(!(ioflags & IO_INVIS)))
670 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 670 file_update_time(file);
671 671
672 /* 672 /*
673 * If the offset is beyond the size of the file, we have a couple 673 * If the offset is beyond the size of the file, we have a couple
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 9e41f91aa269..3d4a0c84d634 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -80,7 +80,7 @@ xfs_fs_set_xstate(
80 80
81 if (sb->s_flags & MS_RDONLY) 81 if (sb->s_flags & MS_RDONLY)
82 return -EROFS; 82 return -EROFS;
83 if (!XFS_IS_QUOTA_RUNNING(mp))
83 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
84 return -ENOSYS; 84 return -ENOSYS;
85 if (!capable(CAP_SYS_ADMIN)) 85 if (!capable(CAP_SYS_ADMIN))
86 return -EPERM; 86 return -EPERM;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bdd41c8c342f..18a4b8e11df2 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -977,6 +977,28 @@ xfs_fs_inode_init_once(
977} 977}
978 978
979/* 979/*
980 * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
981 * we catch unlogged VFS level updates to the inode. Care must be taken
982 * here - the transaction code calls mark_inode_dirty_sync() to mark the
983 * VFS inode dirty in a transaction and clears the i_update_core field;
984 * it must clear the field after calling mark_inode_dirty_sync() to
985 * correctly indicate that the dirty state has been propagated into the
986 * inode log item.
987 *
988 * We need the barrier() to maintain correct ordering between unlogged
989 * updates and the transaction commit code that clears the i_update_core
990 * field. This requires all updates to be completed before marking the
991 * inode dirty.
992 */
993STATIC void
994xfs_fs_dirty_inode(
995 struct inode *inode)
996{
997 barrier();
998 XFS_I(inode)->i_update_core = 1;
999}
1000
1001/*
980 * Attempt to flush the inode, this will actually fail 1002 * Attempt to flush the inode, this will actually fail
981 * if the inode is pinned, but we dirty the inode again 1003 * if the inode is pinned, but we dirty the inode again
982 * at the point when it is unpinned after a log write, 1004 * at the point when it is unpinned after a log write,
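The comment block above is precise about ordering: every unlogged update must be complete before i_update_core is set, hence the barrier(). A minimal C11 analogue, using a compiler fence in place of the kernel's barrier(); this is an illustration, not the kernel code:

    #include <stdatomic.h>

    struct xinode {
        long long size;           /* stand-in for an unlogged VFS update */
        int       i_update_core;  /* "has unlogged changes" flag */
    };

    static void dirty_inode(struct xinode *ip, long long new_size)
    {
        ip->size = new_size;
        /* compiler barrier: the store above must not be reordered
         * past the flag store that publishes it */
        atomic_signal_fence(memory_order_seq_cst);
        ip->i_update_core = 1;
    }

    int main(void)
    {
        struct xinode ip = { 0, 0 };

        dirty_inode(&ip, 4096);
        return ip.i_update_core ? 0 : 1;
    }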
@@ -1126,7 +1148,7 @@ xfs_fs_put_super(
1126} 1148}
1127 1149
1128STATIC int 1150STATIC int
1129xfs_fs_sync_super( 1151xfs_fs_sync_fs(
1130 struct super_block *sb, 1152 struct super_block *sb,
1131 int wait) 1153 int wait)
1132{ 1154{
@@ -1134,23 +1156,23 @@ xfs_fs_sync_super(
1134 int error; 1156 int error;
1135 1157
1136 /* 1158 /*
1137 * Treat a sync operation like a freeze. This is to work 1159 * Not much we can do for the first async pass. Writing out the
1138 * around a race in sync_inodes() which works in two phases 1160 * superblock would be counter-productive as we are going to redirty
1139 * - an asynchronous flush, which can write out an inode 1161 * when writing out other data and metadata (and writing out a single
1140 * without waiting for file size updates to complete, and a 1162 * block is quite fast anyway).
1141 * synchronous flush, which wont do anything because the 1163 *
1142 * async flush removed the inode's dirty flag. Also 1164 * Try to asynchronously kick off quota syncing at least.
1143 * sync_inodes() will not see any files that just have
1144 * outstanding transactions to be flushed because we don't
1145 * dirty the Linux inode until after the transaction I/O
1146 * completes.
1147 */ 1165 */
1148 if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) 1166 if (!wait) {
1149 error = xfs_quiesce_data(mp); 1167 xfs_qm_sync(mp, SYNC_TRYLOCK);
1150 else 1168 return 0;
1151 error = xfs_sync_fsdata(mp, 0); 1169 }
1170
1171 error = xfs_quiesce_data(mp);
1172 if (error)
1173 return -error;
1152 1174
1153 if (unlikely(laptop_mode)) { 1175 if (laptop_mode) {
1154 int prev_sync_seq = mp->m_sync_seq; 1176 int prev_sync_seq = mp->m_sync_seq;
1155 1177
1156 /* 1178 /*
@@ -1169,7 +1191,7 @@ xfs_fs_sync_super(
1169 mp->m_sync_seq != prev_sync_seq); 1191 mp->m_sync_seq != prev_sync_seq);
1170 } 1192 }
1171 1193
1172 return -error; 1194 return 0;
1173} 1195}
1174 1196
1175STATIC int 1197STATIC int
@@ -1539,10 +1561,11 @@ xfs_fs_get_sb(
1539static const struct super_operations xfs_super_operations = { 1561static const struct super_operations xfs_super_operations = {
1540 .alloc_inode = xfs_fs_alloc_inode, 1562 .alloc_inode = xfs_fs_alloc_inode,
1541 .destroy_inode = xfs_fs_destroy_inode, 1563 .destroy_inode = xfs_fs_destroy_inode,
1564 .dirty_inode = xfs_fs_dirty_inode,
1542 .write_inode = xfs_fs_write_inode, 1565 .write_inode = xfs_fs_write_inode,
1543 .clear_inode = xfs_fs_clear_inode, 1566 .clear_inode = xfs_fs_clear_inode,
1544 .put_super = xfs_fs_put_super, 1567 .put_super = xfs_fs_put_super,
1545 .sync_fs = xfs_fs_sync_super, 1568 .sync_fs = xfs_fs_sync_fs,
1546 .freeze_fs = xfs_fs_freeze, 1569 .freeze_fs = xfs_fs_freeze,
1547 .statfs = xfs_fs_statfs, 1570 .statfs = xfs_fs_statfs,
1548 .remount_fs = xfs_fs_remount, 1571 .remount_fs = xfs_fs_remount,
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 320be6aea492..961df0a22c78 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -309,11 +309,15 @@ xfs_sync_attr(
309STATIC int 309STATIC int
310xfs_commit_dummy_trans( 310xfs_commit_dummy_trans(
311 struct xfs_mount *mp, 311 struct xfs_mount *mp,
312 uint log_flags) 312 uint flags)
313{ 313{
314 struct xfs_inode *ip = mp->m_rootip; 314 struct xfs_inode *ip = mp->m_rootip;
315 struct xfs_trans *tp; 315 struct xfs_trans *tp;
316 int error; 316 int error;
317 int log_flags = XFS_LOG_FORCE;
318
319 if (flags & SYNC_WAIT)
320 log_flags |= XFS_LOG_SYNC;
317 321
318 /* 322 /*
319 * Put a dummy transaction in the log to tell recovery 323 * Put a dummy transaction in the log to tell recovery
@@ -331,13 +335,12 @@ xfs_commit_dummy_trans(
331 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 335 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
332 xfs_trans_ihold(tp, ip); 336 xfs_trans_ihold(tp, ip);
333 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 337 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
334 /* XXX(hch): ignoring the error here.. */
335 error = xfs_trans_commit(tp, 0); 338 error = xfs_trans_commit(tp, 0);
336
337 xfs_iunlock(ip, XFS_ILOCK_EXCL); 339 xfs_iunlock(ip, XFS_ILOCK_EXCL);
338 340
341 /* the log force ensures this transaction is pushed to disk */
339 xfs_log_force(mp, 0, log_flags); 342 xfs_log_force(mp, 0, log_flags);
340 return 0;
343 return error;
341} 344}
342 345
343int 346int
@@ -385,7 +388,20 @@ xfs_sync_fsdata(
385 else 388 else
386 XFS_BUF_ASYNC(bp); 389 XFS_BUF_ASYNC(bp);
387 390
388 return xfs_bwrite(mp, bp); 391 error = xfs_bwrite(mp, bp);
392 if (error)
393 return error;
394
395 /*
396 * If this is a data integrity sync make sure all pending buffers
397 * are flushed out for the log coverage check below.
398 */
399 if (flags & SYNC_WAIT)
400 xfs_flush_buftarg(mp->m_ddev_targp, 1);
401
402 if (xfs_log_need_covered(mp))
403 error = xfs_commit_dummy_trans(mp, flags);
404 return error;
389 405
390 out_brelse: 406 out_brelse:
391 xfs_buf_relse(bp); 407 xfs_buf_relse(bp);
@@ -419,14 +435,16 @@ xfs_quiesce_data(
419 /* push non-blocking */ 435 /* push non-blocking */
420 xfs_sync_data(mp, 0); 436 xfs_sync_data(mp, 0);
421 xfs_qm_sync(mp, SYNC_TRYLOCK); 437 xfs_qm_sync(mp, SYNC_TRYLOCK);
422 xfs_filestream_flush(mp);
423 438
424 /* push and block */ 439 /* push and block till complete */
425 xfs_sync_data(mp, SYNC_WAIT); 440 xfs_sync_data(mp, SYNC_WAIT);
426 xfs_qm_sync(mp, SYNC_WAIT); 441 xfs_qm_sync(mp, SYNC_WAIT);
427 442
443 /* drop inode references pinned by filestreams */
444 xfs_filestream_flush(mp);
445
428 /* write superblock and hoover up shutdown errors */ 446 /* write superblock and hoover up shutdown errors */
429 error = xfs_sync_fsdata(mp, 0);
447 error = xfs_sync_fsdata(mp, SYNC_WAIT);
430 448
431 /* flush data-only devices */ 449 /* flush data-only devices */
432 if (mp->m_rtdev_targp) 450 if (mp->m_rtdev_targp)
@@ -570,8 +588,6 @@ xfs_sync_worker(
570 /* dgc: errors ignored here */ 588 /* dgc: errors ignored here */
571 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 589 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
572 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK); 590 error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
573 if (xfs_log_need_covered(mp))
574 error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
575 } 591 }
576 mp->m_sync_seq++; 592 mp->m_sync_seq++;
577 wake_up(&mp->m_wait_single_sync_task); 593 wake_up(&mp->m_wait_single_sync_task);
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 4e4276b956e8..5d1a3b98a6e6 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -876,7 +876,6 @@ xfs_dqrele_inode(
876 ip->i_gdquot = NULL; 876 ip->i_gdquot = NULL;
877 } 877 }
878 xfs_iput(ip, XFS_ILOCK_EXCL); 878 xfs_iput(ip, XFS_ILOCK_EXCL);
879 IRELE(ip);
880 879
881 return 0; 880 return 0;
882} 881}
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 7465f9ee125f..ab89a7e94a0f 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -206,10 +206,10 @@ xfs_swap_extents(
206 * process that the file was not changed out from 206 * process that the file was not changed out from
207 * under it. 207 * under it.
208 */ 208 */
209 if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) ||
210 (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) ||
211 (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
212 (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
209 if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
210 (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
211 (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
212 (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
213 error = XFS_ERROR(EBUSY); 213 error = XFS_ERROR(EBUSY);
214 goto out_unlock; 214 goto out_unlock;
215 } 215 }
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index fa913e459442..41ad537c49e9 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -854,6 +854,7 @@ xfs_dir2_leaf_getdents(
 	 */
 	ra_want = howmany(bufsize + mp->m_dirblksize,
 			  mp->m_sb.sb_blocksize) - 1;
+	ASSERT(ra_want >= 0);
 
 	/*
 	 * If we don't have as many as we want, and we haven't
@@ -1088,7 +1089,8 @@ xfs_dir2_leaf_getdents(
 		 */
 		ptr += length;
 		curoff += length;
-		bufsize -= length;
+		/* bufsize may have just been a guess; don't go negative */
+		bufsize = bufsize > length ? bufsize - length : 0;
 	}
 
 	/*
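
Because bufsize starts out as a caller-supplied guess, the old bufsize -= length could wrap past zero when length exceeded what was left. A user-space sketch of the clamped subtraction the hunk switches to; names are hypothetical and only libc is assumed:

#include <stdio.h>
#include <stddef.h>

static size_t consume(size_t bufsize, size_t length)
{
	/* bufsize may have been a guess; clamp instead of wrapping */
	return bufsize > length ? bufsize - length : 0;
}

int main(void)
{
	size_t bufsize = 100;

	bufsize -= 120;				/* wraps to a huge value */
	printf("naive:   %zu\n", bufsize);
	printf("clamped: %zu\n", consume(100, 120));	/* prints 0 */
	return 0;
}
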
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index ab64f3efb43b..0785797db828 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -880,6 +880,7 @@ nextag:
 				 * Not in range - save last search
 				 * location and allocate a new inode
 				 */
+				xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
 				pag->pagl_leftrec = trec.ir_startino;
 				pag->pagl_rightrec = rec.ir_startino;
 				pag->pagl_pagino = pagino;
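
The added xfs_btree_del_cursor() frees the duplicated search cursor on a path that previously leaked it. A sketch of the pattern: every early exit from a search needs the same cleanup as the normal path. The types below are stand-ins, not the btree API:

#include <stdio.h>
#include <stdlib.h>

struct cursor { int pos; };

static struct cursor *dup_cursor(void)
{
	return calloc(1, sizeof(struct cursor));
}

static int search(int in_range)
{
	struct cursor *tcur = dup_cursor();

	if (!tcur)
		return -1;
	if (!in_range) {
		/* early exits need the same cleanup as the main path */
		free(tcur);
		return 0;	/* caller will allocate a new inode */
	}
	/* ... use tcur for the in-range case ... */
	free(tcur);
	return 1;
}

int main(void)
{
	printf("%d\n", search(0));
	return 0;
}
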
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c1dc7ef5a1d8..b92a4fa2a0a1 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3068,9 +3068,9 @@ xfs_iflush_int(
 	SYNCHRONIZE();
 
 	/*
-	 * Make sure to get the latest atime from the Linux inode.
+	 * Make sure to get the latest timestamps from the Linux inode.
 	 */
-	xfs_synchronize_atime(ip);
+	xfs_synchronize_times(ip);
 
 	if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
 			   mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0b38b9a869ec..41555de1d1db 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -504,7 +504,7 @@ void xfs_ichgtime(xfs_inode_t *, int);
 void		xfs_lock_inodes(xfs_inode_t **, int, uint);
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
-void		xfs_synchronize_atime(xfs_inode_t *);
+void		xfs_synchronize_times(xfs_inode_t *);
 void		xfs_mark_inode_dirty_sync(xfs_inode_t *);
 
 #if defined(XFS_INODE_TRACE)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 47d5b663c37e..9794b876d6ff 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -232,6 +232,15 @@ xfs_inode_item_format(
 	nvecs = 1;
 
 	/*
+	 * Make sure the linux inode is dirty. We do this before
+	 * clearing i_update_core as the VFS will call back into
+	 * XFS here and set i_update_core, so we need to dirty the
+	 * inode first so that the ordering of i_update_core and
+	 * unlogged modifications still works as described below.
+	 */
+	xfs_mark_inode_dirty_sync(ip);
+
+	/*
 	 * Clear i_update_core if the timestamps (or any other
 	 * non-transactional modification) need flushing/logging
 	 * and we're about to log them with the rest of the core.
@@ -263,14 +272,9 @@ xfs_inode_item_format(
 	}
 
 	/*
-	 * Make sure to get the latest atime from the Linux inode.
+	 * Make sure to get the latest timestamps from the Linux inode.
 	 */
-	xfs_synchronize_atime(ip);
-
-	/*
-	 * make sure the linux inode is dirty
-	 */
-	xfs_mark_inode_dirty_sync(ip);
+	xfs_synchronize_times(ip);
 
 	vecp->i_addr = (xfs_caddr_t)&ip->i_d;
 	vecp->i_len = sizeof(struct xfs_icdinode);
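
The moved xfs_mark_inode_dirty_sync() call matters because dirtying the VFS inode calls back into XFS and sets i_update_core; doing it before the flag is cleared keeps the flag's meaning intact. A toy sketch of that ordering constraint, assuming only a callback that re-sets the flag; all names are hypothetical:

#include <stdio.h>
#include <stdbool.h>

static bool update_core;	/* "unlogged changes pending" flag */

static void mark_dirty(void)
{
	/* stand-in for the VFS calling back in and setting the flag */
	update_core = true;
}

int main(void)
{
	update_core = true;		/* pending unlogged changes */

	mark_dirty();			/* 1: callback re-sets the flag */
	update_core = false;		/* 2: clear it afterwards */

	/* reversed order would leave the flag set with nothing pending */
	printf("update_core = %d\n", update_core);	/* prints 0 */
	return 0;
}
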
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index b68f9107e26c..62efab2f3839 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -59,6 +59,7 @@ xfs_bulkstat_one_iget(
 {
 	xfs_icdinode_t	*dic;		/* dinode core info pointer */
 	xfs_inode_t	*ip;		/* incore inode pointer */
+	struct inode	*inode;
 	int		error;
 
 	error = xfs_iget(mp, NULL, ino,
@@ -72,6 +73,7 @@ xfs_bulkstat_one_iget(
 	ASSERT(ip->i_imap.im_blkno != 0);
 
 	dic = &ip->i_d;
+	inode = VFS_I(ip);
 
 	/* xfs_iget returns the following without needing
 	 * further change.
@@ -83,16 +85,19 @@ xfs_bulkstat_one_iget(
 	buf->bs_uid = dic->di_uid;
 	buf->bs_gid = dic->di_gid;
 	buf->bs_size = dic->di_size;
+
 	/*
-	 * We are reading the atime from the Linux inode because the
-	 * dinode might not be uptodate.
+	 * We need to read the timestamps from the Linux inode because
+	 * the VFS keeps writing directly into the inode structure instead
+	 * of telling us about the updates.
 	 */
-	buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec;
-	buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec;
-	buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
-	buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
-	buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
-	buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
+	buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
+	buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
+	buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
+	buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
+	buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
+	buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
+
 	buf->bs_xflags = xfs_ip2xflags(ip);
 	buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
 	buf->bs_extents = dic->di_nextents;
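
The hunk above caches VFS_I(ip) in a local and reads all three timestamps from it, because the cached dinode copy can lag behind the live inode that the VFS writes into directly. A compact sketch of the stale-copy problem; the struct names are invented for illustration:

#include <stdio.h>
#include <time.h>

struct icdinode  { time_t mtime; };	/* cached on-disk copy */
struct vfs_inode { time_t i_mtime; };	/* live in-memory copy */

int main(void)
{
	struct icdinode core = { .mtime = 100 };
	struct vfs_inode inode = { .i_mtime = 100 };

	inode.i_mtime = 200;	/* the VFS updates only the live copy */

	/* a bulkstat-style report must treat the live copy as truth */
	printf("stale: %ld, live: %ld\n",
	       (long)core.mtime, (long)inode.i_mtime);
	return 0;
}
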
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1099395d7d6c..fb17f8226b09 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1980,7 +1980,7 @@ xlog_recover_do_reg_buffer(
1980 "XFS: NULL dquot in %s.", __func__); 1980 "XFS: NULL dquot in %s.", __func__);
1981 goto next; 1981 goto next;
1982 } 1982 }
1983 if (item->ri_buf[i].i_len < sizeof(xfs_dqblk_t)) { 1983 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1984 cmn_err(CE_ALERT, 1984 cmn_err(CE_ALERT,
1985 "XFS: dquot too small (%d) in %s.", 1985 "XFS: dquot too small (%d) in %s.",
1986 item->ri_buf[i].i_len, __func__); 1986 item->ri_buf[i].i_len, __func__);
@@ -2635,7 +2635,7 @@ xlog_recover_do_dquot_trans(
2635 "XFS: NULL dquot in %s.", __func__); 2635 "XFS: NULL dquot in %s.", __func__);
2636 return XFS_ERROR(EIO); 2636 return XFS_ERROR(EIO);
2637 } 2637 }
2638 if (item->ri_buf[1].i_len < sizeof(xfs_dqblk_t)) { 2638 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2639 cmn_err(CE_ALERT, 2639 cmn_err(CE_ALERT,
2640 "XFS: dquot too small (%d) in %s.", 2640 "XFS: dquot too small (%d) in %s.",
2641 item->ri_buf[1].i_len, __func__); 2641 item->ri_buf[1].i_len, __func__);
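
Both checks above were rejecting log items smaller than a full xfs_dqblk_t even though recovery only reads the embedded xfs_disk_dquot_t, so the bound is now sized to the structure actually dereferenced. A sketch of that principle with hypothetical types, assuming only libc:

#include <stdio.h>
#include <string.h>
#include <stddef.h>

struct payload { int magic; int id; };		/* what we actually read */
struct record  { struct payload p; char pad[24]; };	/* on-disk shape */

static int use_payload(const void *buf, size_t len)
{
	struct payload p;

	/* only sizeof(p) bytes are read, so that is the bound to check */
	if (len < sizeof(p))
		return -1;	/* reject genuinely short buffers */
	memcpy(&p, buf, sizeof(p));
	return p.id;
}

int main(void)
{
	struct payload small = { .magic = 1, .id = 42 };

	/* valid: len covers the payload even without the record padding */
	printf("id = %d\n", use_payload(&small, sizeof(small)));
	return 0;
}
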
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index f31271c30de9..2ffc570679be 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -467,6 +467,7 @@ xfs_trans_ail_update(
 {
 	xfs_log_item_t		*dlip = NULL;
 	xfs_log_item_t		*mlip;	/* ptr to minimum lip */
+	xfs_lsn_t		tail_lsn;
 
 	mlip = xfs_ail_min(ailp);
 
@@ -483,8 +484,16 @@ xfs_trans_ail_update(
 
 	if (mlip == dlip) {
 		mlip = xfs_ail_min(ailp);
+		/*
+		 * It is not safe to access mlip after the AIL lock is
+		 * dropped, so we must get a copy of li_lsn before we do
+		 * so. This is especially important on 32-bit platforms
+		 * where accessing and updating 64-bit values like li_lsn
+		 * is not atomic.
+		 */
+		tail_lsn = mlip->li_lsn;
 		spin_unlock(&ailp->xa_lock);
-		xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn);
+		xfs_log_move_tail(ailp->xa_mount, tail_lsn);
 	} else {
 		spin_unlock(&ailp->xa_lock);
 	}
@@ -514,6 +523,7 @@ xfs_trans_ail_delete(
 {
 	xfs_log_item_t		*dlip;
 	xfs_log_item_t		*mlip;
+	xfs_lsn_t		tail_lsn;
 
 	if (lip->li_flags & XFS_LI_IN_AIL) {
 		mlip = xfs_ail_min(ailp);
@@ -527,9 +537,16 @@ xfs_trans_ail_delete(
 
 		if (mlip == dlip) {
 			mlip = xfs_ail_min(ailp);
+			/*
+			 * It is not safe to access mlip after the AIL lock
+			 * is dropped, so we must get a copy of li_lsn
+			 * before we do so. This is especially important
+			 * on 32-bit platforms where accessing and updating
+			 * 64-bit values like li_lsn is not atomic.
+			 */
+			tail_lsn = mlip ? mlip->li_lsn : 0;
 			spin_unlock(&ailp->xa_lock);
-			xfs_log_move_tail(ailp->xa_mount,
-					(mlip ? mlip->li_lsn : 0));
+			xfs_log_move_tail(ailp->xa_mount, tail_lsn);
 		} else {
 			spin_unlock(&ailp->xa_lock);
 		}
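
Both hunks above snapshot li_lsn into tail_lsn while the AIL lock is still held, because on a 32-bit machine a 64-bit read is two loads and reading mlip->li_lsn after spin_unlock() could observe a torn value. A user-space sketch of the copy-before-unlock pattern, using pthreads as a stand-in for the kernel spinlock; all names are illustrative:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t shared_lsn = 0x100000001ULL;	/* protected by lock */

static void move_tail(uint64_t lsn)
{
	printf("moving tail to %llx\n", (unsigned long long)lsn);
}

int main(void)
{
	uint64_t tail_lsn;

	pthread_mutex_lock(&lock);
	tail_lsn = shared_lsn;		/* copy while still protected */
	pthread_mutex_unlock(&lock);
	move_tail(tail_lsn);		/* safe: uses the snapshot */
	return 0;
}
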
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index a434f287962d..b572f7e840e0 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2476,12 +2476,6 @@ xfs_reclaim(
 	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
 
 	/*
-	 * Make sure the atime in the XFS inode is correct before freeing the
-	 * Linux inode.
-	 */
-	xfs_synchronize_atime(ip);
-
-	/*
 	 * If we have nothing to flush with this inode then complete the
 	 * teardown now, otherwise break the link between the xfs inode and the
 	 * linux inode and clean up the xfs inode later. This avoids flushing