Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/Kconfig | 5
-rw-r--r--  fs/9p/vfs_inode.c | 4
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 11
-rw-r--r--  fs/Kconfig | 19
-rw-r--r--  fs/affs/namei.c | 5
-rw-r--r--  fs/afs/dir.c | 5
-rw-r--r--  fs/autofs4/root.c | 2
-rw-r--r--  fs/bfs/dir.c | 3
-rw-r--r--  fs/block_dev.c | 17
-rw-r--r--  fs/btrfs/extent_io.c | 9
-rw-r--r--  fs/btrfs/super.c | 2
-rw-r--r--  fs/buffer.c | 64
-rw-r--r--  fs/ceph/addr.c | 5
-rw-r--r--  fs/ceph/caps.c | 61
-rw-r--r--  fs/ceph/dir.c | 7
-rw-r--r--  fs/ceph/export.c | 25
-rw-r--r--  fs/ceph/mds_client.c | 7
-rw-r--r--  fs/ceph/mds_client.h | 1
-rw-r--r--  fs/coda/dir.c | 5
-rw-r--r--  fs/configfs/dir.c | 2
-rw-r--r--  fs/dcache.c | 8
-rw-r--r--  fs/drop_caches.c | 5
-rw-r--r--  fs/ecryptfs/inode.c | 5
-rw-r--r--  fs/exec.c | 12
-rw-r--r--  fs/ext3/super.c | 2
-rw-r--r--  fs/ext4/Makefile | 3
-rw-r--r--  fs/ext4/balloc.c | 146
-rw-r--r--  fs/ext4/ext4.h | 127
-rw-r--r--  fs/ext4/ext4_jbd2.c | 14
-rw-r--r--  fs/ext4/ext4_jbd2.h | 5
-rw-r--r--  fs/ext4/extents.c | 1410
-rw-r--r--  fs/ext4/file.c | 1
-rw-r--r--  fs/ext4/fsync.c | 25
-rw-r--r--  fs/ext4/inode.c | 114
-rw-r--r--  fs/ext4/mballoc.c | 459
-rw-r--r--  fs/ext4/mballoc.h | 6
-rw-r--r--  fs/ext4/migrate.c | 2
-rw-r--r--  fs/ext4/mmp.c | 351
-rw-r--r--  fs/ext4/move_extent.c | 3
-rw-r--r--  fs/ext4/namei.c | 82
-rw-r--r--  fs/ext4/page-io.c | 39
-rw-r--r--  fs/ext4/super.c | 206
-rw-r--r--  fs/ext4/xattr.c | 4
-rw-r--r--  fs/fat/namei_msdos.c | 5
-rw-r--r--  fs/fat/namei_vfat.c | 5
-rw-r--r--  fs/fscache/operation.c | 10
-rw-r--r--  fs/fscache/page.c | 13
-rw-r--r--  fs/fuse/dir.c | 6
-rw-r--r--  fs/gfs2/glock.c | 5
-rw-r--r--  fs/gfs2/quota.c | 12
-rw-r--r--  fs/gfs2/quota.h | 4
-rw-r--r--  fs/hfs/dir.c | 6
-rw-r--r--  fs/hfsplus/dir.c | 8
-rw-r--r--  fs/hostfs/hostfs_kern.c | 5
-rw-r--r--  fs/hpfs/namei.c | 9
-rw-r--r--  fs/hugetlbfs/inode.c | 7
-rw-r--r--  fs/inode.c | 9
-rw-r--r--  fs/jbd2/commit.c | 22
-rw-r--r--  fs/jbd2/journal.c | 58
-rw-r--r--  fs/jbd2/transaction.c | 22
-rw-r--r--  fs/jffs2/dir.c | 5
-rw-r--r--  fs/jfs/namei.c | 5
-rw-r--r--  fs/logfs/dir.c | 5
-rw-r--r--  fs/mbcache.c | 10
-rw-r--r--  fs/minix/namei.c | 5
-rw-r--r--  fs/mpage.c | 7
-rw-r--r--  fs/namei.c | 380
-rw-r--r--  fs/namespace.c | 2
-rw-r--r--  fs/ncpfs/dir.c | 5
-rw-r--r--  fs/ncpfs/inode.c | 4
-rw-r--r--  fs/nfs/dir.c | 5
-rw-r--r--  fs/nfs/internal.h | 2
-rw-r--r--  fs/nilfs2/namei.c | 5
-rw-r--r--  fs/ocfs2/super.c | 2
-rw-r--r--  fs/omfs/dir.c | 11
-rw-r--r--  fs/partitions/check.c | 8
-rw-r--r--  fs/proc/Makefile | 1
-rw-r--r--  fs/proc/base.c | 20
-rw-r--r--  fs/proc/generic.c | 1
-rw-r--r--  fs/proc/inode.c | 7
-rw-r--r--  fs/proc/internal.h | 26
-rw-r--r--  fs/proc/namespaces.c | 198
-rw-r--r--  fs/proc/task_mmu.c | 206
-rw-r--r--  fs/quota/dquot.c | 5
-rw-r--r--  fs/reiserfs/namei.c | 5
-rw-r--r--  fs/reiserfs/xattr.c | 1
-rw-r--r--  fs/splice.c | 33
-rw-r--r--  fs/super.c | 3
-rw-r--r--  fs/sysv/namei.c | 5
-rw-r--r--  fs/ubifs/dir.c | 5
-rw-r--r--  fs/udf/namei.c | 5
-rw-r--r--  fs/ufs/namei.c | 5
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.c | 29
-rw-r--r--  fs/xfs/linux-2.6/xfs_discard.h | 2
-rw-r--r--  fs/xfs/linux-2.6/xfs_super.c | 18
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c | 5
-rw-r--r--  fs/xfs/quota/xfs_qm.c | 6
-rw-r--r--  fs/xfs/xfs_ag.h | 3
-rw-r--r--  fs/xfs/xfs_alloc.c | 35
-rw-r--r--  fs/xfs/xfs_alloc.h | 5
-rw-r--r--  fs/xfs/xfs_alloc_btree.c | 3
-rw-r--r--  fs/xfs/xfs_bmap.c | 549
-rw-r--r--  fs/xfs/xfs_bmap.h | 2
-rw-r--r--  fs/xfs/xfs_inode.c | 15
-rw-r--r--  fs/xfs/xfs_inode.h | 1
-rw-r--r--  fs/xfs/xfs_log_cil.c | 13
-rw-r--r--  fs/xfs/xfs_mount.h | 1
-rw-r--r--  fs/xfs/xfs_trans.c | 2
109 files changed, 3339 insertions(+), 1835 deletions(-)
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 814ac4e213a8..0a93dc1cb4ac 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -1,6 +1,6 @@
 config 9P_FS
-	tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
-	depends on INET && NET_9P && EXPERIMENTAL
+	tristate "Plan 9 Resource Sharing Support (9P2000)"
+	depends on INET && NET_9P
 	help
 	  If you say Y here, you will get experimental support for
 	  Plan 9 resource sharing via the 9P2000 protocol.
@@ -10,7 +10,6 @@ config 9P_FS
 	  If unsure, say N.
 
 if 9P_FS
-
 config 9P_FSCACHE
 	bool "Enable 9P client caching support (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7f6c67703195..8d7f3e69ae29 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -814,6 +814,7 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 
 int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
 {
+	dentry_unhash(d);
 	return v9fs_remove(i, d, 1);
 }
 
@@ -839,6 +840,9 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct p9_fid *newdirfid;
 	struct p9_wstat wstat;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	P9_DPRINTK(P9_DEBUG_VFS, "\n");
 	retval = 0;
 	old_inode = old_dentry->d_inode;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 82a7c38ddad0..691c78f58bef 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -259,7 +259,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 		if (IS_ERR(inode_fid)) {
 			err = PTR_ERR(inode_fid);
 			mutex_unlock(&v9inode->v_mutex);
-			goto error;
+			goto err_clunk_old_fid;
 		}
 		v9inode->writeback_fid = (void *) inode_fid;
 	}
@@ -267,8 +267,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	/* Since we are opening a file, assign the open fid to the file */
 	filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
 	if (IS_ERR(filp)) {
-		p9_client_clunk(ofid);
-		return PTR_ERR(filp);
+		err = PTR_ERR(filp);
+		goto err_clunk_old_fid;
 	}
 	filp->private_data = ofid;
 #ifdef CONFIG_9P_FSCACHE
@@ -278,10 +278,11 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	return 0;
 
 error:
-	if (ofid)
-		p9_client_clunk(ofid);
 	if (fid)
 		p9_client_clunk(fid);
+err_clunk_old_fid:
+	if (ofid)
+		p9_client_clunk(ofid);
 	return err;
 }
 
diff --git a/fs/Kconfig b/fs/Kconfig
index f3aa9b08b228..19891aab9c6e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
 	def_bool n
 
 config EXPORTFS
-	bool
+	tristate
 
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EXPERT
@@ -124,6 +124,7 @@ config TMPFS
 config TMPFS_POSIX_ACL
 	bool "Tmpfs POSIX Access Control Lists"
 	depends on TMPFS
+	select TMPFS_XATTR
 	select GENERIC_ACL
 	help
 	  POSIX Access Control Lists (ACLs) support permissions for users and
@@ -134,6 +135,22 @@ config TMPFS_POSIX_ACL
 
 	  If you don't know what Access Control Lists are, say N.
 
+config TMPFS_XATTR
+	bool "Tmpfs extended attributes"
+	depends on TMPFS
+	default n
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  Currently this enables support for the trusted.* and
+	  security.* namespaces.
+
+	  You need this for POSIX ACL support on tmpfs.
+
+	  If unsure, say N.
+
 config HUGETLBFS
 	bool "HugeTLB file system support"
 	depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
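(Illustrative aside, not part of the patch: the new TMPFS_XATTR option above only wires up the trusted.* and security.* namespaces. A minimal userspace check against a tmpfs mount might look like the sketch below; the /dev/shm path and the attribute name are assumptions for the example, and trusted.* requires CAP_SYS_ADMIN.)

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/dev/shm/xattr-test";	/* /dev/shm is tmpfs on most setups */
	char buf[16];
	int fd = open(path, O_CREAT | O_RDWR, 0600);

	if (fd < 0)
		return 1;
	/* expect EOPNOTSUPP if the kernel was built without TMPFS_XATTR */
	if (fsetxattr(fd, "trusted.example", "1", 1, 0))
		perror("fsetxattr");
	else if (fgetxattr(fd, "trusted.example", buf, sizeof(buf)) == 1)
		printf("trusted.example=%c\n", buf[0]);
	close(fd);
	unlink(path);
	return 0;
}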
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index e3e9efc1fdd8..03330e2e390c 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -320,6 +320,8 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
 		 dentry->d_inode->i_ino,
 		 (int)dentry->d_name.len, dentry->d_name.name);
 
+	dentry_unhash(dentry);
+
 	return affs_remove_header(dentry);
 }
 
@@ -417,6 +419,9 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh = NULL;
 	int retval;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
 		 (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
 		 (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 20c106f24927..2c4e05160042 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,6 +845,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
 	_enter("{%x:%u},{%s}",
 	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
 
+	dentry_unhash(dentry);
+
 	ret = -ENAMETOOLONG;
 	if (dentry->d_name.len >= AFSNAMEMAX)
 		goto error;
@@ -1146,6 +1148,9 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct key *key;
 	int ret;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	vnode = AFS_FS_I(old_dentry->d_inode);
 	orig_dvnode = AFS_FS_I(old_dir);
 	new_dvnode = AFS_FS_I(new_dir);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index f55ae23b137e..87d95a8cddbc 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -583,6 +583,8 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
+	dentry_unhash(dentry);
+
 	if (atomic_dec_and_test(&ino->count)) {
 		p_ino = autofs4_dentry_ino(dentry->d_parent);
 		if (p_ino && dentry->d_parent != dentry)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index b14cebfd9047..c7d1d06b0483 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -224,6 +224,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct bfs_sb_info *info;
 	int error = -ENOENT;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	old_bh = new_bh = NULL;
 	old_inode = old_dentry->d_inode;
 	if (S_ISDIR(old_inode->i_mode))
diff --git a/fs/block_dev.c b/fs/block_dev.c
index bf9c7a720371..1f2b19978333 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1238,6 +1238,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 	res = __blkdev_get(bdev, mode, 0);
 
 	if (whole) {
+		struct gendisk *disk = whole->bd_disk;
+
 		/* finish claiming */
 		mutex_lock(&bdev->bd_mutex);
 		spin_lock(&bdev_lock);
@@ -1264,15 +1266,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 		spin_unlock(&bdev_lock);
 
 		/*
-		 * Block event polling for write claims. Any write
-		 * holder makes the write_holder state stick until all
-		 * are released. This is good enough and tracking
-		 * individual writeable reference is too fragile given
-		 * the way @mode is used in blkdev_get/put().
+		 * Block event polling for write claims if requested. Any
+		 * write holder makes the write_holder state stick until
+		 * all are released. This is good enough and tracking
+		 * individual writeable reference is too fragile given the
+		 * way @mode is used in blkdev_get/put().
 		 */
-		if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+		if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) &&
+		    !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
 			bdev->bd_write_holder = true;
-			disk_block_events(bdev->bd_disk);
+			disk_block_events(disk);
 		}
 
 		mutex_unlock(&bdev->bd_mutex);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 96fcfa522dab..4f9893243dae 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -11,6 +11,7 @@
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
 #include <linux/prefetch.h>
+#include <linux/cleancache.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "compat.h"
@@ -2016,6 +2017,13 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 
 	set_page_extent_mapped(page);
 
+	if (!PageUptodate(page)) {
+		if (cleancache_get_page(page) == 0) {
+			BUG_ON(blocksize != PAGE_SIZE);
+			goto out;
+		}
+	}
+
 	end = page_end;
 	while (1) {
 		lock_extent(tree, start, end, GFP_NOFS);
@@ -2149,6 +2157,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 		cur = cur + iosize;
 		page_offset += iosize;
 	}
+out:
 	if (!nr) {
 		if (!PageError(page))
 			SetPageUptodate(page);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0ac712efcdf2..be4ffa12f3ef 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -39,6 +39,7 @@
 #include <linux/miscdevice.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
+#include <linux/cleancache.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -624,6 +625,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	sb->s_root = root_dentry;
 
 	save_mount_options(sb, data);
+	cleancache_init_fs(sb);
 	return 0;
 
 fail_close:
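(Illustrative aside, not part of the patch: the cleancache hooks added to btrfs, ext3 and fs/buffer.c in this series follow one pattern, announce the superblock at mount time with cleancache_init_fs(), then try cleancache_get_page() before issuing real read I/O. A rough sketch with hypothetical myfs_* names, assuming only the calls visible in these hunks:)

#include <linux/cleancache.h>
#include <linux/fs.h>
#include <linux/mpage.h>
#include <linux/pagemap.h>

extern int myfs_get_block(struct inode *, sector_t, struct buffer_head *, int);

static int myfs_fill_super(struct super_block *sb, void *data, int silent)
{
	/* ... normal superblock setup elided ... */
	cleancache_init_fs(sb);		/* opt this superblock into cleancache */
	return 0;
}

static int myfs_readpage(struct file *file, struct page *page)
{
	/* cheap lookup in transcendent memory before touching the disk */
	if (cleancache_get_page(page) == 0) {
		SetPageUptodate(page);
		unlock_page(page);
		return 0;
	}
	return mpage_readpage(page, myfs_get_block);
}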
diff --git a/fs/buffer.c b/fs/buffer.c
index a08bb8e61c6f..698c6b2cc462 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <linux/cleancache.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
@@ -269,6 +270,10 @@ void invalidate_bdev(struct block_device *bdev)
 	invalidate_bh_lrus();
 	lru_add_drain_all();	/* make sure all lru add caches are flushed */
 	invalidate_mapping_pages(mapping, 0, -1);
+	/* 99% of the time, we don't need to flush the cleancache on the bdev.
+	 * But, for the strange corners, lets be cautious
+	 */
+	cleancache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(invalidate_bdev);
 
@@ -2331,24 +2336,26 @@ EXPORT_SYMBOL(block_commit_write);
  * page lock we can determine safely if the page is beyond EOF. If it is not
  * beyond EOF, then the page is guaranteed safe against truncation until we
  * unlock the page.
+ *
+ * Direct callers of this function should call vfs_check_frozen() so that page
+ * fault does not busyloop until the fs is thawed.
  */
-int
-block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-		   get_block_t get_block)
+int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+			 get_block_t get_block)
 {
 	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	unsigned long end;
 	loff_t size;
-	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+	int ret;
 
 	lock_page(page);
 	size = i_size_read(inode);
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_offset(page) > size)) {
-		/* page got truncated out from underneath us */
-		unlock_page(page);
-		goto out;
+		/* We overload EFAULT to mean page got truncated */
+		ret = -EFAULT;
+		goto out_unlock;
 	}
 
 	/* page is wholly or partially inside EOF */
@@ -2361,18 +2368,41 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (!ret)
 		ret = block_commit_write(page, 0, end);
 
-	if (unlikely(ret)) {
-		unlock_page(page);
-		if (ret == -ENOMEM)
-			ret = VM_FAULT_OOM;
-		else /* -ENOSPC, -EIO, etc */
-			ret = VM_FAULT_SIGBUS;
-	} else
-		ret = VM_FAULT_LOCKED;
-
-out:
+	if (unlikely(ret < 0))
+		goto out_unlock;
+	/*
+	 * Freezing in progress? We check after the page is marked dirty and
+	 * with page lock held so if the test here fails, we are sure freezing
+	 * code will wait during syncing until the page fault is done - at that
+	 * point page will be dirty and unlocked so freezing code will write it
+	 * and writeprotect it again.
+	 */
+	set_page_dirty(page);
+	if (inode->i_sb->s_frozen != SB_UNFROZEN) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+	return 0;
+out_unlock:
+	unlock_page(page);
 	return ret;
 }
+EXPORT_SYMBOL(__block_page_mkwrite);
+
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+		   get_block_t get_block)
+{
+	int ret;
+	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+
+	/*
+	 * This check is racy but catches the common case. The check in
+	 * __block_page_mkwrite() is reliable.
+	 */
+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
+	ret = __block_page_mkwrite(vma, vmf, get_block);
+	return block_page_mkwrite_return(ret);
+}
 EXPORT_SYMBOL(block_page_mkwrite);
 
 /*
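(Illustrative aside, not part of the patch: because __block_page_mkwrite() now returns -EAGAIN while the superblock is frozen, a filesystem that calls it directly is expected to wait and retry. A minimal sketch of such a ->page_mkwrite handler, with a hypothetical myfs_get_block and using only the helpers introduced above:)

#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/mm.h>

extern int myfs_get_block(struct inode *, sector_t, struct buffer_head *, int);

static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
	int ret;

	do {
		/* sleep here instead of busylooping in the fault handler */
		vfs_check_frozen(sb, SB_FREEZE_WRITE);
		ret = __block_page_mkwrite(vma, vmf, myfs_get_block);
	} while (ret == -EAGAIN);

	return block_page_mkwrite_return(ret);
}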
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 38b8ab554924..33da49dc3cc6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -848,7 +848,8 @@ get_more_pages:
 		op->payload_len = cpu_to_le32(len);
 		req->r_request->hdr.data_len = cpu_to_le32(len);
 
-		ceph_osdc_start_request(&fsc->client->osdc, req, true);
+		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
+		BUG_ON(rc);
 		req = NULL;
 
 		/* continue? */
@@ -880,8 +881,6 @@ release_pvec_pages:
 out:
 	if (req)
 		ceph_osdc_put_request(req);
-	if (rc > 0)
-		rc = 0;  /* vfs expects us to return 0 */
 	ceph_put_snap_context(snapc);
 	dout("writepages done, rc = %d\n", rc);
 	return rc;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2a5404c1c42f..1f72b00447c4 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -569,7 +569,8 @@ retry:
 		list_add_tail(&cap->session_caps, &session->s_caps);
 		session->s_nr_caps++;
 		spin_unlock(&session->s_cap_lock);
-	}
+	} else if (new_cap)
+		ceph_put_cap(mdsc, new_cap);
 
 	if (!ci->i_snap_realm) {
 		/*
@@ -2634,6 +2635,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 			      struct ceph_mds_session *session,
 			      int *open_target_sessions)
 {
+	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int mds = session->s_mds;
 	unsigned mseq = le32_to_cpu(ex->migrate_seq);
@@ -2670,6 +2672,19 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 			 * export targets, so that we get the matching IMPORT
 			 */
 			*open_target_sessions = 1;
+
+			/*
+			 * we can't flush dirty caps that we've seen the
+			 * EXPORT but no IMPORT for
+			 */
+			spin_lock(&mdsc->cap_dirty_lock);
+			if (!list_empty(&ci->i_dirty_item)) {
+				dout(" moving %p to cap_dirty_migrating\n",
+				     inode);
+				list_move(&ci->i_dirty_item,
+					  &mdsc->cap_dirty_migrating);
+			}
+			spin_unlock(&mdsc->cap_dirty_lock);
 		}
 		__ceph_remove_cap(cap);
 	}
@@ -2707,6 +2722,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
 		ci->i_cap_exporting_issued = 0;
 		ci->i_cap_exporting_mseq = 0;
 		ci->i_cap_exporting_mds = -1;
+
+		spin_lock(&mdsc->cap_dirty_lock);
+		if (!list_empty(&ci->i_dirty_item)) {
+			dout(" moving %p back to cap_dirty\n", inode);
+			list_move(&ci->i_dirty_item, &mdsc->cap_dirty);
+		}
+		spin_unlock(&mdsc->cap_dirty_lock);
 	} else {
 		dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
 		     inode, ci, mds, mseq);
@@ -2910,38 +2932,16 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
  */
 void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 {
-	struct ceph_inode_info *ci, *nci = NULL;
-	struct inode *inode, *ninode = NULL;
-	struct list_head *p, *n;
+	struct ceph_inode_info *ci;
+	struct inode *inode;
 
 	dout("flush_dirty_caps\n");
 	spin_lock(&mdsc->cap_dirty_lock);
-	list_for_each_safe(p, n, &mdsc->cap_dirty) {
-		if (nci) {
-			ci = nci;
-			inode = ninode;
-			ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
-			dout("flush_dirty_caps inode %p (was next inode)\n",
-			     inode);
-		} else {
-			ci = list_entry(p, struct ceph_inode_info,
-					i_dirty_item);
-			inode = igrab(&ci->vfs_inode);
-			BUG_ON(!inode);
-			dout("flush_dirty_caps inode %p\n", inode);
-		}
-		if (n != &mdsc->cap_dirty) {
-			nci = list_entry(n, struct ceph_inode_info,
-					 i_dirty_item);
-			ninode = igrab(&nci->vfs_inode);
-			BUG_ON(!ninode);
-			nci->i_ceph_flags |= CEPH_I_NOFLUSH;
-			dout("flush_dirty_caps next inode %p, noflush\n",
-			     ninode);
-		} else {
-			nci = NULL;
-			ninode = NULL;
-		}
+	while (!list_empty(&mdsc->cap_dirty)) {
+		ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
+				      i_dirty_item);
+		inode = igrab(&ci->vfs_inode);
+		dout("flush_dirty_caps %p\n", inode);
 		spin_unlock(&mdsc->cap_dirty_lock);
 		if (inode) {
 			ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
@@ -2951,6 +2951,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
 		spin_lock(&mdsc->cap_dirty_lock);
 	}
 	spin_unlock(&mdsc->cap_dirty_lock);
+	dout("flush_dirty_caps done\n");
 }
 
 /*
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 1a867a3601ae..33729e822bb9 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -360,7 +360,7 @@ more:
 		rinfo = &fi->last_readdir->r_reply_info;
 		dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
 		     rinfo->dir_nr, off, fi->offset);
-		while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
+		while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
 			u64 pos = ceph_make_fpos(frag, off);
 			struct ceph_mds_reply_inode *in =
 				rinfo->dir_in[off - fi->offset].in;
@@ -1066,16 +1066,17 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int left;
+	const int bufsize = 1024;
 
 	if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
 		return -EISDIR;
 
 	if (!cf->dir_info) {
-		cf->dir_info = kmalloc(1024, GFP_NOFS);
+		cf->dir_info = kmalloc(bufsize, GFP_NOFS);
 		if (!cf->dir_info)
 			return -ENOMEM;
 		cf->dir_info_len =
-			sprintf(cf->dir_info,
+			snprintf(cf->dir_info, bufsize,
 				"entries: %20lld\n"
 				" files: %20lld\n"
 				" subdirs: %20lld\n"
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e41056174bf8..a610d3d67488 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -86,6 +86,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
 static struct dentry *__fh_to_dentry(struct super_block *sb,
 				     struct ceph_nfs_fh *fh)
 {
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
 	struct inode *inode;
 	struct dentry *dentry;
 	struct ceph_vino vino;
@@ -95,8 +96,24 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 	vino.ino = fh->ino;
 	vino.snap = CEPH_NOSNAP;
 	inode = ceph_find_inode(sb, vino);
-	if (!inode)
-		return ERR_PTR(-ESTALE);
+	if (!inode) {
+		struct ceph_mds_request *req;
+
+		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
+					       USE_ANY_MDS);
+		if (IS_ERR(req))
+			return ERR_CAST(req);
+
+		req->r_ino1 = vino;
+		req->r_num_caps = 1;
+		err = ceph_mdsc_do_request(mdsc, NULL, req);
+		inode = req->r_target_inode;
+		if (inode)
+			igrab(inode);
+		ceph_mdsc_put_request(req);
+		if (!inode)
+			return ERR_PTR(-ESTALE);
+	}
 
 	dentry = d_obtain_alias(inode);
 	if (IS_ERR(dentry)) {
@@ -148,8 +165,10 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
 		snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
 		req->r_num_caps = 1;
 		err = ceph_mdsc_do_request(mdsc, NULL, req);
+		inode = req->r_target_inode;
+		if (inode)
+			igrab(inode);
 		ceph_mdsc_put_request(req);
-		inode = ceph_find_inode(sb, vino);
 		if (!inode)
 			return ERR_PTR(err ? err : -ESTALE);
 	}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d0fae4ce9ba5..79743d146be6 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -578,6 +578,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
 	if (dir) {
 		struct ceph_inode_info *ci = ceph_inode(dir);
 
+		ihold(dir);
 		spin_lock(&ci->i_unsafe_lock);
 		req->r_unsafe_dir = dir;
 		list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
@@ -598,6 +599,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 		spin_lock(&ci->i_unsafe_lock);
 		list_del_init(&req->r_unsafe_dir_item);
 		spin_unlock(&ci->i_unsafe_lock);
+
+		iput(req->r_unsafe_dir);
+		req->r_unsafe_dir = NULL;
 	}
 
 	ceph_mdsc_put_request(req);
@@ -2691,7 +2695,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 {
 	struct super_block *sb = mdsc->fsc->sb;
 	struct inode *inode;
-	struct ceph_inode_info *ci;
 	struct dentry *parent, *dentry;
 	struct ceph_dentry_info *di;
 	int mds = session->s_mds;
@@ -2728,7 +2731,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
 		dout("handle_lease no inode %llx\n", vino.ino);
 		goto release;
 	}
-	ci = ceph_inode(inode);
 
 	/* dentry */
 	parent = d_find_alias(inode);
@@ -3002,6 +3004,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	spin_lock_init(&mdsc->snap_flush_lock);
 	mdsc->cap_flush_seq = 0;
 	INIT_LIST_HEAD(&mdsc->cap_dirty);
+	INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
 	mdsc->num_cap_flushing = 0;
 	spin_lock_init(&mdsc->cap_dirty_lock);
 	init_waitqueue_head(&mdsc->cap_flushing_wq);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4e3a9cc0bba6..7d8a0d662d56 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -278,6 +278,7 @@ struct ceph_mds_client {
 
 	u64               cap_flush_seq;
 	struct list_head  cap_dirty;        /* inodes with dirty caps */
+	struct list_head  cap_dirty_migrating; /* ...that are migration... */
 	int               num_cap_flushing; /* # caps we are flushing */
 	spinlock_t        cap_dirty_lock;   /* protects above items */
 	wait_queue_head_t cap_flushing_wq;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 2b8dae4d121e..a46126fd5735 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -336,6 +336,8 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
 	int len = de->d_name.len;
 	int error;
 
+	dentry_unhash(de);
+
 	error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
 	if (!error) {
 		/* VFS may delete the child */
@@ -359,6 +361,9 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int new_length = new_dentry->d_name.len;
 	int error;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
 			     coda_i2f(new_dir), old_length, new_length,
 			     (const char *) old_name, (const char *)new_name);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9a37a9b6de3a..9d17d350abc5 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1359,6 +1359,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct module *subsys_owner = NULL, *dead_item_owner = NULL;
 	int ret;
 
+	dentry_unhash(dentry);
+
 	if (dentry->d_parent == configfs_sb->s_root)
 		return -EPERM;
 
diff --git a/fs/dcache.c b/fs/dcache.c
index 18b2a1f10ed8..37f72ee5bf7c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1220,7 +1220,7 @@ void shrink_dcache_parent(struct dentry * parent)
 EXPORT_SYMBOL(shrink_dcache_parent);
 
 /*
- * Scan `nr' dentries and return the number which remain.
+ * Scan `sc->nr_slab_to_reclaim' dentries and return the number which remain.
  *
  * We need to avoid reentering the filesystem if the caller is performing a
  * GFP_NOFS allocation attempt.  One example deadlock is:
@@ -1231,8 +1231,12 @@ EXPORT_SYMBOL(shrink_dcache_parent);
  *
  * In this case we return -1 to tell the caller that we baled.
  */
-static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int shrink_dcache_memory(struct shrinker *shrink,
+				struct shrink_control *sc)
 {
+	int nr = sc->nr_to_scan;
+	gfp_t gfp_mask = sc->gfp_mask;
+
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
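(Illustrative aside, not part of the patch: shrink_dcache_memory() above shows the new shrinker calling convention, where nr_to_scan and gfp_mask arrive packed in a struct shrink_control. A hedged sketch of a filesystem shrinker under that convention, with made-up myfs_cache_count()/myfs_cache_prune() helpers:)

#include <linux/mm.h>

extern unsigned long myfs_cache_count(void);
extern void myfs_cache_prune(unsigned long nr);

static int myfs_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
	if (sc->nr_to_scan) {
		/* refuse to recurse into the fs from a GFP_NOFS allocation */
		if (!(sc->gfp_mask & __GFP_FS))
			return -1;
		myfs_cache_prune(sc->nr_to_scan);
	}
	return myfs_cache_count();
}

static struct shrinker myfs_shrinker = {
	.shrink	= myfs_shrink,
	.seeks	= DEFAULT_SEEKS,
};
/* registered once at init time with register_shrinker(&myfs_shrinker) */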
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 98b77c89494c..c00e055b6282 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -40,9 +40,12 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 static void drop_slab(void)
 {
 	int nr_objects;
+	struct shrink_control shrink = {
+		.gfp_mask = GFP_KERNEL,
+	};
 
 	do {
-		nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+		nr_objects = shrink_slab(&shrink, 1000, 1000);
 	} while (nr_objects > 10);
 }
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4d4cc6a90cd5..227b409b8406 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -521,6 +521,8 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct dentry *lower_dir_dentry;
 	int rc;
 
+	dentry_unhash(dentry);
+
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	dget(dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
@@ -571,6 +573,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *lower_new_dir_dentry;
 	struct dentry *trap = NULL;
 
+	if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+		dentry_unhash(new_dentry);
+
 	lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
 	lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
 	dget(lower_old_dentry);
diff --git a/fs/exec.c b/fs/exec.c
index c1cf372f17a7..936f5776655c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -200,7 +200,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 
 #ifdef CONFIG_STACK_GROWSUP
 	if (write) {
-		ret = expand_stack_downwards(bprm->vma, pos);
+		ret = expand_downwards(bprm->vma, pos);
 		if (ret < 0)
 			return NULL;
 	}
@@ -600,7 +600,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 	unsigned long length = old_end - old_start;
 	unsigned long new_start = old_start - shift;
 	unsigned long new_end = old_end - shift;
-	struct mmu_gather *tlb;
+	struct mmu_gather tlb;
 
 	BUG_ON(new_start > new_end);
 
@@ -626,12 +626,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		return -ENOMEM;
 
 	lru_add_drain();
-	tlb = tlb_gather_mmu(mm, 0);
+	tlb_gather_mmu(&tlb, mm, 0);
 	if (new_end > old_start) {
 		/*
 		 * when the old and new regions overlap clear from new_end.
 		 */
-		free_pgd_range(tlb, new_end, old_end, new_end,
+		free_pgd_range(&tlb, new_end, old_end, new_end,
 			vma->vm_next ? vma->vm_next->vm_start : 0);
 	} else {
 		/*
@@ -640,10 +640,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		 * have constraints on va-space that make this illegal (IA64) -
 		 * for the others its just a little faster.
 		 */
-		free_pgd_range(tlb, old_start, old_end, new_end,
+		free_pgd_range(&tlb, old_start, old_end, new_end,
 			vma->vm_next ? vma->vm_next->vm_start : 0);
 	}
-	tlb_finish_mmu(tlb, new_end, old_end);
+	tlb_finish_mmu(&tlb, new_end, old_end);
 
 	/*
 	 * Shrink the vma to just the new range.  Always succeeds.
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3c6a9e0eadc1..aad153ef6b78 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -36,6 +36,7 @@
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
 #include <linux/log2.h>
+#include <linux/cleancache.h>
 
 #include <asm/uaccess.h>
 
@@ -1367,6 +1368,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
 	} else {
 		ext3_msg(sb, KERN_INFO, "using internal journal");
 	}
+	cleancache_init_fs(sb);
 	return res;
 }
 
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index c947e36eda6c..04109460ba9e 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+		mmp.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1c67139ad4b4..264f6949511e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -362,130 +362,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 }
 
 /**
- * ext4_add_groupblocks() -- Add given blocks to an existing group
- * @handle:	handle to this transaction
- * @sb:		super block
- * @block:	start physcial block to add to the block group
- * @count:	number of blocks to free
- *
- * This marks the blocks as free in the bitmap. We ask the
- * mballoc to reload the buddy after this by setting group
- * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- */
-void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
-			  ext4_fsblk_t block, unsigned long count)
-{
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *gd_bh;
-	ext4_group_t block_group;
-	ext4_grpblk_t bit;
-	unsigned int i;
-	struct ext4_group_desc *desc;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	int err = 0, ret, blk_free_count;
-	ext4_grpblk_t blocks_freed;
-	struct ext4_group_info *grp;
-
-	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-
-	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
-	grp = ext4_get_group_info(sb, block_group);
-	/*
-	 * Check to see if we are freeing blocks across a group
-	 * boundary.
-	 */
-	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-		goto error_return;
-	}
-	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
-	if (!bitmap_bh)
-		goto error_return;
-	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
-	if (!desc)
-		goto error_return;
-
-	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
-	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
-	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
-	    in_range(block + count - 1, ext4_inode_table(sb, desc),
-		     sbi->s_itb_per_group)) {
-		ext4_error(sb, "Adding blocks in system zones - "
-			   "Block = %llu, count = %lu",
-			   block, count);
-		goto error_return;
-	}
-
-	/*
-	 * We are about to add blocks to the bitmap,
-	 * so we need undo access.
-	 */
-	BUFFER_TRACE(bitmap_bh, "getting undo access");
-	err = ext4_journal_get_undo_access(handle, bitmap_bh);
-	if (err)
-		goto error_return;
-
-	/*
-	 * We are about to modify some metadata. Call the journal APIs
-	 * to unshare ->b_data if a currently-committing transaction is
-	 * using it
-	 */
-	BUFFER_TRACE(gd_bh, "get_write_access");
-	err = ext4_journal_get_write_access(handle, gd_bh);
-	if (err)
-		goto error_return;
-	/*
-	 * make sure we don't allow a parallel init on other groups in the
-	 * same buddy cache
-	 */
-	down_write(&grp->alloc_sem);
-	for (i = 0, blocks_freed = 0; i < count; i++) {
-		BUFFER_TRACE(bitmap_bh, "clear bit");
-		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
-						bit + i, bitmap_bh->b_data)) {
-			ext4_error(sb, "bit already cleared for block %llu",
-				   (ext4_fsblk_t)(block + i));
-			BUFFER_TRACE(bitmap_bh, "bit already cleared");
-		} else {
-			blocks_freed++;
-		}
-	}
-	ext4_lock_group(sb, block_group);
-	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
-	ext4_free_blks_set(sb, desc, blk_free_count);
-	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-	ext4_unlock_group(sb, block_group);
-	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
-
-	if (sbi->s_log_groups_per_flex) {
-		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-		atomic_add(blocks_freed,
-			   &sbi->s_flex_groups[flex_group].free_blocks);
-	}
-	/*
-	 * request to reload the buddy with the
-	 * new bitmap information
-	 */
-	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-	grp->bb_free += blocks_freed;
-	up_write(&grp->alloc_sem);
-
-	/* We dirtied the bitmap block */
-	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
-	/* And the group descriptor block */
-	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
-	if (!err)
-		err = ret;
-
-error_return:
-	brelse(bitmap_bh);
-	ext4_std_error(sb, err);
-	return;
-}
-
-/**
  * ext4_has_free_blocks()
  * @sbi:	in-core super block structure.
  * @nblocks:	number of needed blocks
@@ -493,7 +369,8 @@ error_return:
  * Check if filesystem has nblocks free & available for allocation.
  * On success return 1, return 0 on failure.
  */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
+				s64 nblocks, unsigned int flags)
 {
 	s64 free_blocks, dirty_blocks, root_blocks;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -507,11 +384,6 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 					EXT4_FREEBLOCKS_WATERMARK) {
 		free_blocks  = percpu_counter_sum_positive(fbc);
 		dirty_blocks = percpu_counter_sum_positive(dbc);
-		if (dirty_blocks < 0) {
-			printk(KERN_CRIT "Dirty block accounting "
-					"went wrong %lld\n",
-					(long long)dirty_blocks);
-		}
 	}
 	/* Check whether we have space after
 	 * accounting for current dirty blocks & root reserved blocks.
@@ -522,7 +394,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 	/* Hm, nope.  Are (enough) root reserved blocks available? */
 	if (sbi->s_resuid == current_fsuid() ||
 	    ((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
-	    capable(CAP_SYS_RESOURCE)) {
+	    capable(CAP_SYS_RESOURCE) ||
+		(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
 		if (free_blocks >= (nblocks + dirty_blocks))
 			return 1;
 	}
@@ -531,9 +405,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 }
 
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
-			   s64 nblocks)
+			   s64 nblocks, unsigned int flags)
 {
-	if (ext4_has_free_blocks(sbi, nblocks)) {
+	if (ext4_has_free_blocks(sbi, nblocks, flags)) {
 		percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
 		return 0;
 	} else
@@ -554,7 +428,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
 	    (*retries)++ > 3 ||
 	    !EXT4_SB(sb)->s_journal)
 		return 0;
@@ -577,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
  * error stores in errp pointer
  */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
-		ext4_fsblk_t goal, unsigned long *count, int *errp)
+				  ext4_fsblk_t goal, unsigned int flags,
+				  unsigned long *count, int *errp)
 {
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
@@ -587,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	ar.inode = inode;
 	ar.goal = goal;
 	ar.len = count ? *count : 1;
+	ar.flags = flags;
 
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
 	if (count)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b753f4..a74b89c09f90 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t;
 #define EXT4_MB_DELALLOC_RESERVED	0x0400
 /* We are doing stream allocation */
 #define EXT4_MB_STREAM_ALLOC		0x0800
-
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS		0x1000
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -209,6 +210,8 @@ struct ext4_io_submit {
  */
 #define	EXT4_BAD_INO		 1	/* Bad blocks inode */
 #define EXT4_ROOT_INO		 2	/* Root inode */
+#define EXT4_USR_QUOTA_INO	 3	/* User quota inode */
+#define EXT4_GRP_QUOTA_INO	 4	/* Group quota inode */
 #define EXT4_BOOT_LOADER_INO	 5	/* Boot loader inode */
 #define EXT4_UNDEL_DIR_INO	 6	/* Undelete directory inode */
 #define EXT4_RESIZE_INO		 7	/* Reserved group descriptors inode */
@@ -512,6 +515,10 @@ struct ext4_new_group_data {
 	/* Convert extent to initialized after IO complete */
 #define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
 					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Punch out blocks of an extent */
+#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT		0x0020
+	/* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
 
 /*
  * Flags used by ext4_free_blocks
@@ -1028,7 +1035,7 @@ struct ext4_super_block {
 	__le16	s_want_extra_isize; 	/* New inodes should reserve # bytes */
 	__le32	s_flags;		/* Miscellaneous flags */
 	__le16	s_raid_stride;		/* RAID stride */
-	__le16	s_mmp_interval;		/* # seconds to wait in MMP checking */
+	__le16	s_mmp_update_interval;	/* # seconds to wait in MMP checking */
 	__le64	s_mmp_block;		/* Block for multi-mount protection */
 	__le32	s_raid_stripe_width;	/* blocks on all data disks (N*stride)*/
 	__u8	s_log_groups_per_flex;	/* FLEX_BG group size */
@@ -1144,6 +1151,9 @@ struct ext4_sb_info {
 	unsigned long s_ext_blocks;
 	unsigned long s_ext_extents;
 #endif
+	/* ext4 extent cache stats */
+	unsigned long extent_cache_hits;
+	unsigned long extent_cache_misses;
 
 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
@@ -1201,6 +1211,9 @@ struct ext4_sb_info {
 	struct ext4_li_request *s_li_request;
 	/* Wait multiplier for lazy initialization thread */
 	unsigned int s_li_wait_mult;
+
+	/* Kernel thread for multiple mount protection */
+	struct task_struct *s_mmp_tsk;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1338,6 +1351,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM		0x0010
 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK	0x0020
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE	0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA		0x0100
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION	0x0001
 #define EXT4_FEATURE_INCOMPAT_FILETYPE		0x0002
@@ -1351,13 +1365,29 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1351#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ 1365#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
1352#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ 1366#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
1353 1367
1368#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
1369#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
1370 EXT4_FEATURE_INCOMPAT_META_BG)
1371#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
1372 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
1373 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
1374
1375#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
1376#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
1377 EXT4_FEATURE_INCOMPAT_RECOVER| \
1378 EXT4_FEATURE_INCOMPAT_META_BG)
1379#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
1380 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
1381 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
1382
1354#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR 1383#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
1355#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ 1384#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
1356 EXT4_FEATURE_INCOMPAT_RECOVER| \ 1385 EXT4_FEATURE_INCOMPAT_RECOVER| \
1357 EXT4_FEATURE_INCOMPAT_META_BG| \ 1386 EXT4_FEATURE_INCOMPAT_META_BG| \
1358 EXT4_FEATURE_INCOMPAT_EXTENTS| \ 1387 EXT4_FEATURE_INCOMPAT_EXTENTS| \
1359 EXT4_FEATURE_INCOMPAT_64BIT| \ 1388 EXT4_FEATURE_INCOMPAT_64BIT| \
1360 EXT4_FEATURE_INCOMPAT_FLEX_BG) 1389 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
1390 EXT4_FEATURE_INCOMPAT_MMP)
1361#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ 1391#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
1362 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ 1392 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
1363 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ 1393 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
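The EXT2_*_SUPP and EXT3_*_SUPP masks added above are what let the ext4 driver mount ext2 and ext3 filesystems while refusing anything whose feature bits it does not implement. A minimal sketch of that kind of mount-time check follows; the helper name and the readonly parameter are assumptions for illustration, not code from this patch:

	/* Sketch only: refuse feature bits outside the supported masks. */
	static int ext4_unsupported_features(struct ext4_super_block *es, int readonly)
	{
		/* incompat bits we do not understand make the fs unmountable */
		if (le32_to_cpu(es->s_feature_incompat) & ~EXT4_FEATURE_INCOMPAT_SUPP)
			return 1;
		/* unknown ro_compat bits only forbid a read/write mount */
		if (!readonly &&
		    (le32_to_cpu(es->s_feature_ro_compat) & ~EXT4_FEATURE_RO_COMPAT_SUPP))
			return 1;
		return 0;
	}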
@@ -1590,12 +1620,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
1590 */ 1620 */
1591struct ext4_lazy_init { 1621struct ext4_lazy_init {
1592 unsigned long li_state; 1622 unsigned long li_state;
1593
1594 wait_queue_head_t li_wait_daemon;
1595 wait_queue_head_t li_wait_task;
1596 struct timer_list li_timer;
1597 struct task_struct *li_task;
1598
1599 struct list_head li_request_list; 1623 struct list_head li_request_list;
1600 struct mutex li_list_mtx; 1624 struct mutex li_list_mtx;
1601}; 1625};
@@ -1615,6 +1639,67 @@ struct ext4_features {
1615}; 1639};
1616 1640
1617/* 1641/*
1642 * This structure will be used for multiple mount protection. It will be
1643 * written to the block whose number is saved in the s_mmp_block field of the
1644 * superblock. Programs that check MMP should assume that if
1645 * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
1646 * to use the filesystem, regardless of how old the timestamp is.
1647 */
1648#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */
1649#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
1650#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */
1651#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */
1652
1653struct mmp_struct {
1654 __le32 mmp_magic; /* Magic number for MMP */
1655 __le32 mmp_seq; /* Sequence no. updated periodically */
1656
1657 /*
1658 * mmp_time, mmp_nodename & mmp_bdevname are only used for information
1659 * purposes and do not affect the correctness of the algorithm
1660 */
1661 __le64 mmp_time; /* Time last updated */
1662 char mmp_nodename[64]; /* Node which last updated MMP block */
1663 char mmp_bdevname[32]; /* Bdev which last updated MMP block */
1664
1665 /*
1666 * mmp_check_interval is used to verify if the MMP block has been
1667 * updated on the block device. The value is updated based on the
1668 * maximum time to write the MMP block during an update cycle.
1669 */
1670 __le16 mmp_check_interval;
1671
1672 __le16 mmp_pad1;
1673 __le32 mmp_pad2[227];
1674};
1675
1676/* arguments passed to the mmp thread */
1677struct mmpd_data {
1678 struct buffer_head *bh; /* bh from initial read_mmp_block() */
1679 struct super_block *sb; /* super block of the fs */
1680};
1681
1682/*
1683 * Check interval multiplier
1684 * The MMP block is written every update interval and initially checked every
1685 * update interval x the multiplier (the value is then adapted based on the
1686 * write latency). The reason is that writes can be delayed under load and we
1687 * don't want readers to incorrectly assume that the filesystem is no longer
1688 * in use.
1689 */
1690#define EXT4_MMP_CHECK_MULT 2UL
1691
1692/*
1693 * Minimum interval for MMP checking in seconds.
1694 */
1695#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL
1696
1697/*
1698 * Maximum interval for MMP checking in seconds.
1699 */
1700#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL
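Taken together, the comments above say the effective polling interval is the MMP write latency scaled by EXT4_MMP_CHECK_MULT, clamped between the minimum and maximum. A small illustrative helper; the function name and the write_time parameter are assumed and are not part of the patch:

	/* Sketch only: the clamping described by the comments above.
	 * write_time is the measured duration of the last MMP write, in seconds. */
	static unsigned long mmp_poll_interval(unsigned long write_time)
	{
		unsigned long interval = EXT4_MMP_CHECK_MULT * write_time;

		if (interval < EXT4_MMP_MIN_CHECK_INTERVAL)
			interval = EXT4_MMP_MIN_CHECK_INTERVAL;
		if (interval > EXT4_MMP_MAX_CHECK_INTERVAL)
			interval = EXT4_MMP_MAX_CHECK_INTERVAL;
		return interval;
	}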
1701
1702/*
1618 * Function prototypes 1703 * Function prototypes
1619 */ 1704 */
1620 1705
@@ -1638,10 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
1638extern unsigned long ext4_bg_num_gdb(struct super_block *sb, 1723extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
1639 ext4_group_t group); 1724 ext4_group_t group);
1640extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, 1725extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1641 ext4_fsblk_t goal, unsigned long *count, int *errp); 1726 ext4_fsblk_t goal,
1642extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1727 unsigned int flags,
1643extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, 1728 unsigned long *count,
1644 ext4_fsblk_t block, unsigned long count); 1729 int *errp);
1730extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
1731 s64 nblocks, unsigned int flags);
1645extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); 1732extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
1646extern void ext4_check_blocks_bitmap(struct super_block *); 1733extern void ext4_check_blocks_bitmap(struct super_block *);
1647extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, 1734extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1706,6 +1793,8 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
1706 unsigned long count, int flags); 1793 unsigned long count, int flags);
1707extern int ext4_mb_add_groupinfo(struct super_block *sb, 1794extern int ext4_mb_add_groupinfo(struct super_block *sb,
1708 ext4_group_t i, struct ext4_group_desc *desc); 1795 ext4_group_t i, struct ext4_group_desc *desc);
1796extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
1797 ext4_fsblk_t block, unsigned long count);
1709extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); 1798extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
1710 1799
1711/* inode.c */ 1800/* inode.c */
@@ -1729,6 +1818,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
1729extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1818extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1730extern int ext4_can_truncate(struct inode *inode); 1819extern int ext4_can_truncate(struct inode *inode);
1731extern void ext4_truncate(struct inode *); 1820extern void ext4_truncate(struct inode *);
1821extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
1732extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); 1822extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
1733extern void ext4_set_inode_flags(struct inode *); 1823extern void ext4_set_inode_flags(struct inode *);
1734extern void ext4_get_inode_flags(struct ext4_inode_info *); 1824extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -1738,6 +1828,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
1738extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 1828extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
1739extern int ext4_block_truncate_page(handle_t *handle, 1829extern int ext4_block_truncate_page(handle_t *handle,
1740 struct address_space *mapping, loff_t from); 1830 struct address_space *mapping, loff_t from);
1831extern int ext4_block_zero_page_range(handle_t *handle,
1832 struct address_space *mapping, loff_t from, loff_t length);
1741extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1833extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1742extern qsize_t *ext4_get_reserved_space(struct inode *inode); 1834extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1743extern void ext4_da_update_reserve_space(struct inode *inode, 1835extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1788,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
1788 __LINE__, ## message) 1880 __LINE__, ## message)
1789extern void ext4_msg(struct super_block *, const char *, const char *, ...) 1881extern void ext4_msg(struct super_block *, const char *, const char *, ...)
1790 __attribute__ ((format (printf, 3, 4))); 1882 __attribute__ ((format (printf, 3, 4)));
1883extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
1884 const char *, unsigned int, const char *);
1885#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \
1886 __LINE__, msg)
1791extern void __ext4_grp_locked_error(const char *, unsigned int, \ 1887extern void __ext4_grp_locked_error(const char *, unsigned int, \
1792 struct super_block *, ext4_group_t, \ 1888 struct super_block *, ext4_group_t, \
1793 unsigned long, ext4_fsblk_t, \ 1889 unsigned long, ext4_fsblk_t, \
@@ -2064,6 +2160,8 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
2064extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, 2160extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
2065 struct ext4_map_blocks *map, int flags); 2161 struct ext4_map_blocks *map, int flags);
2066extern void ext4_ext_truncate(struct inode *); 2162extern void ext4_ext_truncate(struct inode *);
2163extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
2164 loff_t length);
2067extern void ext4_ext_init(struct super_block *); 2165extern void ext4_ext_init(struct super_block *);
2068extern void ext4_ext_release(struct super_block *); 2166extern void ext4_ext_release(struct super_block *);
2069extern long ext4_fallocate(struct file *file, int mode, loff_t offset, 2167extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2092,6 +2190,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
2092 int len, 2190 int len,
2093 struct writeback_control *wbc); 2191 struct writeback_control *wbc);
2094 2192
2193/* mmp.c */
2194extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
2195
2095/* BH_Uninit flag: blocks are allocated but uninitialized on disk */ 2196/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
2096enum ext4_state_bits { 2197enum ext4_state_bits {
2097 BH_Uninit /* blocks are allocated but uninitialized on disk */ 2198 BH_Uninit /* blocks are allocated but uninitialized on disk */
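The mmp_seq field and the sequence codes above are what ext4_multi_mount_protect() builds on: a mount that wants the filesystem must observe an unchanging sequence number across a full check interval before it may proceed. A rough sketch of that idea, not the actual fs/ext4/mmp.c code; read_mmp_block() is an assumed helper and the sleep granularity is simplified:

	/* Sketch only: decide whether another node appears to be using the fs. */
	static int mmp_fs_appears_idle(struct super_block *sb, ext4_fsblk_t mmp_block)
	{
		struct buffer_head *bh;
		struct mmp_struct *mmp;
		u32 seq;
		int idle;

		bh = read_mmp_block(sb, mmp_block);	/* assumed helper */
		mmp = (struct mmp_struct *)bh->b_data;
		seq = le32_to_cpu(mmp->mmp_seq);
		brelse(bh);

		if (seq == EXT4_MMP_SEQ_CLEAN)
			return 1;	/* clean unmount: safe to use */
		if (seq == EXT4_MMP_SEQ_FSCK || seq > EXT4_MMP_SEQ_MAX)
			return 0;	/* fsck or unknown code: never safe */

		/* Wait one check interval; an active user will bump mmp_seq. */
		msleep(EXT4_MMP_MIN_CHECK_INTERVAL * EXT4_MMP_CHECK_MULT * 1000);

		bh = read_mmp_block(sb, mmp_block);
		mmp = (struct mmp_struct *)bh->b_data;
		idle = (le32_to_cpu(mmp->mmp_seq) == seq);
		brelse(bh);
		return idle;
	}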
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6e272ef6ba96..f5240aa15601 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,20 +6,6 @@
6 6
7#include <trace/events/ext4.h> 7#include <trace/events/ext4.h>
8 8
9int __ext4_journal_get_undo_access(const char *where, unsigned int line,
10 handle_t *handle, struct buffer_head *bh)
11{
12 int err = 0;
13
14 if (ext4_handle_valid(handle)) {
15 err = jbd2_journal_get_undo_access(handle, bh);
16 if (err)
17 ext4_journal_abort_handle(where, line, __func__, bh,
18 handle, err);
19 }
20 return err;
21}
22
23int __ext4_journal_get_write_access(const char *where, unsigned int line, 9int __ext4_journal_get_write_access(const char *where, unsigned int line,
24 handle_t *handle, struct buffer_head *bh) 10 handle_t *handle, struct buffer_head *bh)
25{ 11{
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d0f53538a57f..bb85757689b6 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
126 const char *err_fn, 126 const char *err_fn,
127 struct buffer_head *bh, handle_t *handle, int err); 127 struct buffer_head *bh, handle_t *handle, int err);
128 128
129int __ext4_journal_get_undo_access(const char *where, unsigned int line,
130 handle_t *handle, struct buffer_head *bh);
131
132int __ext4_journal_get_write_access(const char *where, unsigned int line, 129int __ext4_journal_get_write_access(const char *where, unsigned int line,
133 handle_t *handle, struct buffer_head *bh); 130 handle_t *handle, struct buffer_head *bh);
134 131
@@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
146int __ext4_handle_dirty_super(const char *where, unsigned int line, 143int __ext4_handle_dirty_super(const char *where, unsigned int line,
147 handle_t *handle, struct super_block *sb); 144 handle_t *handle, struct super_block *sb);
148 145
149#define ext4_journal_get_undo_access(handle, bh) \
150 __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
151#define ext4_journal_get_write_access(handle, bh) \ 146#define ext4_journal_get_write_access(handle, bh) \
152 __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) 147 __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
153#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ 148#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4890d6f3ad15..5199bac7fc62 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -46,6 +46,13 @@
46 46
47#include <trace/events/ext4.h> 47#include <trace/events/ext4.h>
48 48
49static int ext4_split_extent(handle_t *handle,
50 struct inode *inode,
51 struct ext4_ext_path *path,
52 struct ext4_map_blocks *map,
53 int split_flag,
54 int flags);
55
49static int ext4_ext_truncate_extend_restart(handle_t *handle, 56static int ext4_ext_truncate_extend_restart(handle_t *handle,
50 struct inode *inode, 57 struct inode *inode,
51 int needed) 58 int needed)
@@ -192,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
192static ext4_fsblk_t 199static ext4_fsblk_t
193ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, 200ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
194 struct ext4_ext_path *path, 201 struct ext4_ext_path *path,
195 struct ext4_extent *ex, int *err) 202 struct ext4_extent *ex, int *err, unsigned int flags)
196{ 203{
197 ext4_fsblk_t goal, newblock; 204 ext4_fsblk_t goal, newblock;
198 205
199 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); 206 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
200 newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err); 207 newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
208 NULL, err);
201 return newblock; 209 return newblock;
202} 210}
203 211
@@ -474,9 +482,43 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
474 } 482 }
475 ext_debug("\n"); 483 ext_debug("\n");
476} 484}
485
486static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
487 ext4_fsblk_t newblock, int level)
488{
489 int depth = ext_depth(inode);
490 struct ext4_extent *ex;
491
492 if (depth != level) {
493 struct ext4_extent_idx *idx;
494 idx = path[level].p_idx;
495 while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
496 ext_debug("%d: move %d:%llu in new index %llu\n", level,
497 le32_to_cpu(idx->ei_block),
498 ext4_idx_pblock(idx),
499 newblock);
500 idx++;
501 }
502
503 return;
504 }
505
506 ex = path[depth].p_ext;
507 while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
508 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
509 le32_to_cpu(ex->ee_block),
510 ext4_ext_pblock(ex),
511 ext4_ext_is_uninitialized(ex),
512 ext4_ext_get_actual_len(ex),
513 newblock);
514 ex++;
515 }
516}
517
477#else 518#else
478#define ext4_ext_show_path(inode, path) 519#define ext4_ext_show_path(inode, path)
479#define ext4_ext_show_leaf(inode, path) 520#define ext4_ext_show_leaf(inode, path)
521#define ext4_ext_show_move(inode, path, newblock, level)
480#endif 522#endif
481 523
482void ext4_ext_drop_refs(struct ext4_ext_path *path) 524void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -792,14 +834,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
792 * - initializes subtree 834 * - initializes subtree
793 */ 835 */
794static int ext4_ext_split(handle_t *handle, struct inode *inode, 836static int ext4_ext_split(handle_t *handle, struct inode *inode,
795 struct ext4_ext_path *path, 837 unsigned int flags,
796 struct ext4_extent *newext, int at) 838 struct ext4_ext_path *path,
839 struct ext4_extent *newext, int at)
797{ 840{
798 struct buffer_head *bh = NULL; 841 struct buffer_head *bh = NULL;
799 int depth = ext_depth(inode); 842 int depth = ext_depth(inode);
800 struct ext4_extent_header *neh; 843 struct ext4_extent_header *neh;
801 struct ext4_extent_idx *fidx; 844 struct ext4_extent_idx *fidx;
802 struct ext4_extent *ex;
803 int i = at, k, m, a; 845 int i = at, k, m, a;
804 ext4_fsblk_t newblock, oldblock; 846 ext4_fsblk_t newblock, oldblock;
805 __le32 border; 847 __le32 border;
@@ -847,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
847 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); 889 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
848 for (a = 0; a < depth - at; a++) { 890 for (a = 0; a < depth - at; a++) {
849 newblock = ext4_ext_new_meta_block(handle, inode, path, 891 newblock = ext4_ext_new_meta_block(handle, inode, path,
850 newext, &err); 892 newext, &err, flags);
851 if (newblock == 0) 893 if (newblock == 0)
852 goto cleanup; 894 goto cleanup;
853 ablocks[a] = newblock; 895 ablocks[a] = newblock;
@@ -876,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
876 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); 918 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
877 neh->eh_magic = EXT4_EXT_MAGIC; 919 neh->eh_magic = EXT4_EXT_MAGIC;
878 neh->eh_depth = 0; 920 neh->eh_depth = 0;
879 ex = EXT_FIRST_EXTENT(neh);
880 921
881 /* move remainder of path[depth] to the new leaf */ 922 /* move remainder of path[depth] to the new leaf */
882 if (unlikely(path[depth].p_hdr->eh_entries != 923 if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -888,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
888 goto cleanup; 929 goto cleanup;
889 } 930 }
890 /* start copy from next extent */ 931 /* start copy from next extent */
891 /* TODO: we could do it by single memmove */ 932 m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
892 m = 0; 933 ext4_ext_show_move(inode, path, newblock, depth);
893 path[depth].p_ext++;
894 while (path[depth].p_ext <=
895 EXT_MAX_EXTENT(path[depth].p_hdr)) {
896 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
897 le32_to_cpu(path[depth].p_ext->ee_block),
898 ext4_ext_pblock(path[depth].p_ext),
899 ext4_ext_is_uninitialized(path[depth].p_ext),
900 ext4_ext_get_actual_len(path[depth].p_ext),
901 newblock);
902 /*memmove(ex++, path[depth].p_ext++,
903 sizeof(struct ext4_extent));
904 neh->eh_entries++;*/
905 path[depth].p_ext++;
906 m++;
907 }
908 if (m) { 934 if (m) {
909 memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m); 935 struct ext4_extent *ex;
936 ex = EXT_FIRST_EXTENT(neh);
937 memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
910 le16_add_cpu(&neh->eh_entries, m); 938 le16_add_cpu(&neh->eh_entries, m);
911 } 939 }
912 940
@@ -968,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
968 996
969 ext_debug("int.index at %d (block %llu): %u -> %llu\n", 997 ext_debug("int.index at %d (block %llu): %u -> %llu\n",
970 i, newblock, le32_to_cpu(border), oldblock); 998 i, newblock, le32_to_cpu(border), oldblock);
971 /* copy indexes */
972 m = 0;
973 path[i].p_idx++;
974 999
975 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, 1000 /* move remainder of path[i] to the new index block */
976 EXT_MAX_INDEX(path[i].p_hdr));
977 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) != 1001 if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
978 EXT_LAST_INDEX(path[i].p_hdr))) { 1002 EXT_LAST_INDEX(path[i].p_hdr))) {
979 EXT4_ERROR_INODE(inode, 1003 EXT4_ERROR_INODE(inode,
@@ -982,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
982 err = -EIO; 1006 err = -EIO;
983 goto cleanup; 1007 goto cleanup;
984 } 1008 }
985 while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { 1009 /* start copy indexes */
986 ext_debug("%d: move %d:%llu in new index %llu\n", i, 1010 m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
987 le32_to_cpu(path[i].p_idx->ei_block), 1011 ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
988 ext4_idx_pblock(path[i].p_idx), 1012 EXT_MAX_INDEX(path[i].p_hdr));
989 newblock); 1013 ext4_ext_show_move(inode, path, newblock, i);
990 /*memmove(++fidx, path[i].p_idx++,
991 sizeof(struct ext4_extent_idx));
992 neh->eh_entries++;
993 BUG_ON(neh->eh_entries > neh->eh_max);*/
994 path[i].p_idx++;
995 m++;
996 }
997 if (m) { 1014 if (m) {
998 memmove(++fidx, path[i].p_idx - m, 1015 memmove(++fidx, path[i].p_idx,
999 sizeof(struct ext4_extent_idx) * m); 1016 sizeof(struct ext4_extent_idx) * m);
1000 le16_add_cpu(&neh->eh_entries, m); 1017 le16_add_cpu(&neh->eh_entries, m);
1001 } 1018 }
@@ -1056,8 +1073,9 @@ cleanup:
1056 * just created block 1073 * just created block
1057 */ 1074 */
1058static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, 1075static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1059 struct ext4_ext_path *path, 1076 unsigned int flags,
1060 struct ext4_extent *newext) 1077 struct ext4_ext_path *path,
1078 struct ext4_extent *newext)
1061{ 1079{
1062 struct ext4_ext_path *curp = path; 1080 struct ext4_ext_path *curp = path;
1063 struct ext4_extent_header *neh; 1081 struct ext4_extent_header *neh;
@@ -1065,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1065 ext4_fsblk_t newblock; 1083 ext4_fsblk_t newblock;
1066 int err = 0; 1084 int err = 0;
1067 1085
1068 newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); 1086 newblock = ext4_ext_new_meta_block(handle, inode, path,
1087 newext, &err, flags);
1069 if (newblock == 0) 1088 if (newblock == 0)
1070 return err; 1089 return err;
1071 1090
@@ -1140,8 +1159,9 @@ out:
1140 * if no free index is found, then it requests in-depth growing. 1159 * if no free index is found, then it requests in-depth growing.
1141 */ 1160 */
1142static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, 1161static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1143 struct ext4_ext_path *path, 1162 unsigned int flags,
1144 struct ext4_extent *newext) 1163 struct ext4_ext_path *path,
1164 struct ext4_extent *newext)
1145{ 1165{
1146 struct ext4_ext_path *curp; 1166 struct ext4_ext_path *curp;
1147 int depth, i, err = 0; 1167 int depth, i, err = 0;
@@ -1161,7 +1181,7 @@ repeat:
1161 if (EXT_HAS_FREE_INDEX(curp)) { 1181 if (EXT_HAS_FREE_INDEX(curp)) {
1162 /* if we found index with free entry, then use that 1182 /* if we found index with free entry, then use that
1163 * entry: create all needed subtree and add new leaf */ 1183 * entry: create all needed subtree and add new leaf */
1164 err = ext4_ext_split(handle, inode, path, newext, i); 1184 err = ext4_ext_split(handle, inode, flags, path, newext, i);
1165 if (err) 1185 if (err)
1166 goto out; 1186 goto out;
1167 1187
@@ -1174,7 +1194,8 @@ repeat:
1174 err = PTR_ERR(path); 1194 err = PTR_ERR(path);
1175 } else { 1195 } else {
1176 /* tree is full, time to grow in depth */ 1196 /* tree is full, time to grow in depth */
1177 err = ext4_ext_grow_indepth(handle, inode, path, newext); 1197 err = ext4_ext_grow_indepth(handle, inode, flags,
1198 path, newext);
1178 if (err) 1199 if (err)
1179 goto out; 1200 goto out;
1180 1201
@@ -1563,7 +1584,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1563 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns 1584 * Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
1564 * 1 if they got merged. 1585 * 1 if they got merged.
1565 */ 1586 */
1566static int ext4_ext_try_to_merge(struct inode *inode, 1587static int ext4_ext_try_to_merge_right(struct inode *inode,
1567 struct ext4_ext_path *path, 1588 struct ext4_ext_path *path,
1568 struct ext4_extent *ex) 1589 struct ext4_extent *ex)
1569{ 1590{
@@ -1603,6 +1624,31 @@ static int ext4_ext_try_to_merge(struct inode *inode,
1603} 1624}
1604 1625
1605/* 1626/*
1627 * This function tries to merge the @ex extent to neighbours in the tree.
1628 * Return 1 if @ex was merged with its right neighbour, 0 otherwise.
1629 */
1630static int ext4_ext_try_to_merge(struct inode *inode,
1631 struct ext4_ext_path *path,
1632 struct ext4_extent *ex) {
1633 struct ext4_extent_header *eh;
1634 unsigned int depth;
1635 int merge_done = 0;
1636 int ret = 0;
1637
1638 depth = ext_depth(inode);
1639 BUG_ON(path[depth].p_hdr == NULL);
1640 eh = path[depth].p_hdr;
1641
1642 if (ex > EXT_FIRST_EXTENT(eh))
1643 merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
1644
1645 if (!merge_done)
1646 ret = ext4_ext_try_to_merge_right(inode, path, ex);
1647
1648 return ret;
1649}
1650
1651/*
1606 * check if a portion of the "newext" extent overlaps with an 1652 * check if a portion of the "newext" extent overlaps with an
1607 * existing extent. 1653 * existing extent.
1608 * 1654 *
@@ -1668,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1668 int depth, len, err; 1714 int depth, len, err;
1669 ext4_lblk_t next; 1715 ext4_lblk_t next;
1670 unsigned uninitialized = 0; 1716 unsigned uninitialized = 0;
1717 int flags = 0;
1671 1718
1672 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1719 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1673 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); 1720 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1742,7 +1789,9 @@ repeat:
1742 * There is no free space in the found leaf. 1789 * There is no free space in the found leaf.
1743 * We're gonna add a new leaf in the tree. 1790 * We're gonna add a new leaf in the tree.
1744 */ 1791 */
1745 err = ext4_ext_create_new_leaf(handle, inode, path, newext); 1792 if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
1793 flags = EXT4_MB_USE_ROOT_BLOCKS;
1794 err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
1746 if (err) 1795 if (err)
1747 goto cleanup; 1796 goto cleanup;
1748 depth = ext_depth(inode); 1797 depth = ext_depth(inode);
@@ -2003,13 +2052,25 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2003} 2052}
2004 2053
2005/* 2054/*
2055 * ext4_ext_check_cache()
2056 * Checks to see if the given block is in the cache.
2057 * If it is, the cached extent is stored in the given
2058 * cache extent pointer. If the cached extent is a hole,
2059 * this routine should be used instead of
2060 * ext4_ext_in_cache if the calling function needs to
2061 * know the size of the hole.
2062 *
2063 * @inode: The file's inode
2064 * @block: The block to look for in the cache
2065 * @ex: Pointer where the cached extent will be stored
2066 * if it contains block
2067 *
2006 * Return 0 if cache is invalid; 1 if the cache is valid 2068 * Return 0 if cache is invalid; 1 if the cache is valid
2007 */ 2069 */
2008static int 2070static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
2009ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, 2071 struct ext4_ext_cache *ex){
2010 struct ext4_extent *ex)
2011{
2012 struct ext4_ext_cache *cex; 2072 struct ext4_ext_cache *cex;
2073 struct ext4_sb_info *sbi;
2013 int ret = 0; 2074 int ret = 0;
2014 2075
2015 /* 2076 /*
@@ -2017,26 +2078,60 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2017 */ 2078 */
2018 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2079 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2019 cex = &EXT4_I(inode)->i_cached_extent; 2080 cex = &EXT4_I(inode)->i_cached_extent;
2081 sbi = EXT4_SB(inode->i_sb);
2020 2082
2021 /* has cache valid data? */ 2083 /* has cache valid data? */
2022 if (cex->ec_len == 0) 2084 if (cex->ec_len == 0)
2023 goto errout; 2085 goto errout;
2024 2086
2025 if (in_range(block, cex->ec_block, cex->ec_len)) { 2087 if (in_range(block, cex->ec_block, cex->ec_len)) {
2026 ex->ee_block = cpu_to_le32(cex->ec_block); 2088 memcpy(ex, cex, sizeof(struct ext4_ext_cache));
2027 ext4_ext_store_pblock(ex, cex->ec_start);
2028 ex->ee_len = cpu_to_le16(cex->ec_len);
2029 ext_debug("%u cached by %u:%u:%llu\n", 2089 ext_debug("%u cached by %u:%u:%llu\n",
2030 block, 2090 block,
2031 cex->ec_block, cex->ec_len, cex->ec_start); 2091 cex->ec_block, cex->ec_len, cex->ec_start);
2032 ret = 1; 2092 ret = 1;
2033 } 2093 }
2034errout: 2094errout:
2095 if (!ret)
2096 sbi->extent_cache_misses++;
2097 else
2098 sbi->extent_cache_hits++;
2035 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 2099 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2036 return ret; 2100 return ret;
2037} 2101}
2038 2102
2039/* 2103/*
2104 * ext4_ext_in_cache()
2105 * Checks to see if the given block is in the cache.
2106 * If it is, the cached extent is stored in the given
2107 * extent pointer.
2108 *
2109 * @inode: The file's inode
2110 * @block: The block to look for in the cache
2111 * @ex: Pointer where the cached extent will be stored
2112 * if it contains block
2113 *
2114 * Return 0 if cache is invalid; 1 if the cache is valid
2115 */
2116static int
2117ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2118 struct ext4_extent *ex)
2119{
2120 struct ext4_ext_cache cex;
2121 int ret = 0;
2122
2123 if (ext4_ext_check_cache(inode, block, &cex)) {
2124 ex->ee_block = cpu_to_le32(cex.ec_block);
2125 ext4_ext_store_pblock(ex, cex.ec_start);
2126 ex->ee_len = cpu_to_le16(cex.ec_len);
2127 ret = 1;
2128 }
2129
2130 return ret;
2131}
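The two lookups above differ only in what the caller gets back: ext4_ext_check_cache() hands out the raw ext4_ext_cache entry, so a cached hole's length stays visible, while ext4_ext_in_cache() converts the hit into a struct ext4_extent. A hedged usage sketch; the function name, the punch flag and the next variable are assumptions for illustration only:

	/* Sketch only: pick the lookup that matches what the caller needs. */
	static ext4_lblk_t next_block_after_cache(struct inode *inode,
						  ext4_lblk_t lblk, int punch)
	{
		struct ext4_ext_cache cex;
		struct ext4_extent ex;
		ext4_lblk_t next = lblk;

		if (punch && ext4_ext_check_cache(inode, lblk, &cex))
			/* hole punching needs the cached length, hole or not */
			next = cex.ec_block + cex.ec_len;
		else if (ext4_ext_in_cache(inode, lblk, &ex))
			/* ordinary mapping only cares about a mapped extent */
			next = le32_to_cpu(ex.ee_block) + ext4_ext_get_actual_len(&ex);

		return next;
	}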
2132
2133
2134/*
2040 * ext4_ext_rm_idx: 2135 * ext4_ext_rm_idx:
2041 * removes index from the index block. 2136 * removes index from the index block.
2042 * It's used in truncate case only, thus all requests are for 2137 * It's used in truncate case only, thus all requests are for
@@ -2163,8 +2258,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2163 ext4_free_blocks(handle, inode, NULL, start, num, flags); 2258 ext4_free_blocks(handle, inode, NULL, start, num, flags);
2164 } else if (from == le32_to_cpu(ex->ee_block) 2259 } else if (from == le32_to_cpu(ex->ee_block)
2165 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2260 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2166 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", 2261 /* head removal */
2167 from, to, le32_to_cpu(ex->ee_block), ee_len); 2262 ext4_lblk_t num;
2263 ext4_fsblk_t start;
2264
2265 num = to - from;
2266 start = ext4_ext_pblock(ex);
2267
2268 ext_debug("free first %u blocks starting %llu\n", num, start);
2269 ext4_free_blocks(handle, inode, 0, start, num, flags);
2270
2168 } else { 2271 } else {
2169 printk(KERN_INFO "strange request: removal(2) " 2272 printk(KERN_INFO "strange request: removal(2) "
2170 "%u-%u from %u:%u\n", 2273 "%u-%u from %u:%u\n",
@@ -2173,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2173 return 0; 2276 return 0;
2174} 2277}
2175 2278
2279
2280/*
2281 * ext4_ext_rm_leaf() Removes the extents associated with the
2282 * blocks appearing between "start" and "end", and splits the extents
2283 * if "start" and "end" appear in the same extent
2284 *
2285 * @handle: The journal handle
2286 * @inode: The file's inode
2287 * @path: The path to the leaf
2288 * @start: The first block to remove
2289 * @end: The last block to remove
2290 */
2176static int 2291static int
2177ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2292ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2178 struct ext4_ext_path *path, ext4_lblk_t start) 2293 struct ext4_ext_path *path, ext4_lblk_t start,
2294 ext4_lblk_t end)
2179{ 2295{
2180 int err = 0, correct_index = 0; 2296 int err = 0, correct_index = 0;
2181 int depth = ext_depth(inode), credits; 2297 int depth = ext_depth(inode), credits;
@@ -2186,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2186 unsigned short ex_ee_len; 2302 unsigned short ex_ee_len;
2187 unsigned uninitialized = 0; 2303 unsigned uninitialized = 0;
2188 struct ext4_extent *ex; 2304 struct ext4_extent *ex;
2305 struct ext4_map_blocks map;
2189 2306
2190 /* the header must be checked already in ext4_ext_remove_space() */ 2307 /* the header must be checked already in ext4_ext_remove_space() */
2191 ext_debug("truncate since %u in leaf\n", start); 2308 ext_debug("truncate since %u in leaf\n", start);
@@ -2215,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2215 path[depth].p_ext = ex; 2332 path[depth].p_ext = ex;
2216 2333
2217 a = ex_ee_block > start ? ex_ee_block : start; 2334 a = ex_ee_block > start ? ex_ee_block : start;
2218 b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ? 2335 b = ex_ee_block+ex_ee_len - 1 < end ?
2219 ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK; 2336 ex_ee_block+ex_ee_len - 1 : end;
2220 2337
2221 ext_debug(" border %u:%u\n", a, b); 2338 ext_debug(" border %u:%u\n", a, b);
2222 2339
2223 if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) { 2340 /* If this extent is beyond the end of the hole, skip it */
2224 block = 0; 2341 if (end <= ex_ee_block) {
2225 num = 0; 2342 ex--;
2226 BUG(); 2343 ex_ee_block = le32_to_cpu(ex->ee_block);
2344 ex_ee_len = ext4_ext_get_actual_len(ex);
2345 continue;
2346 } else if (a != ex_ee_block &&
2347 b != ex_ee_block + ex_ee_len - 1) {
2348 /*
2349 * If this is a truncate, then this condition should
2350 * never happen because at least one of the end points
2351 * needs to be on the edge of the extent.
2352 */
2353 if (end == EXT_MAX_BLOCK) {
2354 ext_debug(" bad truncate %u:%u\n",
2355 start, end);
2356 block = 0;
2357 num = 0;
2358 err = -EIO;
2359 goto out;
2360 }
2361 /*
2362 * else this is a hole punch, so the extent needs to
2363 * be split since neither edge of the hole is on the
2364 * extent edge
2365 */
2366 else{
2367 map.m_pblk = ext4_ext_pblock(ex);
2368 map.m_lblk = ex_ee_block;
2369 map.m_len = b - ex_ee_block;
2370
2371 err = ext4_split_extent(handle,
2372 inode, path, &map, 0,
2373 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
2374 EXT4_GET_BLOCKS_PRE_IO);
2375
2376 if (err < 0)
2377 goto out;
2378
2379 ex_ee_len = ext4_ext_get_actual_len(ex);
2380
2381 b = ex_ee_block+ex_ee_len - 1 < end ?
2382 ex_ee_block+ex_ee_len - 1 : end;
2383
2384 /* Then remove tail of this extent */
2385 block = ex_ee_block;
2386 num = a - block;
2387 }
2227 } else if (a != ex_ee_block) { 2388 } else if (a != ex_ee_block) {
2228 /* remove tail of the extent */ 2389 /* remove tail of the extent */
2229 block = ex_ee_block; 2390 block = ex_ee_block;
2230 num = a - block; 2391 num = a - block;
2231 } else if (b != ex_ee_block + ex_ee_len - 1) { 2392 } else if (b != ex_ee_block + ex_ee_len - 1) {
2232 /* remove head of the extent */ 2393 /* remove head of the extent */
2233 block = a; 2394 block = b;
2234 num = b - a; 2395 num = ex_ee_block + ex_ee_len - b;
2235 /* there is no "make a hole" API yet */ 2396
2236 BUG(); 2397 /*
2398 * If this is a truncate, this condition
2399 * should never happen
2400 */
2401 if (end == EXT_MAX_BLOCK) {
2402 ext_debug(" bad truncate %u:%u\n",
2403 start, end);
2404 err = -EIO;
2405 goto out;
2406 }
2237 } else { 2407 } else {
2238 /* remove whole extent: excellent! */ 2408 /* remove whole extent: excellent! */
2239 block = ex_ee_block; 2409 block = ex_ee_block;
2240 num = 0; 2410 num = 0;
2241 BUG_ON(a != ex_ee_block); 2411 if (a != ex_ee_block) {
2242 BUG_ON(b != ex_ee_block + ex_ee_len - 1); 2412 ext_debug(" bad truncate %u:%u\n",
2413 start, end);
2414 err = -EIO;
2415 goto out;
2416 }
2417
2418 if (b != ex_ee_block + ex_ee_len - 1) {
2419 ext_debug(" bad truncate %u:%u\n",
2420 start, end);
2421 err = -EIO;
2422 goto out;
2423 }
2243 } 2424 }
2244 2425
2245 /* 2426 /*
@@ -2270,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2270 if (num == 0) { 2451 if (num == 0) {
2271 /* this extent is removed; mark slot entirely unused */ 2452 /* this extent is removed; mark slot entirely unused */
2272 ext4_ext_store_pblock(ex, 0); 2453 ext4_ext_store_pblock(ex, 0);
2273 le16_add_cpu(&eh->eh_entries, -1); 2454 } else if (block != ex_ee_block) {
2455 /*
2456 * If this was a head removal, then we need to update
2457 * the physical block since it is now at a different
2458 * location
2459 */
2460 ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
2274 } 2461 }
2275 2462
2276 ex->ee_block = cpu_to_le32(block); 2463 ex->ee_block = cpu_to_le32(block);
@@ -2286,6 +2473,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2286 if (err) 2473 if (err)
2287 goto out; 2474 goto out;
2288 2475
2476 /*
2477 * If the extent was completely released,
2478 * we need to remove it from the leaf
2479 */
2480 if (num == 0) {
2481 if (end != EXT_MAX_BLOCK) {
2482 /*
2483 * For hole punching, we need to scoot all the
2484 * extents up when an extent is removed so that
2485 * we don't have blank extents in the middle
2486 */
2487 memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
2488 sizeof(struct ext4_extent));
2489
2490 /* Now get rid of the one at the end */
2491 memset(EXT_LAST_EXTENT(eh), 0,
2492 sizeof(struct ext4_extent));
2493 }
2494 le16_add_cpu(&eh->eh_entries, -1);
2495 }
2496
2289 ext_debug("new extent: %u:%u:%llu\n", block, num, 2497 ext_debug("new extent: %u:%u:%llu\n", block, num,
2290 ext4_ext_pblock(ex)); 2498 ext4_ext_pblock(ex));
2291 ex--; 2499 ex--;
@@ -2326,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
2326 return 1; 2534 return 1;
2327} 2535}
2328 2536
2329static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) 2537static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2538 ext4_lblk_t end)
2330{ 2539{
2331 struct super_block *sb = inode->i_sb; 2540 struct super_block *sb = inode->i_sb;
2332 int depth = ext_depth(inode); 2541 int depth = ext_depth(inode);
@@ -2365,7 +2574,8 @@ again:
2365 while (i >= 0 && err == 0) { 2574 while (i >= 0 && err == 0) {
2366 if (i == depth) { 2575 if (i == depth) {
2367 /* this is leaf block */ 2576 /* this is leaf block */
2368 err = ext4_ext_rm_leaf(handle, inode, path, start); 2577 err = ext4_ext_rm_leaf(handle, inode, path,
2578 start, end);
2369 /* root level has p_bh == NULL, brelse() eats this */ 2579 /* root level has p_bh == NULL, brelse() eats this */
2370 brelse(path[i].p_bh); 2580 brelse(path[i].p_bh);
2371 path[i].p_bh = NULL; 2581 path[i].p_bh = NULL;
@@ -2529,6 +2739,195 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2529 return ret; 2739 return ret;
2530} 2740}
2531 2741
2742/*
2743 * used by extent splitting.
2744 */
2745#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
2746 due to ENOSPC */
2747#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
2748#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
2749
2750/*
2751 * ext4_split_extent_at() splits an extent at given block.
2752 *
2753 * @handle: the journal handle
2754 * @inode: the file inode
2755 * @path: the path to the extent
2756 * @split: the logical block where the extent is split.
2757 * @split_flag: indicates if the extent could be zeroed out if the split fails,
2758 * and the states (init or uninit) of the new extents.
2759 * @flags: flags used to insert the new extent into the extent tree.
2760 *
2761 *
2762 * Splits extent [a, b] into two extents [a, @split) and [@split, b], the states
2763 * of which are determined by split_flag.
2764 *
2765 * There are two cases:
2766 * a> the extent is split into two extents.
2767 * b> no split is needed, just mark the extent.
2768 *
2769 * return 0 on success.
2770 */
2771static int ext4_split_extent_at(handle_t *handle,
2772 struct inode *inode,
2773 struct ext4_ext_path *path,
2774 ext4_lblk_t split,
2775 int split_flag,
2776 int flags)
2777{
2778 ext4_fsblk_t newblock;
2779 ext4_lblk_t ee_block;
2780 struct ext4_extent *ex, newex, orig_ex;
2781 struct ext4_extent *ex2 = NULL;
2782 unsigned int ee_len, depth;
2783 int err = 0;
2784
2785 ext_debug("ext4_split_extents_at: inode %lu, logical"
2786 "block %llu\n", inode->i_ino, (unsigned long long)split);
2787
2788 ext4_ext_show_leaf(inode, path);
2789
2790 depth = ext_depth(inode);
2791 ex = path[depth].p_ext;
2792 ee_block = le32_to_cpu(ex->ee_block);
2793 ee_len = ext4_ext_get_actual_len(ex);
2794 newblock = split - ee_block + ext4_ext_pblock(ex);
2795
2796 BUG_ON(split < ee_block || split >= (ee_block + ee_len));
2797
2798 err = ext4_ext_get_access(handle, inode, path + depth);
2799 if (err)
2800 goto out;
2801
2802 if (split == ee_block) {
2803 /*
2804 * case b: block @split is the block that the extent begins with
2805 * then we just change the state of the extent, and splitting
2806 * is not needed.
2807 */
2808 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2809 ext4_ext_mark_uninitialized(ex);
2810 else
2811 ext4_ext_mark_initialized(ex);
2812
2813 if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
2814 ext4_ext_try_to_merge(inode, path, ex);
2815
2816 err = ext4_ext_dirty(handle, inode, path + depth);
2817 goto out;
2818 }
2819
2820 /* case a */
2821 memcpy(&orig_ex, ex, sizeof(orig_ex));
2822 ex->ee_len = cpu_to_le16(split - ee_block);
2823 if (split_flag & EXT4_EXT_MARK_UNINIT1)
2824 ext4_ext_mark_uninitialized(ex);
2825
2826 /*
2827 * path may lead to new leaf, not to original leaf any more
2828 * after ext4_ext_insert_extent() returns,
2829 */
2830 err = ext4_ext_dirty(handle, inode, path + depth);
2831 if (err)
2832 goto fix_extent_len;
2833
2834 ex2 = &newex;
2835 ex2->ee_block = cpu_to_le32(split);
2836 ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
2837 ext4_ext_store_pblock(ex2, newblock);
2838 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2839 ext4_ext_mark_uninitialized(ex2);
2840
2841 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
2842 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2843 err = ext4_ext_zeroout(inode, &orig_ex);
2844 if (err)
2845 goto fix_extent_len;
2846 /* update the extent length and mark as initialized */
2847 ex->ee_len = cpu_to_le16(ee_len);
2848 ext4_ext_try_to_merge(inode, path, ex);
2849 err = ext4_ext_dirty(handle, inode, path + depth);
2850 goto out;
2851 } else if (err)
2852 goto fix_extent_len;
2853
2854out:
2855 ext4_ext_show_leaf(inode, path);
2856 return err;
2857
2858fix_extent_len:
2859 ex->ee_len = orig_ex.ee_len;
2860 ext4_ext_dirty(handle, inode, path + depth);
2861 return err;
2862}
2863
2864/*
2865 * ext4_split_extent() splits an extent and marks the extent which is covered
2866 * by @map as split_flag indicates
2867 *
2868 * It may result in splitting the extent into multiple extents (up to three)
2869 * There are three possibilities:
2870 * a> There is no split required
2871 * b> Splits into two extents: the split happens at either end of the extent
2872 * c> Splits into three extents: someone is splitting in the middle of the extent
2873 *
2874 */
2875static int ext4_split_extent(handle_t *handle,
2876 struct inode *inode,
2877 struct ext4_ext_path *path,
2878 struct ext4_map_blocks *map,
2879 int split_flag,
2880 int flags)
2881{
2882 ext4_lblk_t ee_block;
2883 struct ext4_extent *ex;
2884 unsigned int ee_len, depth;
2885 int err = 0;
2886 int uninitialized;
2887 int split_flag1, flags1;
2888
2889 depth = ext_depth(inode);
2890 ex = path[depth].p_ext;
2891 ee_block = le32_to_cpu(ex->ee_block);
2892 ee_len = ext4_ext_get_actual_len(ex);
2893 uninitialized = ext4_ext_is_uninitialized(ex);
2894
2895 if (map->m_lblk + map->m_len < ee_block + ee_len) {
2896 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2897 EXT4_EXT_MAY_ZEROOUT : 0;
2898 flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
2899 if (uninitialized)
2900 split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
2901 EXT4_EXT_MARK_UNINIT2;
2902 err = ext4_split_extent_at(handle, inode, path,
2903 map->m_lblk + map->m_len, split_flag1, flags1);
2904 if (err)
2905 goto out;
2906 }
2907
2908 ext4_ext_drop_refs(path);
2909 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2910 if (IS_ERR(path))
2911 return PTR_ERR(path);
2912
2913 if (map->m_lblk >= ee_block) {
2914 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
2915 EXT4_EXT_MAY_ZEROOUT : 0;
2916 if (uninitialized)
2917 split_flag1 |= EXT4_EXT_MARK_UNINIT1;
2918 if (split_flag & EXT4_EXT_MARK_UNINIT2)
2919 split_flag1 |= EXT4_EXT_MARK_UNINIT2;
2920 err = ext4_split_extent_at(handle, inode, path,
2921 map->m_lblk, split_flag1, flags);
2922 if (err)
2923 goto out;
2924 }
2925
2926 ext4_ext_show_leaf(inode, path);
2927out:
2928 return err ? err : map->m_len;
2929}
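To make the three-extent case above concrete: when @map lies strictly inside the extent, the first ext4_split_extent_at() call splits at map->m_lblk + map->m_len and the second at map->m_lblk, leaving three pieces. The arithmetic, with assumed example numbers (ee_block = 100, ee_len = 50, m_lblk = 120, m_len = 10 gives the ranges 100-119, 120-129 and 130-149); the helper below is illustrative only:

	/* Sketch only: the logical ranges left behind by the two-step split. */
	static void show_split_ranges(ext4_lblk_t ee_block, unsigned int ee_len,
				      ext4_lblk_t m_lblk, unsigned int m_len)
	{
		printk(KERN_DEBUG "left  : %u..%u\n", ee_block, m_lblk - 1);
		printk(KERN_DEBUG "middle: %u..%u\n", m_lblk, m_lblk + m_len - 1);
		printk(KERN_DEBUG "right : %u..%u\n", m_lblk + m_len,
		       ee_block + ee_len - 1);
	}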
2930
2532#define EXT4_EXT_ZERO_LEN 7 2931#define EXT4_EXT_ZERO_LEN 7
2533/* 2932/*
2534 * This function is called by ext4_ext_map_blocks() if someone tries to write 2933 * This function is called by ext4_ext_map_blocks() if someone tries to write
@@ -2545,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2545 struct ext4_map_blocks *map, 2944 struct ext4_map_blocks *map,
2546 struct ext4_ext_path *path) 2945 struct ext4_ext_path *path)
2547{ 2946{
2548 struct ext4_extent *ex, newex, orig_ex; 2947 struct ext4_map_blocks split_map;
2549 struct ext4_extent *ex1 = NULL; 2948 struct ext4_extent zero_ex;
2550 struct ext4_extent *ex2 = NULL; 2949 struct ext4_extent *ex;
2551 struct ext4_extent *ex3 = NULL;
2552 struct ext4_extent_header *eh;
2553 ext4_lblk_t ee_block, eof_block; 2950 ext4_lblk_t ee_block, eof_block;
2554 unsigned int allocated, ee_len, depth; 2951 unsigned int allocated, ee_len, depth;
2555 ext4_fsblk_t newblock;
2556 int err = 0; 2952 int err = 0;
2557 int ret = 0; 2953 int split_flag = 0;
2558 int may_zeroout;
2559 2954
2560 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" 2955 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2561 "block %llu, max_blocks %u\n", inode->i_ino, 2956 "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2567,280 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2567 eof_block = map->m_lblk + map->m_len; 2962 eof_block = map->m_lblk + map->m_len;
2568 2963
2569 depth = ext_depth(inode); 2964 depth = ext_depth(inode);
2570 eh = path[depth].p_hdr;
2571 ex = path[depth].p_ext; 2965 ex = path[depth].p_ext;
2572 ee_block = le32_to_cpu(ex->ee_block); 2966 ee_block = le32_to_cpu(ex->ee_block);
2573 ee_len = ext4_ext_get_actual_len(ex); 2967 ee_len = ext4_ext_get_actual_len(ex);
2574 allocated = ee_len - (map->m_lblk - ee_block); 2968 allocated = ee_len - (map->m_lblk - ee_block);
2575 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2576
2577 ex2 = ex;
2578 orig_ex.ee_block = ex->ee_block;
2579 orig_ex.ee_len = cpu_to_le16(ee_len);
2580 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2581 2969
2970 WARN_ON(map->m_lblk < ee_block);
2582 /* 2971 /*
2583 * It is safe to convert extent to initialized via explicit 2972 * It is safe to convert extent to initialized via explicit
2584 * zeroout only if extent is fully inside i_size or new_size. 2973 * zeroout only if extent is fully inside i_size or new_size.
2585 */ 2974 */
2586 may_zeroout = ee_block + ee_len <= eof_block; 2975 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2587 2976
2588 err = ext4_ext_get_access(handle, inode, path + depth);
2589 if (err)
2590 goto out;
2591 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ 2977 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
2592 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { 2978 if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
2593 err = ext4_ext_zeroout(inode, &orig_ex); 2979 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
2980 err = ext4_ext_zeroout(inode, ex);
2594 if (err) 2981 if (err)
2595 goto fix_extent_len;
2596 /* update the extent length and mark as initialized */
2597 ex->ee_block = orig_ex.ee_block;
2598 ex->ee_len = orig_ex.ee_len;
2599 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2600 ext4_ext_dirty(handle, inode, path + depth);
2601 /* zeroed the full extent */
2602 return allocated;
2603 }
2604
2605 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2606 if (map->m_lblk > ee_block) {
2607 ex1 = ex;
2608 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2609 ext4_ext_mark_uninitialized(ex1);
2610 ex2 = &newex;
2611 }
2612 /*
2613 * for sanity, update the length of the ex2 extent before
2614 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2615 * overlap of blocks.
2616 */
2617 if (!ex1 && allocated > map->m_len)
2618 ex2->ee_len = cpu_to_le16(map->m_len);
2619 /* ex3: to ee_block + ee_len : uninitialised */
2620 if (allocated > map->m_len) {
2621 unsigned int newdepth;
2622 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
2623 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2624 /*
2625 * map->m_lblk == ee_block is handled by the zerouout
2626 * at the beginning.
2627 * Mark first half uninitialized.
2628 * Mark second half initialized and zero out the
2629 * initialized extent
2630 */
2631 ex->ee_block = orig_ex.ee_block;
2632 ex->ee_len = cpu_to_le16(ee_len - allocated);
2633 ext4_ext_mark_uninitialized(ex);
2634 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2635 ext4_ext_dirty(handle, inode, path + depth);
2636
2637 ex3 = &newex;
2638 ex3->ee_block = cpu_to_le32(map->m_lblk);
2639 ext4_ext_store_pblock(ex3, newblock);
2640 ex3->ee_len = cpu_to_le16(allocated);
2641 err = ext4_ext_insert_extent(handle, inode, path,
2642 ex3, 0);
2643 if (err == -ENOSPC) {
2644 err = ext4_ext_zeroout(inode, &orig_ex);
2645 if (err)
2646 goto fix_extent_len;
2647 ex->ee_block = orig_ex.ee_block;
2648 ex->ee_len = orig_ex.ee_len;
2649 ext4_ext_store_pblock(ex,
2650 ext4_ext_pblock(&orig_ex));
2651 ext4_ext_dirty(handle, inode, path + depth);
2652 /* blocks available from map->m_lblk */
2653 return allocated;
2654
2655 } else if (err)
2656 goto fix_extent_len;
2657
2658 /*
2659 * We need to zero out the second half because
2660 * an fallocate request can update file size and
2661 * converting the second half to initialized extent
2662 * implies that we can leak some junk data to user
2663 * space.
2664 */
2665 err = ext4_ext_zeroout(inode, ex3);
2666 if (err) {
2667 /*
2668 * We should actually mark the
2669 * second half as uninit and return error
2670 * Insert would have changed the extent
2671 */
2672 depth = ext_depth(inode);
2673 ext4_ext_drop_refs(path);
2674 path = ext4_ext_find_extent(inode, map->m_lblk,
2675 path);
2676 if (IS_ERR(path)) {
2677 err = PTR_ERR(path);
2678 return err;
2679 }
2680 /* get the second half extent details */
2681 ex = path[depth].p_ext;
2682 err = ext4_ext_get_access(handle, inode,
2683 path + depth);
2684 if (err)
2685 return err;
2686 ext4_ext_mark_uninitialized(ex);
2687 ext4_ext_dirty(handle, inode, path + depth);
2688 return err;
2689 }
2690
2691 /* zeroed the second half */
2692 return allocated;
2693 }
2694 ex3 = &newex;
2695 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2696 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2697 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2698 ext4_ext_mark_uninitialized(ex3);
2699 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2700 if (err == -ENOSPC && may_zeroout) {
2701 err = ext4_ext_zeroout(inode, &orig_ex);
2702 if (err)
2703 goto fix_extent_len;
2704 /* update the extent length and mark as initialized */
2705 ex->ee_block = orig_ex.ee_block;
2706 ex->ee_len = orig_ex.ee_len;
2707 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2708 ext4_ext_dirty(handle, inode, path + depth);
2709 /* zeroed the full extent */
2710 /* blocks available from map->m_lblk */
2711 return allocated;
2712
2713 } else if (err)
2714 goto fix_extent_len;
2715 /*
2716 * The depth, and hence eh & ex might change
2717 * as part of the insert above.
2718 */
2719 newdepth = ext_depth(inode);
2720 /*
2721 * update the extent length after successful insert of the
2722 * split extent
2723 */
2724 ee_len -= ext4_ext_get_actual_len(ex3);
2725 orig_ex.ee_len = cpu_to_le16(ee_len);
2726 may_zeroout = ee_block + ee_len <= eof_block;
2727
2728 depth = newdepth;
2729 ext4_ext_drop_refs(path);
2730 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2731 if (IS_ERR(path)) {
2732 err = PTR_ERR(path);
2733 goto out; 2982 goto out;
2734 }
2735 eh = path[depth].p_hdr;
2736 ex = path[depth].p_ext;
2737 if (ex2 != &newex)
2738 ex2 = ex;
2739 2983
2740 err = ext4_ext_get_access(handle, inode, path + depth); 2984 err = ext4_ext_get_access(handle, inode, path + depth);
2741 if (err) 2985 if (err)
2742 goto out; 2986 goto out;
2743 2987 ext4_ext_mark_initialized(ex);
2744 allocated = map->m_len; 2988 ext4_ext_try_to_merge(inode, path, ex);
2745 2989 err = ext4_ext_dirty(handle, inode, path + depth);
2746 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2990 goto out;
2747 * to insert a extent in the middle zerout directly
2748 * otherwise give the extent a chance to merge to left
2749 */
2750 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2751 map->m_lblk != ee_block && may_zeroout) {
2752 err = ext4_ext_zeroout(inode, &orig_ex);
2753 if (err)
2754 goto fix_extent_len;
2755 /* update the extent length and mark as initialized */
2756 ex->ee_block = orig_ex.ee_block;
2757 ex->ee_len = orig_ex.ee_len;
2758 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2759 ext4_ext_dirty(handle, inode, path + depth);
2760 /* zero out the first half */
2761 /* blocks available from map->m_lblk */
2762 return allocated;
2763 }
2764 }
2765 /*
2766 * If there was a change of depth as part of the
2767 * insertion of ex3 above, we need to update the length
2768 * of the ex1 extent again here
2769 */
2770 if (ex1 && ex1 != ex) {
2771 ex1 = ex;
2772 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2773 ext4_ext_mark_uninitialized(ex1);
2774 ex2 = &newex;
2775 }
2776 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2777 ex2->ee_block = cpu_to_le32(map->m_lblk);
2778 ext4_ext_store_pblock(ex2, newblock);
2779 ex2->ee_len = cpu_to_le16(allocated);
2780 if (ex2 != ex)
2781 goto insert;
2782 /*
2783 * New (initialized) extent starts from the first block
2784 * in the current extent. i.e., ex2 == ex
2785 * We have to see if it can be merged with the extent
2786 * on the left.
2787 */
2788 if (ex2 > EXT_FIRST_EXTENT(eh)) {
2789 /*
2790 * To merge left, pass "ex2 - 1" to try_to_merge(),
2791 * since it merges towards right _only_.
2792 */
2793 ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
2794 if (ret) {
2795 err = ext4_ext_correct_indexes(handle, inode, path);
2796 if (err)
2797 goto out;
2798 depth = ext_depth(inode);
2799 ex2--;
2800 }
2801 } 2991 }
2992
2802 /* 2993 /*
2803 * Try to Merge towards right. This might be required 2994 * four cases:
2804 * only when the whole extent is being written to. 2995 * 1. split the extent into three extents.
2805 * i.e. ex2 == ex and ex3 == NULL. 2996 * 2. split the extent into two extents, zeroout the first half.
2997 * 3. split the extent into two extents, zeroout the second half.
2998 * 4. split the extent into two extents with out zeroout.
2806 */ 2999 */
2807 if (!ex3) { 3000 split_map.m_lblk = map->m_lblk;
2808 ret = ext4_ext_try_to_merge(inode, path, ex2); 3001 split_map.m_len = map->m_len;
2809 if (ret) { 3002
2810 err = ext4_ext_correct_indexes(handle, inode, path); 3003 if (allocated > map->m_len) {
3004 if (allocated <= EXT4_EXT_ZERO_LEN &&
3005 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3006 /* case 3 */
3007 zero_ex.ee_block =
3008 cpu_to_le32(map->m_lblk);
3009 zero_ex.ee_len = cpu_to_le16(allocated);
3010 ext4_ext_store_pblock(&zero_ex,
3011 ext4_ext_pblock(ex) + map->m_lblk - ee_block);
3012 err = ext4_ext_zeroout(inode, &zero_ex);
2811 if (err) 3013 if (err)
2812 goto out; 3014 goto out;
3015 split_map.m_lblk = map->m_lblk;
3016 split_map.m_len = allocated;
3017 } else if ((map->m_lblk - ee_block + map->m_len <
3018 EXT4_EXT_ZERO_LEN) &&
3019 (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3020 /* case 2 */
3021 if (map->m_lblk != ee_block) {
3022 zero_ex.ee_block = ex->ee_block;
3023 zero_ex.ee_len = cpu_to_le16(map->m_lblk -
3024 ee_block);
3025 ext4_ext_store_pblock(&zero_ex,
3026 ext4_ext_pblock(ex));
3027 err = ext4_ext_zeroout(inode, &zero_ex);
3028 if (err)
3029 goto out;
3030 }
3031
3032 split_map.m_lblk = ee_block;
3033 split_map.m_len = map->m_lblk - ee_block + map->m_len;
3034 allocated = map->m_len;
2813 } 3035 }
2814 } 3036 }
2815 /* Mark modified extent as dirty */ 3037
2816 err = ext4_ext_dirty(handle, inode, path + depth); 3038 allocated = ext4_split_extent(handle, inode, path,
2817 goto out; 3039 &split_map, split_flag, 0);
2818insert: 3040 if (allocated < 0)
2819 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 3041 err = allocated;
2820 if (err == -ENOSPC && may_zeroout) { 3042
2821 err = ext4_ext_zeroout(inode, &orig_ex);
2822 if (err)
2823 goto fix_extent_len;
2824 /* update the extent length and mark as initialized */
2825 ex->ee_block = orig_ex.ee_block;
2826 ex->ee_len = orig_ex.ee_len;
2827 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2828 ext4_ext_dirty(handle, inode, path + depth);
2829 /* zero out the first half */
2830 return allocated;
2831 } else if (err)
2832 goto fix_extent_len;
2833out: 3043out:
2834 ext4_ext_show_leaf(inode, path);
2835 return err ? err : allocated; 3044 return err ? err : allocated;
2836
2837fix_extent_len:
2838 ex->ee_block = orig_ex.ee_block;
2839 ex->ee_len = orig_ex.ee_len;
2840 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2841 ext4_ext_mark_uninitialized(ex);
2842 ext4_ext_dirty(handle, inode, path + depth);
2843 return err;
2844} 3045}
2845 3046
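The rewritten conversion path above boils the old ex1/ex2/ex3 bookkeeping down to one decision: build a split_map covering the range to initialize and, when the uninitialized extent is short enough, zero out one side instead of creating a third extent, then hand the map to ext4_split_extent(). A minimal stand-alone sketch of that case selection (the struct and the numeric EXT4_EXT_ZERO_LEN threshold are stand-ins, not the kernel definitions):

	/* Hypothetical userspace sketch of the "four cases" selection; not kernel code. */
	#include <stdio.h>

	#define EXT4_EXT_ZERO_LEN 7		/* assumed zeroout threshold, in blocks */

	struct blk_map { unsigned lblk, len; };

	static void choose_split(struct blk_map *split, const struct blk_map *req,
				 unsigned ee_block, unsigned allocated, int may_zeroout)
	{
		/* default (cases 1 and 4): split exactly the requested range */
		split->lblk = req->lblk;
		split->len  = req->len;

		if (allocated <= req->len)
			return;				/* no tail beyond the request */

		if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
			/* case 3: zero from req->lblk to the extent's end, one split */
			split->len = allocated;
		} else if (req->lblk - ee_block + req->len < EXT4_EXT_ZERO_LEN &&
			   may_zeroout) {
			/* case 2: zero the head of the extent, one split */
			split->lblk = ee_block;
			split->len  = req->lblk - ee_block + req->len;
		}
		/* otherwise case 1: a full three-way split is left to ext4_split_extent() */
	}

	int main(void)
	{
		struct blk_map req = { .lblk = 10, .len = 3 }, split;

		/* extent starts at block 8, 10 blocks remain from lblk 10; write 10..12 */
		choose_split(&split, &req, 8, 10, 1);
		printf("split %u+%u\n", split.lblk, split.len);
		return 0;
	}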
2846/* 3047/*
@@ -2871,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2871 struct ext4_ext_path *path, 3072 struct ext4_ext_path *path,
2872 int flags) 3073 int flags)
2873{ 3074{
2874 struct ext4_extent *ex, newex, orig_ex; 3075 ext4_lblk_t eof_block;
2875 struct ext4_extent *ex1 = NULL; 3076 ext4_lblk_t ee_block;
2876 struct ext4_extent *ex2 = NULL; 3077 struct ext4_extent *ex;
2877 struct ext4_extent *ex3 = NULL; 3078 unsigned int ee_len;
2878 ext4_lblk_t ee_block, eof_block; 3079 int split_flag = 0, depth;
2879 unsigned int allocated, ee_len, depth;
2880 ext4_fsblk_t newblock;
2881 int err = 0;
2882 int may_zeroout;
2883 3080
2884 ext_debug("ext4_split_unwritten_extents: inode %lu, logical" 3081 ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
2885 "block %llu, max_blocks %u\n", inode->i_ino, 3082 "block %llu, max_blocks %u\n", inode->i_ino,
@@ -2889,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2889 inode->i_sb->s_blocksize_bits; 3086 inode->i_sb->s_blocksize_bits;
2890 if (eof_block < map->m_lblk + map->m_len) 3087 if (eof_block < map->m_lblk + map->m_len)
2891 eof_block = map->m_lblk + map->m_len; 3088 eof_block = map->m_lblk + map->m_len;
2892
2893 depth = ext_depth(inode);
2894 ex = path[depth].p_ext;
2895 ee_block = le32_to_cpu(ex->ee_block);
2896 ee_len = ext4_ext_get_actual_len(ex);
2897 allocated = ee_len - (map->m_lblk - ee_block);
2898 newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
2899
2900 ex2 = ex;
2901 orig_ex.ee_block = ex->ee_block;
2902 orig_ex.ee_len = cpu_to_le16(ee_len);
2903 ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
2904
2905 /* 3089 /*
2906 * It is safe to convert extent to initialized via explicit 3090 * It is safe to convert extent to initialized via explicit
2907 * zeroout only if extent is fully insde i_size or new_size. 3091 * zeroout only if extent is fully insde i_size or new_size.
2908 */ 3092 */
2909 may_zeroout = ee_block + ee_len <= eof_block; 3093 depth = ext_depth(inode);
2910 3094 ex = path[depth].p_ext;
2911 /* 3095 ee_block = le32_to_cpu(ex->ee_block);
2912 * If the uninitialized extent begins at the same logical 3096 ee_len = ext4_ext_get_actual_len(ex);
2913 * block where the write begins, and the write completely
2914 * covers the extent, then we don't need to split it.
2915 */
2916 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2917 return allocated;
2918
2919 err = ext4_ext_get_access(handle, inode, path + depth);
2920 if (err)
2921 goto out;
2922 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2923 if (map->m_lblk > ee_block) {
2924 ex1 = ex;
2925 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2926 ext4_ext_mark_uninitialized(ex1);
2927 ex2 = &newex;
2928 }
2929 /*
2930 * for sanity, update the length of the ex2 extent before
2931 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2932 * overlap of blocks.
2933 */
2934 if (!ex1 && allocated > map->m_len)
2935 ex2->ee_len = cpu_to_le16(map->m_len);
2936 /* ex3: to ee_block + ee_len : uninitialised */
2937 if (allocated > map->m_len) {
2938 unsigned int newdepth;
2939 ex3 = &newex;
2940 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2941 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2942 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2943 ext4_ext_mark_uninitialized(ex3);
2944 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2945 if (err == -ENOSPC && may_zeroout) {
2946 err = ext4_ext_zeroout(inode, &orig_ex);
2947 if (err)
2948 goto fix_extent_len;
2949 /* update the extent length and mark as initialized */
2950 ex->ee_block = orig_ex.ee_block;
2951 ex->ee_len = orig_ex.ee_len;
2952 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
2953 ext4_ext_dirty(handle, inode, path + depth);
2954 /* zeroed the full extent */
2955 /* blocks available from map->m_lblk */
2956 return allocated;
2957
2958 } else if (err)
2959 goto fix_extent_len;
2960 /*
2961 * The depth, and hence eh & ex might change
2962 * as part of the insert above.
2963 */
2964 newdepth = ext_depth(inode);
2965 /*
2966 * update the extent length after successful insert of the
2967 * split extent
2968 */
2969 ee_len -= ext4_ext_get_actual_len(ex3);
2970 orig_ex.ee_len = cpu_to_le16(ee_len);
2971 may_zeroout = ee_block + ee_len <= eof_block;
2972
2973 depth = newdepth;
2974 ext4_ext_drop_refs(path);
2975 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2976 if (IS_ERR(path)) {
2977 err = PTR_ERR(path);
2978 goto out;
2979 }
2980 ex = path[depth].p_ext;
2981 if (ex2 != &newex)
2982 ex2 = ex;
2983 3097
2984 err = ext4_ext_get_access(handle, inode, path + depth); 3098 split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
2985 if (err) 3099 split_flag |= EXT4_EXT_MARK_UNINIT2;
2986 goto out;
2987 3100
2988 allocated = map->m_len; 3101 flags |= EXT4_GET_BLOCKS_PRE_IO;
2989 } 3102 return ext4_split_extent(handle, inode, path, map, split_flag, flags);
2990 /*
2991 * If there was a change of depth as part of the
2992 * insertion of ex3 above, we need to update the length
2993 * of the ex1 extent again here
2994 */
2995 if (ex1 && ex1 != ex) {
2996 ex1 = ex;
2997 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2998 ext4_ext_mark_uninitialized(ex1);
2999 ex2 = &newex;
3000 }
3001 /*
3002 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3003 * using direct I/O, uninitialised still.
3004 */
3005 ex2->ee_block = cpu_to_le32(map->m_lblk);
3006 ext4_ext_store_pblock(ex2, newblock);
3007 ex2->ee_len = cpu_to_le16(allocated);
3008 ext4_ext_mark_uninitialized(ex2);
3009 if (ex2 != ex)
3010 goto insert;
3011 /* Mark modified extent as dirty */
3012 err = ext4_ext_dirty(handle, inode, path + depth);
3013 ext_debug("out here\n");
3014 goto out;
3015insert:
3016 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3017 if (err == -ENOSPC && may_zeroout) {
3018 err = ext4_ext_zeroout(inode, &orig_ex);
3019 if (err)
3020 goto fix_extent_len;
3021 /* update the extent length and mark as initialized */
3022 ex->ee_block = orig_ex.ee_block;
3023 ex->ee_len = orig_ex.ee_len;
3024 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3025 ext4_ext_dirty(handle, inode, path + depth);
3026 /* zero out the first half */
3027 return allocated;
3028 } else if (err)
3029 goto fix_extent_len;
3030out:
3031 ext4_ext_show_leaf(inode, path);
3032 return err ? err : allocated;
3033
3034fix_extent_len:
3035 ex->ee_block = orig_ex.ee_block;
3036 ex->ee_len = orig_ex.ee_len;
3037 ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
3038 ext4_ext_mark_uninitialized(ex);
3039 ext4_ext_dirty(handle, inode, path + depth);
3040 return err;
3041} 3103}
3104
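After this rewrite, ext4_split_unwritten_extents() is reduced to deriving two flags and delegating to ext4_split_extent(). A sketch of that flag derivation (the flag bit values below are illustrative, not the kernel's definitions):

	#include <stdio.h>

	/* illustrative bits; the real EXT4_EXT_* split flags are defined by ext4 */
	#define EXT4_EXT_MAY_ZEROOUT	0x1
	#define EXT4_EXT_MARK_UNINIT2	0x2

	static int unwritten_split_flags(unsigned ee_block, unsigned ee_len,
					 unsigned eof_block)
	{
		int split_flag = 0;

		/* zeroout is only safe when the extent lies fully inside i_size */
		if (ee_block + ee_len <= eof_block)
			split_flag |= EXT4_EXT_MAY_ZEROOUT;
		/* the tail extent produced by the split stays uninitialized */
		split_flag |= EXT4_EXT_MARK_UNINIT2;
		return split_flag;
	}

	int main(void)
	{
		printf("flags=%#x\n", unwritten_split_flags(100, 16, 200));
		return 0;
	}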
3042static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3105static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3043 struct inode *inode, 3106 struct inode *inode,
3044 struct ext4_ext_path *path) 3107 struct ext4_ext_path *path)
@@ -3047,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3047 struct ext4_extent_header *eh; 3110 struct ext4_extent_header *eh;
3048 int depth; 3111 int depth;
3049 int err = 0; 3112 int err = 0;
3050 int ret = 0;
3051 3113
3052 depth = ext_depth(inode); 3114 depth = ext_depth(inode);
3053 eh = path[depth].p_hdr; 3115 eh = path[depth].p_hdr;
3054 ex = path[depth].p_ext; 3116 ex = path[depth].p_ext;
3055 3117
3118 ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
3119 "block %llu, max_blocks %u\n", inode->i_ino,
3120 (unsigned long long)le32_to_cpu(ex->ee_block),
3121 ext4_ext_get_actual_len(ex));
3122
3056 err = ext4_ext_get_access(handle, inode, path + depth); 3123 err = ext4_ext_get_access(handle, inode, path + depth);
3057 if (err) 3124 if (err)
3058 goto out; 3125 goto out;
3059 /* first mark the extent as initialized */ 3126 /* first mark the extent as initialized */
3060 ext4_ext_mark_initialized(ex); 3127 ext4_ext_mark_initialized(ex);
3061 3128
3062 /* 3129 /* note: ext4_ext_correct_indexes() isn't needed here because
3063 * We have to see if it can be merged with the extent 3130 * borders are not changed
3064 * on the left.
3065 */
3066 if (ex > EXT_FIRST_EXTENT(eh)) {
3067 /*
3068 * To merge left, pass "ex - 1" to try_to_merge(),
3069 * since it merges towards right _only_.
3070 */
3071 ret = ext4_ext_try_to_merge(inode, path, ex - 1);
3072 if (ret) {
3073 err = ext4_ext_correct_indexes(handle, inode, path);
3074 if (err)
3075 goto out;
3076 depth = ext_depth(inode);
3077 ex--;
3078 }
3079 }
3080 /*
3081 * Try to Merge towards right.
3082 */ 3131 */
3083 ret = ext4_ext_try_to_merge(inode, path, ex); 3132 ext4_ext_try_to_merge(inode, path, ex);
3084 if (ret) { 3133
3085 err = ext4_ext_correct_indexes(handle, inode, path);
3086 if (err)
3087 goto out;
3088 depth = ext_depth(inode);
3089 }
3090 /* Mark modified extent as dirty */ 3134 /* Mark modified extent as dirty */
3091 err = ext4_ext_dirty(handle, inode, path + depth); 3135 err = ext4_ext_dirty(handle, inode, path + depth);
3092out: 3136out:
@@ -3302,15 +3346,19 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3302 ext4_fsblk_t newblock = 0; 3346 ext4_fsblk_t newblock = 0;
3303 int err = 0, depth, ret; 3347 int err = 0, depth, ret;
3304 unsigned int allocated = 0; 3348 unsigned int allocated = 0;
3349 unsigned int punched_out = 0;
3350 unsigned int result = 0;
3305 struct ext4_allocation_request ar; 3351 struct ext4_allocation_request ar;
3306 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3352 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3353 struct ext4_map_blocks punch_map;
3307 3354
3308 ext_debug("blocks %u/%u requested for inode %lu\n", 3355 ext_debug("blocks %u/%u requested for inode %lu\n",
3309 map->m_lblk, map->m_len, inode->i_ino); 3356 map->m_lblk, map->m_len, inode->i_ino);
3310 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 3357 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3311 3358
3312 /* check in cache */ 3359 /* check in cache */
3313 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3360 if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
3361 ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
3314 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3362 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3315 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3363 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3316 /* 3364 /*
@@ -3375,16 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3375 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, 3423 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3376 ee_block, ee_len, newblock); 3424 ee_block, ee_len, newblock);
3377 3425
3378 /* Do not put uninitialized extent in the cache */ 3426 if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
3379 if (!ext4_ext_is_uninitialized(ex)) { 3427 /*
3380 ext4_ext_put_in_cache(inode, ee_block, 3428 * Do not put uninitialized extent
3381 ee_len, ee_start); 3429 * in the cache
3382 goto out; 3430 */
3431 if (!ext4_ext_is_uninitialized(ex)) {
3432 ext4_ext_put_in_cache(inode, ee_block,
3433 ee_len, ee_start);
3434 goto out;
3435 }
3436 ret = ext4_ext_handle_uninitialized_extents(
3437 handle, inode, map, path, flags,
3438 allocated, newblock);
3439 return ret;
3383 } 3440 }
3384 ret = ext4_ext_handle_uninitialized_extents(handle, 3441
3385 inode, map, path, flags, allocated, 3442 /*
3386 newblock); 3443 * Punch out the map length, but only to the
3387 return ret; 3444 * end of the extent
3445 */
3446 punched_out = allocated < map->m_len ?
3447 allocated : map->m_len;
3448
3449 /*
3450 * Since extents need to be converted to
3451 * uninitialized, they must fit in an
3452 * uninitialized extent
3453 */
3454 if (punched_out > EXT_UNINIT_MAX_LEN)
3455 punched_out = EXT_UNINIT_MAX_LEN;
3456
3457 punch_map.m_lblk = map->m_lblk;
3458 punch_map.m_pblk = newblock;
3459 punch_map.m_len = punched_out;
3460 punch_map.m_flags = 0;
3461
3462 /* Check to see if the extent needs to be split */
3463 if (punch_map.m_len != ee_len ||
3464 punch_map.m_lblk != ee_block) {
3465
3466 ret = ext4_split_extent(handle, inode,
3467 path, &punch_map, 0,
3468 EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
3469 EXT4_GET_BLOCKS_PRE_IO);
3470
3471 if (ret < 0) {
3472 err = ret;
3473 goto out2;
3474 }
3475 /*
3476 * find extent for the block at
3477 * the start of the hole
3478 */
3479 ext4_ext_drop_refs(path);
3480 kfree(path);
3481
3482 path = ext4_ext_find_extent(inode,
3483 map->m_lblk, NULL);
3484 if (IS_ERR(path)) {
3485 err = PTR_ERR(path);
3486 path = NULL;
3487 goto out2;
3488 }
3489
3490 depth = ext_depth(inode);
3491 ex = path[depth].p_ext;
3492 ee_len = ext4_ext_get_actual_len(ex);
3493 ee_block = le32_to_cpu(ex->ee_block);
3494 ee_start = ext4_ext_pblock(ex);
3495
3496 }
3497
3498 ext4_ext_mark_uninitialized(ex);
3499
3500 err = ext4_ext_remove_space(inode, map->m_lblk,
3501 map->m_lblk + punched_out);
3502
3503 goto out2;
3388 } 3504 }
3389 } 3505 }
3390 3506
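The new punch-out branch in ext4_ext_map_blocks() clamps the range it removes to the extent it actually found and, because that range is first converted to an uninitialized extent, to EXT_UNINIT_MAX_LEN as well. The clamp in isolation (the numeric limit here is an assumption, not quoted from the headers):

	#include <stdio.h>

	#define EXT_UNINIT_MAX_LEN 32767u	/* assumed max uninitialized extent length */

	/* Blocks that one ext4_ext_map_blocks() punch-out pass may remove (sketch). */
	static unsigned punch_len(unsigned allocated, unsigned req_len)
	{
		unsigned punched = allocated < req_len ? allocated : req_len;

		/* the punched range is marked uninitialized first, so it must fit */
		if (punched > EXT_UNINIT_MAX_LEN)
			punched = EXT_UNINIT_MAX_LEN;
		return punched;
	}

	int main(void)
	{
		printf("%u\n", punch_len(40000, 65536));	/* clamped to 32767 */
		return 0;
	}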
@@ -3446,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3446 else 3562 else
3447 /* disable in-core preallocation for non-regular files */ 3563 /* disable in-core preallocation for non-regular files */
3448 ar.flags = 0; 3564 ar.flags = 0;
3565 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
3566 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
3449 newblock = ext4_mb_new_blocks(handle, &ar, &err); 3567 newblock = ext4_mb_new_blocks(handle, &ar, &err);
3450 if (!newblock) 3568 if (!newblock)
3451 goto out2; 3569 goto out2;
@@ -3529,7 +3647,11 @@ out2:
3529 } 3647 }
3530 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk, 3648 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3531 newblock, map->m_len, err ? err : allocated); 3649 newblock, map->m_len, err ? err : allocated);
3532 return err ? err : allocated; 3650
3651 result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
3652 punched_out : allocated;
3653
3654 return err ? err : result;
3533} 3655}
3534 3656
3535void ext4_ext_truncate(struct inode *inode) 3657void ext4_ext_truncate(struct inode *inode)
@@ -3577,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)
3577 3699
3578 last_block = (inode->i_size + sb->s_blocksize - 1) 3700 last_block = (inode->i_size + sb->s_blocksize - 1)
3579 >> EXT4_BLOCK_SIZE_BITS(sb); 3701 >> EXT4_BLOCK_SIZE_BITS(sb);
3580 err = ext4_ext_remove_space(inode, last_block); 3702 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK);
3581 3703
3582 /* In a multi-transaction truncate, we only make the final 3704 /* In a multi-transaction truncate, we only make the final
3583 * transaction synchronous. 3705 * transaction synchronous.
@@ -3585,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode)
3585 if (IS_SYNC(inode)) 3707 if (IS_SYNC(inode))
3586 ext4_handle_sync(handle); 3708 ext4_handle_sync(handle);
3587 3709
3588out_stop:
3589 up_write(&EXT4_I(inode)->i_data_sem); 3710 up_write(&EXT4_I(inode)->i_data_sem);
3711
3712out_stop:
3590 /* 3713 /*
3591 * If this was a simple ftruncate() and the file will remain alive, 3714 * If this was a simple ftruncate() and the file will remain alive,
3592 * then we need to clear up the orphan record which we created above. 3715 * then we need to clear up the orphan record which we created above.
@@ -3651,10 +3774,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3651 struct ext4_map_blocks map; 3774 struct ext4_map_blocks map;
3652 unsigned int credits, blkbits = inode->i_blkbits; 3775 unsigned int credits, blkbits = inode->i_blkbits;
3653 3776
3654 /* We only support the FALLOC_FL_KEEP_SIZE mode */
3655 if (mode & ~FALLOC_FL_KEEP_SIZE)
3656 return -EOPNOTSUPP;
3657
3658 /* 3777 /*
3659 * currently supporting (pre)allocate mode for extent-based 3778 * currently supporting (pre)allocate mode for extent-based
3660 * files _only_ 3779 * files _only_
@@ -3662,6 +3781,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3662 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3781 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3663 return -EOPNOTSUPP; 3782 return -EOPNOTSUPP;
3664 3783
3784 /* Return error if mode is not supported */
3785 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
3786 return -EOPNOTSUPP;
3787
3788 if (mode & FALLOC_FL_PUNCH_HOLE)
3789 return ext4_punch_hole(file, offset, len);
3790
3665 trace_ext4_fallocate_enter(inode, offset, len, mode); 3791 trace_ext4_fallocate_enter(inode, offset, len, mode);
3666 map.m_lblk = offset >> blkbits; 3792 map.m_lblk = offset >> blkbits;
3667 /* 3793 /*
@@ -3691,7 +3817,8 @@ retry:
3691 break; 3817 break;
3692 } 3818 }
3693 ret = ext4_map_blocks(handle, inode, &map, 3819 ret = ext4_map_blocks(handle, inode, &map,
3694 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3820 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
3821 EXT4_GET_BLOCKS_NO_NORMALIZE);
3695 if (ret <= 0) { 3822 if (ret <= 0) {
3696#ifdef EXT4FS_DEBUG 3823#ifdef EXT4FS_DEBUG
3697 WARN_ON(ret <= 0); 3824 WARN_ON(ret <= 0);
@@ -3822,6 +3949,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3822 pgoff_t last_offset; 3949 pgoff_t last_offset;
3823 pgoff_t offset; 3950 pgoff_t offset;
3824 pgoff_t index; 3951 pgoff_t index;
3952 pgoff_t start_index = 0;
3825 struct page **pages = NULL; 3953 struct page **pages = NULL;
3826 struct buffer_head *bh = NULL; 3954 struct buffer_head *bh = NULL;
3827 struct buffer_head *head = NULL; 3955 struct buffer_head *head = NULL;
@@ -3848,39 +3976,57 @@ out:
3848 kfree(pages); 3976 kfree(pages);
3849 return EXT_CONTINUE; 3977 return EXT_CONTINUE;
3850 } 3978 }
3979 index = 0;
3851 3980
3981next_page:
3852 /* Try to find the 1st mapped buffer. */ 3982 /* Try to find the 1st mapped buffer. */
3853 end = ((__u64)pages[0]->index << PAGE_SHIFT) >> 3983 end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
3854 blksize_bits; 3984 blksize_bits;
3855 if (!page_has_buffers(pages[0])) 3985 if (!page_has_buffers(pages[index]))
3856 goto out; 3986 goto out;
3857 head = page_buffers(pages[0]); 3987 head = page_buffers(pages[index]);
3858 if (!head) 3988 if (!head)
3859 goto out; 3989 goto out;
3860 3990
3991 index++;
3861 bh = head; 3992 bh = head;
3862 do { 3993 do {
3863 if (buffer_mapped(bh)) { 3994 if (end >= newex->ec_block +
3995 newex->ec_len)
3996 /* The buffer is out of
3997 * the request range.
3998 */
3999 goto out;
4000
4001 if (buffer_mapped(bh) &&
4002 end >= newex->ec_block) {
4003 start_index = index - 1;
3864 /* get the 1st mapped buffer. */ 4004 /* get the 1st mapped buffer. */
3865 if (end > newex->ec_block +
3866 newex->ec_len)
3867 /* The buffer is out of
3868 * the request range.
3869 */
3870 goto out;
3871 goto found_mapped_buffer; 4005 goto found_mapped_buffer;
3872 } 4006 }
4007
3873 bh = bh->b_this_page; 4008 bh = bh->b_this_page;
3874 end++; 4009 end++;
3875 } while (bh != head); 4010 } while (bh != head);
3876 4011
3877 /* No mapped buffer found. */ 4012 /* No mapped buffer in the range found in this page,
3878 goto out; 4013 * We need to look up next page.
4014 */
4015 if (index >= ret) {
4016 /* There is no page left, but we need to limit
4017 * newex->ec_len.
4018 */
4019 newex->ec_len = end - newex->ec_block;
4020 goto out;
4021 }
4022 goto next_page;
3879 } else { 4023 } else {
3880 /*Find contiguous delayed buffers. */ 4024 /*Find contiguous delayed buffers. */
3881 if (ret > 0 && pages[0]->index == last_offset) 4025 if (ret > 0 && pages[0]->index == last_offset)
3882 head = page_buffers(pages[0]); 4026 head = page_buffers(pages[0]);
3883 bh = head; 4027 bh = head;
4028 index = 1;
4029 start_index = 0;
3884 } 4030 }
3885 4031
3886found_mapped_buffer: 4032found_mapped_buffer:
@@ -3903,7 +4049,7 @@ found_mapped_buffer:
3903 end++; 4049 end++;
3904 } while (bh != head); 4050 } while (bh != head);
3905 4051
3906 for (index = 1; index < ret; index++) { 4052 for (; index < ret; index++) {
3907 if (!page_has_buffers(pages[index])) { 4053 if (!page_has_buffers(pages[index])) {
3908 bh = NULL; 4054 bh = NULL;
3909 break; 4055 break;
@@ -3913,8 +4059,10 @@ found_mapped_buffer:
3913 bh = NULL; 4059 bh = NULL;
3914 break; 4060 break;
3915 } 4061 }
4062
3916 if (pages[index]->index != 4063 if (pages[index]->index !=
3917 pages[0]->index + index) { 4064 pages[start_index]->index + index
4065 - start_index) {
3918 /* Blocks are not contiguous. */ 4066 /* Blocks are not contiguous. */
3919 bh = NULL; 4067 bh = NULL;
3920 break; 4068 break;
@@ -4006,6 +4154,177 @@ static int ext4_xattr_fiemap(struct inode *inode,
4006 return (error < 0 ? error : 0); 4154 return (error < 0 ? error : 0);
4007} 4155}
4008 4156
4157/*
4158 * ext4_ext_punch_hole
4159 *
4160 * Punches a hole of "length" bytes in a file starting
4161 * at byte "offset"
4162 *
4163 * @inode: The inode of the file to punch a hole in
4164 * @offset: The starting byte offset of the hole
4165 * @length: The length of the hole
4166 *
4167 * Returns the number of blocks removed or negative on err
4168 */
4169int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
4170{
4171 struct inode *inode = file->f_path.dentry->d_inode;
4172 struct super_block *sb = inode->i_sb;
4173 struct ext4_ext_cache cache_ex;
4174 ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
4175 struct address_space *mapping = inode->i_mapping;
4176 struct ext4_map_blocks map;
4177 handle_t *handle;
4178 loff_t first_block_offset, last_block_offset, block_len;
4179 loff_t first_page, last_page, first_page_offset, last_page_offset;
4180 int ret, credits, blocks_released, err = 0;
4181
4182 first_block = (offset + sb->s_blocksize - 1) >>
4183 EXT4_BLOCK_SIZE_BITS(sb);
4184 last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
4185
4186 first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
4187 last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
4188
4189 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
4190 last_page = (offset + length) >> PAGE_CACHE_SHIFT;
4191
4192 first_page_offset = first_page << PAGE_CACHE_SHIFT;
4193 last_page_offset = last_page << PAGE_CACHE_SHIFT;
4194
4195 /*
4196 * Write out all dirty pages to avoid race conditions
4197 * Then release them.
4198 */
4199 if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
4200 err = filemap_write_and_wait_range(mapping,
4201 first_page_offset == 0 ? 0 : first_page_offset-1,
4202 last_page_offset);
4203
4204 if (err)
4205 return err;
4206 }
4207
4208 /* Now release the pages */
4209 if (last_page_offset > first_page_offset) {
4210 truncate_inode_pages_range(mapping, first_page_offset,
4211 last_page_offset-1);
4212 }
4213
4214 /* finish any pending end_io work */
4215 ext4_flush_completed_IO(inode);
4216
4217 credits = ext4_writepage_trans_blocks(inode);
4218 handle = ext4_journal_start(inode, credits);
4219 if (IS_ERR(handle))
4220 return PTR_ERR(handle);
4221
4222 err = ext4_orphan_add(handle, inode);
4223 if (err)
4224 goto out;
4225
4226 /*
4227 * Now we need to zero out the non-block-aligned data.
4228 * If the file is smaller than a block, just
4229 * zero out the middle
4230 */
4231 if (first_block > last_block)
4232 ext4_block_zero_page_range(handle, mapping, offset, length);
4233 else {
4234 /* zero out the head of the hole before the first block */
4235 block_len = first_block_offset - offset;
4236 if (block_len > 0)
4237 ext4_block_zero_page_range(handle, mapping,
4238 offset, block_len);
4239
4240 /* zero out the tail of the hole after the last block */
4241 block_len = offset + length - last_block_offset;
4242 if (block_len > 0) {
4243 ext4_block_zero_page_range(handle, mapping,
4244 last_block_offset, block_len);
4245 }
4246 }
4247
4248 /* If there are no blocks to remove, return now */
4249 if (first_block >= last_block)
4250 goto out;
4251
4252 down_write(&EXT4_I(inode)->i_data_sem);
4253 ext4_ext_invalidate_cache(inode);
4254 ext4_discard_preallocations(inode);
4255
4256 /*
4257 * Loop over all the blocks and identify blocks
4258 * that need to be punched out
4259 */
4260 iblock = first_block;
4261 blocks_released = 0;
4262 while (iblock < last_block) {
4263 max_blocks = last_block - iblock;
4264 num_blocks = 1;
4265 memset(&map, 0, sizeof(map));
4266 map.m_lblk = iblock;
4267 map.m_len = max_blocks;
4268 ret = ext4_ext_map_blocks(handle, inode, &map,
4269 EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
4270
4271 if (ret > 0) {
4272 blocks_released += ret;
4273 num_blocks = ret;
4274 } else if (ret == 0) {
4275 /*
4276 * If map blocks could not find the block,
4277 * then it is in a hole. If the hole was
4278 * not already cached, then map blocks should
4279 * put it in the cache. So we can get the hole
4280 * out of the cache
4281 */
4282 memset(&cache_ex, 0, sizeof(cache_ex));
4283 if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
4284 !cache_ex.ec_start) {
4285
4286 /* The hole is cached */
4287 num_blocks = cache_ex.ec_block +
4288 cache_ex.ec_len - iblock;
4289
4290 } else {
4291 /* The block could not be identified */
4292 err = -EIO;
4293 break;
4294 }
4295 } else {
4296 /* Map blocks error */
4297 err = ret;
4298 break;
4299 }
4300
4301 if (num_blocks == 0) {
4302 /* This condition should never happen */
4303 ext_debug("Block lookup failed");
4304 err = -EIO;
4305 break;
4306 }
4307
4308 iblock += num_blocks;
4309 }
4310
4311 if (blocks_released > 0) {
4312 ext4_ext_invalidate_cache(inode);
4313 ext4_discard_preallocations(inode);
4314 }
4315
4316 if (IS_SYNC(inode))
4317 ext4_handle_sync(handle);
4318
4319 up_write(&EXT4_I(inode)->i_data_sem);
4320
4321out:
4322 ext4_orphan_del(handle, inode);
4323 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4324 ext4_mark_inode_dirty(handle, inode);
4325 ext4_journal_stop(handle);
4326 return err;
4327}
4009int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4328int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4010 __u64 start, __u64 len) 4329 __u64 start, __u64 len)
4011{ 4330{
@@ -4042,4 +4361,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4042 4361
4043 return error; 4362 return error;
4044} 4363}
4045
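ext4_ext_punch_hole() above rounds the start of the hole up and its end down to block boundaries, zeroes the unaligned head and tail through the page cache, and only removes the whole blocks in between (a hole that fits inside a single block is just zeroed in place). A userspace sketch of that boundary arithmetic, assuming 4 KiB blocks:

	#include <stdio.h>

	#define BLOCK_BITS 12			/* assumed 4 KiB filesystem blocks */
	#define BLOCK_SIZE (1ULL << BLOCK_BITS)

	struct hole {
		unsigned long long first_block;	/* first whole block inside the hole */
		unsigned long long last_block;	/* first block past the punched range */
		unsigned long long head_len;	/* unaligned bytes to zero before it */
		unsigned long long tail_len;	/* unaligned bytes to zero after it */
	};

	/* Assumes the hole spans at least one full block; the one-block case
	 * is handled separately by zeroing the middle of that block. */
	static struct hole hole_layout(unsigned long long offset,
				       unsigned long long length)
	{
		struct hole h;

		h.first_block = (offset + BLOCK_SIZE - 1) >> BLOCK_BITS;  /* round up */
		h.last_block = (offset + length) >> BLOCK_BITS;            /* round down */
		h.head_len = (h.first_block << BLOCK_BITS) - offset;
		h.tail_len = offset + length - (h.last_block << BLOCK_BITS);
		return h;
	}

	int main(void)
	{
		struct hole h = hole_layout(5000, 20000);

		printf("blocks %llu..%llu, head %llu bytes, tail %llu bytes\n",
		       h.first_block, h.last_block, h.head_len, h.tail_len);
		return 0;
	}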
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7b80d543b89e..2c0972322009 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -272,7 +272,6 @@ const struct file_operations ext4_file_operations = {
272}; 272};
273 273
274const struct inode_operations ext4_file_inode_operations = { 274const struct inode_operations ext4_file_inode_operations = {
275 .truncate = ext4_truncate,
276 .setattr = ext4_setattr, 275 .setattr = ext4_setattr,
277 .getattr = ext4_getattr, 276 .getattr = ext4_getattr,
278#ifdef CONFIG_EXT4_FS_XATTR 277#ifdef CONFIG_EXT4_FS_XATTR
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e9473cbe80df..ce66d2fe826c 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -36,7 +36,7 @@
36 36
37static void dump_completed_IO(struct inode * inode) 37static void dump_completed_IO(struct inode * inode)
38{ 38{
39#ifdef EXT4_DEBUG 39#ifdef EXT4FS_DEBUG
40 struct list_head *cur, *before, *after; 40 struct list_head *cur, *before, *after;
41 ext4_io_end_t *io, *io0, *io1; 41 ext4_io_end_t *io, *io0, *io1;
42 unsigned long flags; 42 unsigned long flags;
@@ -172,6 +172,7 @@ int ext4_sync_file(struct file *file, int datasync)
172 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 172 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
173 int ret; 173 int ret;
174 tid_t commit_tid; 174 tid_t commit_tid;
175 bool needs_barrier = false;
175 176
176 J_ASSERT(ext4_journal_current_handle() == NULL); 177 J_ASSERT(ext4_journal_current_handle() == NULL);
177 178
@@ -211,22 +212,12 @@ int ext4_sync_file(struct file *file, int datasync)
211 } 212 }
212 213
213 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; 214 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
214 if (jbd2_log_start_commit(journal, commit_tid)) { 215 if (journal->j_flags & JBD2_BARRIER &&
215 /* 216 !jbd2_trans_will_send_data_barrier(journal, commit_tid))
216 * When the journal is on a different device than the 217 needs_barrier = true;
217 * fs data disk, we need to issue the barrier in 218 jbd2_log_start_commit(journal, commit_tid);
218 * writeback mode. (In ordered mode, the jbd2 layer 219 ret = jbd2_log_wait_commit(journal, commit_tid);
219 * will take care of issuing the barrier. In 220 if (needs_barrier)
220 * data=journal, all of the data blocks are written to
221 * the journal device.)
222 */
223 if (ext4_should_writeback_data(inode) &&
224 (journal->j_fs_dev != journal->j_dev) &&
225 (journal->j_flags & JBD2_BARRIER))
226 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
227 NULL);
228 ret = jbd2_log_wait_commit(journal, commit_tid);
229 } else if (journal->j_flags & JBD2_BARRIER)
230 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 221 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
231 out: 222 out:
232 trace_ext4_sync_file_exit(inode, ret); 223 trace_ext4_sync_file_exit(inode, ret);
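The reworked ext4_sync_file() no longer issues a cache flush unconditionally: it only sends one when the journal advertises barriers and the commit it is about to wait on will not already flush the data device. A sketch of that decision with jbd2's helper stubbed out (the stub behaviour and the flag value are assumptions):

	#include <stdbool.h>
	#include <stdio.h>

	#define JBD2_BARRIER 0x1		/* illustrative flag bit */

	struct journal { unsigned flags; };

	/* Stand-in for jbd2_trans_will_send_data_barrier(): reports whether
	 * committing `tid` will itself flush the data device. */
	static bool trans_will_send_data_barrier(struct journal *j, unsigned tid)
	{
		(void)j; (void)tid;
		return false;			/* pessimistic stand-in */
	}

	static bool fsync_needs_flush(struct journal *j, unsigned commit_tid)
	{
		return (j->flags & JBD2_BARRIER) &&
		       !trans_will_send_data_barrier(j, commit_tid);
	}

	int main(void)
	{
		struct journal j = { .flags = JBD2_BARRIER };

		printf("flush needed: %d\n", fsync_needs_flush(&j, 42));
		return 0;
	}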
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f2fa5e8a582c..50d0e9c64584 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
639 while (target > 0) { 639 while (target > 0) {
640 count = target; 640 count = target;
641 /* allocating blocks for indirect blocks and direct blocks */ 641 /* allocating blocks for indirect blocks and direct blocks */
642 current_block = ext4_new_meta_blocks(handle, inode, 642 current_block = ext4_new_meta_blocks(handle, inode, goal,
643 goal, &count, err); 643 0, &count, err);
644 if (*err) 644 if (*err)
645 goto failed_out; 645 goto failed_out;
646 646
@@ -1930,7 +1930,7 @@ repeat:
1930 * We do still charge estimated metadata to the sb though; 1930 * We do still charge estimated metadata to the sb though;
1931 * we cannot afford to run out of free blocks. 1931 * we cannot afford to run out of free blocks.
1932 */ 1932 */
1933 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1933 if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
1934 dquot_release_reservation_block(inode, 1); 1934 dquot_release_reservation_block(inode, 1);
1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1935 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1936 yield(); 1936 yield();
@@ -2796,9 +2796,7 @@ static int write_cache_pages_da(struct address_space *mapping,
2796 continue; 2796 continue;
2797 } 2797 }
2798 2798
2799 if (PageWriteback(page)) 2799 wait_on_page_writeback(page);
2800 wait_on_page_writeback(page);
2801
2802 BUG_ON(PageWriteback(page)); 2800 BUG_ON(PageWriteback(page));
2803 2801
2804 if (mpd->next_page != page->index) 2802 if (mpd->next_page != page->index)
@@ -3513,7 +3511,7 @@ retry:
3513 loff_t end = offset + iov_length(iov, nr_segs); 3511 loff_t end = offset + iov_length(iov, nr_segs);
3514 3512
3515 if (end > isize) 3513 if (end > isize)
3516 vmtruncate(inode, isize); 3514 ext4_truncate_failed_write(inode);
3517 } 3515 }
3518 } 3516 }
3519 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3517 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3916,9 +3914,30 @@ void ext4_set_aops(struct inode *inode)
3916int ext4_block_truncate_page(handle_t *handle, 3914int ext4_block_truncate_page(handle_t *handle,
3917 struct address_space *mapping, loff_t from) 3915 struct address_space *mapping, loff_t from)
3918{ 3916{
3917 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3918 unsigned length;
3919 unsigned blocksize;
3920 struct inode *inode = mapping->host;
3921
3922 blocksize = inode->i_sb->s_blocksize;
3923 length = blocksize - (offset & (blocksize - 1));
3924
3925 return ext4_block_zero_page_range(handle, mapping, from, length);
3926}
3927
3928/*
3929 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3930 * starting from file offset 'from'. The range to be zero'd must
3931 * starting from file offset 'from'. The range to be zeroed must
3932 * be contained within one block. If the specified range exceeds
3933 * the end of the block, it will be shortened to the end of the block
3934 * that corresponds to 'from'
3935int ext4_block_zero_page_range(handle_t *handle,
3936 struct address_space *mapping, loff_t from, loff_t length)
3937{
3919 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3938 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3920 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3939 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3921 unsigned blocksize, length, pos; 3940 unsigned blocksize, max, pos;
3922 ext4_lblk_t iblock; 3941 ext4_lblk_t iblock;
3923 struct inode *inode = mapping->host; 3942 struct inode *inode = mapping->host;
3924 struct buffer_head *bh; 3943 struct buffer_head *bh;
@@ -3931,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
3931 return -EINVAL; 3950 return -EINVAL;
3932 3951
3933 blocksize = inode->i_sb->s_blocksize; 3952 blocksize = inode->i_sb->s_blocksize;
3934 length = blocksize - (offset & (blocksize - 1)); 3953 max = blocksize - (offset & (blocksize - 1));
3954
3955 /*
3956 * correct length if it does not fall between
3957 * 'from' and the end of the block
3958 */
3959 if (length > max || length < 0)
3960 length = max;
3961
3935 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3962 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3936 3963
3937 if (!page_has_buffers(page)) 3964 if (!page_has_buffers(page))
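With the hunks above, ext4_block_truncate_page() becomes a thin wrapper that zeroes from 'from' to the end of its block, and ext4_block_zero_page_range() clamps any longer or negative length back to that block boundary. The length handling on its own, as a small userspace sketch:

	#include <stdio.h>

	/* Clamp a zeroing request so it never crosses the end of the block that
	 * contains `from` (mirrors the range check the new helper applies). */
	static unsigned long clamp_zero_len(unsigned long from, long long length,
					    unsigned long blocksize)
	{
		unsigned long offset = from & (blocksize - 1);
		unsigned long max = blocksize - offset;	/* bytes left in this block */

		if (length > (long long)max || length < 0)
			return max;
		return (unsigned long)length;
	}

	/* The truncate-page case: zero everything from `from` to the block's end. */
	static unsigned long truncate_zero_len(unsigned long from,
					       unsigned long blocksize)
	{
		return clamp_zero_len(from, blocksize, blocksize);
	}

	int main(void)
	{
		printf("%lu\n", truncate_zero_len(5000, 4096));	/* 3192 bytes */
		return 0;
	}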
@@ -4380,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4380 4407
4381int ext4_can_truncate(struct inode *inode) 4408int ext4_can_truncate(struct inode *inode)
4382{ 4409{
4383 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4384 return 0;
4385 if (S_ISREG(inode->i_mode)) 4410 if (S_ISREG(inode->i_mode))
4386 return 1; 4411 return 1;
4387 if (S_ISDIR(inode->i_mode)) 4412 if (S_ISDIR(inode->i_mode))
@@ -4392,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode)
4392} 4417}
4393 4418
4394/* 4419/*
4420 * ext4_punch_hole: punches a hole in a file by releasing the blocks
4421 * associated with the given offset and length
4422 *
4423 * @inode: File inode
4424 * @offset: The offset where the hole will begin
4425 * @len: The length of the hole
4426 *
4427 * Returns: 0 on success or negative on failure
4428 */
4429
4430int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4431{
4432 struct inode *inode = file->f_path.dentry->d_inode;
4433 if (!S_ISREG(inode->i_mode))
4434 return -ENOTSUPP;
4435
4436 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4437 /* TODO: Add support for non extent hole punching */
4438 return -ENOTSUPP;
4439 }
4440
4441 return ext4_ext_punch_hole(file, offset, length);
4442}
4443
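ext4_punch_hole() is the switchboard that fallocate(2) now reaches through the FALLOC_FL_PUNCH_HOLE mode handled in extents.c above; it only accepts regular, extent-mapped files and refuses everything else. A hypothetical userspace caller exercising that path (file name and sizes are arbitrary; KEEP_SIZE is combined with PUNCH_HOLE here because punching does not change the file size, and on kernels or filesystems without support fallocate() simply fails with EOPNOTSUPP):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		if (argc != 2) {
			fprintf(stderr, "usage: %s <file>\n", argv[0]);
			return 1;
		}

		int fd = open(argv[1], O_RDWR);
		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* punch a 1 MiB hole at offset 4096, leaving the file size alone */
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      4096, 1 << 20) < 0)
			perror("fallocate");

		close(fd);
		return 0;
	}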
4444/*
4395 * ext4_truncate() 4445 * ext4_truncate()
4396 * 4446 *
4397 * We block out ext4_get_block() block instantiations across the entire 4447 * We block out ext4_get_block() block instantiations across the entire
@@ -4617,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
4617 /* 4667 /*
4618 * Figure out the offset within the block group inode table 4668 * Figure out the offset within the block group inode table
4619 */ 4669 */
4620 inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); 4670 inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4621 inode_offset = ((inode->i_ino - 1) % 4671 inode_offset = ((inode->i_ino - 1) %
4622 EXT4_INODES_PER_GROUP(sb)); 4672 EXT4_INODES_PER_GROUP(sb));
4623 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); 4673 block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
@@ -5311,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5311 5361
5312 if (S_ISREG(inode->i_mode) && 5362 if (S_ISREG(inode->i_mode) &&
5313 attr->ia_valid & ATTR_SIZE && 5363 attr->ia_valid & ATTR_SIZE &&
5314 (attr->ia_size < inode->i_size || 5364 (attr->ia_size < inode->i_size)) {
5315 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5316 handle_t *handle; 5365 handle_t *handle;
5317 5366
5318 handle = ext4_journal_start(inode, 3); 5367 handle = ext4_journal_start(inode, 3);
@@ -5346,14 +5395,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5346 goto err_out; 5395 goto err_out;
5347 } 5396 }
5348 } 5397 }
5349 /* ext4_truncate will clear the flag */
5350 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5351 ext4_truncate(inode);
5352 } 5398 }
5353 5399
5354 if ((attr->ia_valid & ATTR_SIZE) && 5400 if (attr->ia_valid & ATTR_SIZE) {
5355 attr->ia_size != i_size_read(inode)) 5401 if (attr->ia_size != i_size_read(inode)) {
5356 rc = vmtruncate(inode, attr->ia_size); 5402 truncate_setsize(inode, attr->ia_size);
5403 ext4_truncate(inode);
5404 } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
5405 ext4_truncate(inode);
5406 }
5357 5407
5358 if (!rc) { 5408 if (!rc) {
5359 setattr_copy(inode, attr); 5409 setattr_copy(inode, attr);
@@ -5811,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5811 goto out_unlock; 5861 goto out_unlock;
5812 } 5862 }
5813 ret = 0; 5863 ret = 0;
5814 if (PageMappedToDisk(page)) 5864
5815 goto out_unlock; 5865 lock_page(page);
5866 wait_on_page_writeback(page);
5867 if (PageMappedToDisk(page)) {
5868 up_read(&inode->i_alloc_sem);
5869 return VM_FAULT_LOCKED;
5870 }
5816 5871
5817 if (page->index == size >> PAGE_CACHE_SHIFT) 5872 if (page->index == size >> PAGE_CACHE_SHIFT)
5818 len = size & ~PAGE_CACHE_MASK; 5873 len = size & ~PAGE_CACHE_MASK;
5819 else 5874 else
5820 len = PAGE_CACHE_SIZE; 5875 len = PAGE_CACHE_SIZE;
5821 5876
5822 lock_page(page);
5823 /* 5877 /*
5824 * return if we have all the buffers mapped. This avoid 5878 * return if we have all the buffers mapped. This avoid
5825 * the need to call write_begin/write_end which does a 5879 * the need to call write_begin/write_end which does a
@@ -5829,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5829 if (page_has_buffers(page)) { 5883 if (page_has_buffers(page)) {
5830 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5884 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5831 ext4_bh_unmapped)) { 5885 ext4_bh_unmapped)) {
5832 unlock_page(page); 5886 up_read(&inode->i_alloc_sem);
5833 goto out_unlock; 5887 return VM_FAULT_LOCKED;
5834 } 5888 }
5835 } 5889 }
5836 unlock_page(page); 5890 unlock_page(page);
@@ -5850,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5850 if (ret < 0) 5904 if (ret < 0)
5851 goto out_unlock; 5905 goto out_unlock;
5852 ret = 0; 5906 ret = 0;
5907
5908 /*
5909 * write_begin/end might have created a dirty page and someone
5910 * could wander in and start the IO. Make sure that hasn't
5911 * happened.
5912 */
5913 lock_page(page);
5914 wait_on_page_writeback(page);
5915 up_read(&inode->i_alloc_sem);
5916 return VM_FAULT_LOCKED;
5853out_unlock: 5917out_unlock:
5854 if (ret) 5918 if (ret)
5855 ret = VM_FAULT_SIGBUS; 5919 ret = VM_FAULT_SIGBUS;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d8a16eecf1d5..859f2ae8864e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -787,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
787 struct inode *inode; 787 struct inode *inode;
788 char *data; 788 char *data;
789 char *bitmap; 789 char *bitmap;
790 struct ext4_group_info *grinfo;
790 791
791 mb_debug(1, "init page %lu\n", page->index); 792 mb_debug(1, "init page %lu\n", page->index);
792 793
@@ -819,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
819 if (first_group + i >= ngroups) 820 if (first_group + i >= ngroups)
820 break; 821 break;
821 822
823 grinfo = ext4_get_group_info(sb, first_group + i);
824 /*
825 * If page is uptodate then we came here after online resize
826 * which added some new uninitialized group info structs, so
827 * we must skip all initialized uptodate buddies on the page,
828 * which may be currently in use by an allocating task.
829 */
830 if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
831 bh[i] = NULL;
832 continue;
833 }
834
822 err = -EIO; 835 err = -EIO;
823 desc = ext4_get_group_desc(sb, first_group + i, NULL); 836 desc = ext4_get_group_desc(sb, first_group + i, NULL);
824 if (desc == NULL) 837 if (desc == NULL)
@@ -871,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
871 } 884 }
872 885
873 /* wait for I/O completion */ 886 /* wait for I/O completion */
874 for (i = 0; i < groups_per_page && bh[i]; i++) 887 for (i = 0; i < groups_per_page; i++)
875 wait_on_buffer(bh[i]); 888 if (bh[i])
889 wait_on_buffer(bh[i]);
876 890
877 err = -EIO; 891 err = -EIO;
878 for (i = 0; i < groups_per_page && bh[i]; i++) 892 for (i = 0; i < groups_per_page; i++)
879 if (!buffer_uptodate(bh[i])) 893 if (bh[i] && !buffer_uptodate(bh[i]))
880 goto out; 894 goto out;
881 895
882 err = 0; 896 err = 0;
883 first_block = page->index * blocks_per_page; 897 first_block = page->index * blocks_per_page;
884 /* init the page */
885 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
886 for (i = 0; i < blocks_per_page; i++) { 898 for (i = 0; i < blocks_per_page; i++) {
887 int group; 899 int group;
888 struct ext4_group_info *grinfo;
889 900
890 group = (first_block + i) >> 1; 901 group = (first_block + i) >> 1;
891 if (group >= ngroups) 902 if (group >= ngroups)
892 break; 903 break;
893 904
905 if (!bh[group - first_group])
906 /* skip initialized uptodate buddy */
907 continue;
908
894 /* 909 /*
895 * data carry information regarding this 910 * data carry information regarding this
896 * particular group in the format specified 911 * particular group in the format specified
@@ -919,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
919 * incore got set to the group block bitmap below 934 * incore got set to the group block bitmap below
920 */ 935 */
921 ext4_lock_group(sb, group); 936 ext4_lock_group(sb, group);
937 /* init the buddy */
938 memset(data, 0xff, blocksize);
922 ext4_mb_generate_buddy(sb, data, incore, group); 939 ext4_mb_generate_buddy(sb, data, incore, group);
923 ext4_unlock_group(sb, group); 940 ext4_unlock_group(sb, group);
924 incore = NULL; 941 incore = NULL;
@@ -948,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
948 965
949out: 966out:
950 if (bh) { 967 if (bh) {
951 for (i = 0; i < groups_per_page && bh[i]; i++) 968 for (i = 0; i < groups_per_page; i++)
952 brelse(bh[i]); 969 brelse(bh[i]);
953 if (bh != &bhs) 970 if (bh != &bhs)
954 kfree(bh); 971 kfree(bh);
@@ -957,22 +974,21 @@ out:
957} 974}
958 975
959/* 976/*
960 * lock the group_info alloc_sem of all the groups 977 * Lock the buddy and bitmap pages. This makes sure other parallel init_group
961 * belonging to the same buddy cache page. This 978 * on the same buddy page doesn't happen while holding the buddy page lock.
962 * make sure other parallel operation on the buddy 979 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
963 * cache doesn't happen whild holding the buddy cache 980 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
964 * lock
965 */ 981 */
966static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, 982static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
967 ext4_group_t group) 983 ext4_group_t group, struct ext4_buddy *e4b)
968{ 984{
969 int i; 985 struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
970 int block, pnum; 986 int block, pnum, poff;
971 int blocks_per_page; 987 int blocks_per_page;
972 int groups_per_page; 988 struct page *page;
973 ext4_group_t ngroups = ext4_get_groups_count(sb); 989
974 ext4_group_t first_group; 990 e4b->bd_buddy_page = NULL;
975 struct ext4_group_info *grp; 991 e4b->bd_bitmap_page = NULL;
976 992
977 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 993 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
978 /* 994 /*
@@ -982,57 +998,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
982 */ 998 */
983 block = group * 2; 999 block = group * 2;
984 pnum = block / blocks_per_page; 1000 pnum = block / blocks_per_page;
985 first_group = pnum * blocks_per_page / 2; 1001 poff = block % blocks_per_page;
986 1002 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
987 groups_per_page = blocks_per_page >> 1; 1003 if (!page)
988 if (groups_per_page == 0) 1004 return -EIO;
989 groups_per_page = 1; 1005 BUG_ON(page->mapping != inode->i_mapping);
990 /* read all groups the page covers into the cache */ 1006 e4b->bd_bitmap_page = page;
991 for (i = 0; i < groups_per_page; i++) { 1007 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
992 1008
993 if ((first_group + i) >= ngroups) 1009 if (blocks_per_page >= 2) {
994 break; 1010 /* buddy and bitmap are on the same page */
995 grp = ext4_get_group_info(sb, first_group + i); 1011 return 0;
996 /* take all groups write allocation
997 * semaphore. This make sure there is
998 * no block allocation going on in any
999 * of that groups
1000 */
1001 down_write_nested(&grp->alloc_sem, i);
1002 } 1012 }
1003 return i; 1013
1014 block++;
1015 pnum = block / blocks_per_page;
1016 poff = block % blocks_per_page;
1017 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1018 if (!page)
1019 return -EIO;
1020 BUG_ON(page->mapping != inode->i_mapping);
1021 e4b->bd_buddy_page = page;
1022 return 0;
1004} 1023}
1005 1024
1006static void ext4_mb_put_buddy_cache_lock(struct super_block *sb, 1025static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1007 ext4_group_t group, int locked_group)
1008{ 1026{
1009 int i; 1027 if (e4b->bd_bitmap_page) {
1010 int block, pnum; 1028 unlock_page(e4b->bd_bitmap_page);
1011 int blocks_per_page; 1029 page_cache_release(e4b->bd_bitmap_page);
1012 ext4_group_t first_group; 1030 }
1013 struct ext4_group_info *grp; 1031 if (e4b->bd_buddy_page) {
1014 1032 unlock_page(e4b->bd_buddy_page);
1015 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1033 page_cache_release(e4b->bd_buddy_page);
1016 /*
1017 * the buddy cache inode stores the block bitmap
1018 * and buddy information in consecutive blocks.
1019 * So for each group we need two blocks.
1020 */
1021 block = group * 2;
1022 pnum = block / blocks_per_page;
1023 first_group = pnum * blocks_per_page / 2;
1024 /* release locks on all the groups */
1025 for (i = 0; i < locked_group; i++) {
1026
1027 grp = ext4_get_group_info(sb, first_group + i);
1028 /* take all groups write allocation
1029 * semaphore. This make sure there is
1030 * no block allocation going on in any
1031 * of that groups
1032 */
1033 up_write(&grp->alloc_sem);
1034 } 1034 }
1035
1036} 1035}
1037 1036
1038/* 1037/*
@@ -1044,93 +1043,60 @@ static noinline_for_stack
1044int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1043int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1045{ 1044{
1046 1045
1047 int ret = 0;
1048 void *bitmap;
1049 int blocks_per_page;
1050 int block, pnum, poff;
1051 int num_grp_locked = 0;
1052 struct ext4_group_info *this_grp; 1046 struct ext4_group_info *this_grp;
1053 struct ext4_sb_info *sbi = EXT4_SB(sb); 1047 struct ext4_buddy e4b;
1054 struct inode *inode = sbi->s_buddy_cache; 1048 struct page *page;
1055 struct page *page = NULL, *bitmap_page = NULL; 1049 int ret = 0;
1056 1050
1057 mb_debug(1, "init group %u\n", group); 1051 mb_debug(1, "init group %u\n", group);
1058 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1059 this_grp = ext4_get_group_info(sb, group); 1052 this_grp = ext4_get_group_info(sb, group);
1060 /* 1053 /*
1061 * This ensures that we don't reinit the buddy cache 1054 * This ensures that we don't reinit the buddy cache
1062 * page which map to the group from which we are already 1055 * page which map to the group from which we are already
1063 * allocating. If we are looking at the buddy cache we would 1056 * allocating. If we are looking at the buddy cache we would
1064 * have taken a reference using ext4_mb_load_buddy and that 1057 * have taken a reference using ext4_mb_load_buddy and that
1065 * would have taken the alloc_sem lock. 1058 * would have pinned buddy page to page cache.
1066 */ 1059 */
1067 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); 1060 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
1068 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { 1061 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1069 /* 1062 /*
1070 * somebody initialized the group 1063 * somebody initialized the group
1071 * return without doing anything 1064 * return without doing anything
1072 */ 1065 */
1073 ret = 0;
1074 goto err; 1066 goto err;
1075 } 1067 }
1076 /* 1068
1077 * the buddy cache inode stores the block bitmap 1069 page = e4b.bd_bitmap_page;
1078 * and buddy information in consecutive blocks. 1070 ret = ext4_mb_init_cache(page, NULL);
1079 * So for each group we need two blocks. 1071 if (ret)
1080 */ 1072 goto err;
1081 block = group * 2; 1073 if (!PageUptodate(page)) {
1082 pnum = block / blocks_per_page;
1083 poff = block % blocks_per_page;
1084 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1085 if (page) {
1086 BUG_ON(page->mapping != inode->i_mapping);
1087 ret = ext4_mb_init_cache(page, NULL);
1088 if (ret) {
1089 unlock_page(page);
1090 goto err;
1091 }
1092 unlock_page(page);
1093 }
1094 if (page == NULL || !PageUptodate(page)) {
1095 ret = -EIO; 1074 ret = -EIO;
1096 goto err; 1075 goto err;
1097 } 1076 }
1098 mark_page_accessed(page); 1077 mark_page_accessed(page);
1099 bitmap_page = page;
1100 bitmap = page_address(page) + (poff * sb->s_blocksize);
1101 1078
1102 /* init buddy cache */ 1079 if (e4b.bd_buddy_page == NULL) {
1103 block++;
1104 pnum = block / blocks_per_page;
1105 poff = block % blocks_per_page;
1106 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1107 if (page == bitmap_page) {
1108 /* 1080 /*
1109 * If both the bitmap and buddy are in 1081 * If both the bitmap and buddy are in
1110 * the same page we don't need to force 1082 * the same page we don't need to force
1111 * init the buddy 1083 * init the buddy
1112 */ 1084 */
1113 unlock_page(page); 1085 ret = 0;
1114 } else if (page) { 1086 goto err;
1115 BUG_ON(page->mapping != inode->i_mapping);
1116 ret = ext4_mb_init_cache(page, bitmap);
1117 if (ret) {
1118 unlock_page(page);
1119 goto err;
1120 }
1121 unlock_page(page);
1122 } 1087 }
1123 if (page == NULL || !PageUptodate(page)) { 1088 /* init buddy cache */
1089 page = e4b.bd_buddy_page;
1090 ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
1091 if (ret)
1092 goto err;
1093 if (!PageUptodate(page)) {
1124 ret = -EIO; 1094 ret = -EIO;
1125 goto err; 1095 goto err;
1126 } 1096 }
1127 mark_page_accessed(page); 1097 mark_page_accessed(page);
1128err: 1098err:
1129 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); 1099 ext4_mb_put_buddy_page_lock(&e4b);
1130 if (bitmap_page)
1131 page_cache_release(bitmap_page);
1132 if (page)
1133 page_cache_release(page);
1134 return ret; 1100 return ret;
1135} 1101}
1136 1102
@@ -1164,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1164 e4b->bd_group = group; 1130 e4b->bd_group = group;
1165 e4b->bd_buddy_page = NULL; 1131 e4b->bd_buddy_page = NULL;
1166 e4b->bd_bitmap_page = NULL; 1132 e4b->bd_bitmap_page = NULL;
1167 e4b->alloc_semp = &grp->alloc_sem;
1168
1169 /* Take the read lock on the group alloc
1170 * sem. This would make sure a parallel
1171 * ext4_mb_init_group happening on other
1172 * groups mapped by the page is blocked
1173 * till we are done with allocation
1174 */
1175repeat_load_buddy:
1176 down_read(e4b->alloc_semp);
1177 1133
1178 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1134 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1179 /* we need to check for group need init flag
1180 * with alloc_semp held so that we can be sure
1181 * that new blocks didn't get added to the group
1182 * when we are loading the buddy cache
1183 */
1184 up_read(e4b->alloc_semp);
1185 /* 1135 /*
1186 * we need full data about the group 1136 * we need full data about the group
1187 * to make a good selection 1137 * to make a good selection
@@ -1189,7 +1139,6 @@ repeat_load_buddy:
1189 ret = ext4_mb_init_group(sb, group); 1139 ret = ext4_mb_init_group(sb, group);
1190 if (ret) 1140 if (ret)
1191 return ret; 1141 return ret;
1192 goto repeat_load_buddy;
1193 } 1142 }
1194 1143
1195 /* 1144 /*
@@ -1273,15 +1222,14 @@ repeat_load_buddy:
1273 return 0; 1222 return 0;
1274 1223
1275err: 1224err:
1225 if (page)
1226 page_cache_release(page);
1276 if (e4b->bd_bitmap_page) 1227 if (e4b->bd_bitmap_page)
1277 page_cache_release(e4b->bd_bitmap_page); 1228 page_cache_release(e4b->bd_bitmap_page);
1278 if (e4b->bd_buddy_page) 1229 if (e4b->bd_buddy_page)
1279 page_cache_release(e4b->bd_buddy_page); 1230 page_cache_release(e4b->bd_buddy_page);
1280 e4b->bd_buddy = NULL; 1231 e4b->bd_buddy = NULL;
1281 e4b->bd_bitmap = NULL; 1232 e4b->bd_bitmap = NULL;
1282
1283 /* Done with the buddy cache */
1284 up_read(e4b->alloc_semp);
1285 return ret; 1233 return ret;
1286} 1234}
1287 1235
@@ -1291,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1291 page_cache_release(e4b->bd_bitmap_page); 1239 page_cache_release(e4b->bd_bitmap_page);
1292 if (e4b->bd_buddy_page) 1240 if (e4b->bd_buddy_page)
1293 page_cache_release(e4b->bd_buddy_page); 1241 page_cache_release(e4b->bd_buddy_page);
1294 /* Done with the buddy cache */
1295 if (e4b->alloc_semp)
1296 up_read(e4b->alloc_semp);
1297} 1242}
1298 1243
1299 1244
@@ -1606,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1606 get_page(ac->ac_bitmap_page); 1551 get_page(ac->ac_bitmap_page);
1607 ac->ac_buddy_page = e4b->bd_buddy_page; 1552 ac->ac_buddy_page = e4b->bd_buddy_page;
1608 get_page(ac->ac_buddy_page); 1553 get_page(ac->ac_buddy_page);
1609 /* on allocation we use ac to track the held semaphore */
1610 ac->alloc_semp = e4b->alloc_semp;
1611 e4b->alloc_semp = NULL;
1612 /* store last allocated for subsequent stream allocation */ 1554 /* store last allocated for subsequent stream allocation */
1613 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 1555 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1614 spin_lock(&sbi->s_md_lock); 1556 spin_lock(&sbi->s_md_lock);
@@ -2659,7 +2601,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2659 struct super_block *sb = journal->j_private; 2601 struct super_block *sb = journal->j_private;
2660 struct ext4_buddy e4b; 2602 struct ext4_buddy e4b;
2661 struct ext4_group_info *db; 2603 struct ext4_group_info *db;
2662 int err, ret, count = 0, count2 = 0; 2604 int err, count = 0, count2 = 0;
2663 struct ext4_free_data *entry; 2605 struct ext4_free_data *entry;
2664 struct list_head *l, *ltmp; 2606 struct list_head *l, *ltmp;
2665 2607
@@ -2669,15 +2611,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2669 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2611 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2670 entry->count, entry->group, entry); 2612 entry->count, entry->group, entry);
2671 2613
2672 if (test_opt(sb, DISCARD)) { 2614 if (test_opt(sb, DISCARD))
2673 ret = ext4_issue_discard(sb, entry->group, 2615 ext4_issue_discard(sb, entry->group,
2674 entry->start_blk, entry->count); 2616 entry->start_blk, entry->count);
2675 if (unlikely(ret == -EOPNOTSUPP)) {
2676 ext4_warning(sb, "discard not supported, "
2677 "disabling");
2678 clear_opt(sb, DISCARD);
2679 }
2680 }
2681 2617
2682 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2618 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2683 /* we expect to find existing buddy because it's pinned */ 2619 /* we expect to find existing buddy because it's pinned */
@@ -4226,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4226 spin_unlock(&pa->pa_lock); 4162 spin_unlock(&pa->pa_lock);
4227 } 4163 }
4228 } 4164 }
4229 if (ac->alloc_semp)
4230 up_read(ac->alloc_semp);
4231 if (pa) { 4165 if (pa) {
4232 /* 4166 /*
4233 * We want to add the pa to the right bucket. 4167 * We want to add the pa to the right bucket.
4234 * Remove it from the list and while adding 4168 * Remove it from the list and while adding
4235 * make sure the list to which we are adding 4169 * make sure the list to which we are adding
4236 * doesn't grow big. We need to release 4170 * doesn't grow big.
4237 * alloc_semp before calling ext4_mb_add_n_trim()
4238 */ 4171 */
4239 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { 4172 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4240 spin_lock(pa->pa_obj_lock); 4173 spin_lock(pa->pa_obj_lock);
@@ -4303,7 +4236,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4303 * there is enough free blocks to do block allocation 4236 * there is enough free blocks to do block allocation
4304 * and verify allocation doesn't exceed the quota limits. 4237 * and verify allocation doesn't exceed the quota limits.
4305 */ 4238 */
4306 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { 4239 while (ar->len &&
4240 ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
4241
4307 /* let others to free the space */ 4242 /* let others to free the space */
4308 yield(); 4243 yield();
4309 ar->len = ar->len >> 1; 4244 ar->len = ar->len >> 1;
@@ -4313,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4313 return 0; 4248 return 0;
4314 } 4249 }
4315 reserv_blks = ar->len; 4250 reserv_blks = ar->len;
4316 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { 4251 if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4317 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4252 dquot_alloc_block_nofail(ar->inode, ar->len);
4318 ar->len--; 4253 } else {
4254 while (ar->len &&
4255 dquot_alloc_block(ar->inode, ar->len)) {
4256
4257 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4258 ar->len--;
4259 }
4319 } 4260 }
4320 inquota = ar->len; 4261 inquota = ar->len;
4321 if (ar->len == 0) { 4262 if (ar->len == 0) {
@@ -4704,6 +4645,127 @@ error_return:
4704} 4645}
4705 4646
4706/** 4647/**
4648 * ext4_add_groupblocks() -- Add given blocks to an existing group
4649 * @handle: handle to this transaction
4650 * @sb: super block
4651 * @block: start physical block to add to the block group
4652 * @count: number of blocks to free
4653 *
4654 * This marks the blocks as free in the bitmap and buddy.
4655 */
4656void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4657 ext4_fsblk_t block, unsigned long count)
4658{
4659 struct buffer_head *bitmap_bh = NULL;
4660 struct buffer_head *gd_bh;
4661 ext4_group_t block_group;
4662 ext4_grpblk_t bit;
4663 unsigned int i;
4664 struct ext4_group_desc *desc;
4665 struct ext4_sb_info *sbi = EXT4_SB(sb);
4666 struct ext4_buddy e4b;
4667 int err = 0, ret, blk_free_count;
4668 ext4_grpblk_t blocks_freed;
4669 struct ext4_group_info *grp;
4670
4671 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4672
4673 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4674 grp = ext4_get_group_info(sb, block_group);
4675 /*
4676 * Check to see if we are freeing blocks across a group
4677 * boundary.
4678 */
4679 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
4680 goto error_return;
4681
4682 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4683 if (!bitmap_bh)
4684 goto error_return;
4685 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4686 if (!desc)
4687 goto error_return;
4688
4689 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4690 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4691 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4692 in_range(block + count - 1, ext4_inode_table(sb, desc),
4693 sbi->s_itb_per_group)) {
4694 ext4_error(sb, "Adding blocks in system zones - "
4695 "Block = %llu, count = %lu",
4696 block, count);
4697 goto error_return;
4698 }
4699
4700 BUFFER_TRACE(bitmap_bh, "getting write access");
4701 err = ext4_journal_get_write_access(handle, bitmap_bh);
4702 if (err)
4703 goto error_return;
4704
4705 /*
4706 * We are about to modify some metadata. Call the journal APIs
4707 * to unshare ->b_data if a currently-committing transaction is
4708 * using it
4709 */
4710 BUFFER_TRACE(gd_bh, "get_write_access");
4711 err = ext4_journal_get_write_access(handle, gd_bh);
4712 if (err)
4713 goto error_return;
4714
4715 for (i = 0, blocks_freed = 0; i < count; i++) {
4716 BUFFER_TRACE(bitmap_bh, "clear bit");
4717 if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
4718 ext4_error(sb, "bit already cleared for block %llu",
4719 (ext4_fsblk_t)(block + i));
4720 BUFFER_TRACE(bitmap_bh, "bit already cleared");
4721 } else {
4722 blocks_freed++;
4723 }
4724 }
4725
4726 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4727 if (err)
4728 goto error_return;
4729
4730 /*
4731 * need to update group_info->bb_free and bitmap
4732 * with group lock held. generate_buddy looks at
4733 * them with the group lock held
4734 */
4735 ext4_lock_group(sb, block_group);
4736 mb_clear_bits(bitmap_bh->b_data, bit, count);
4737 mb_free_blocks(NULL, &e4b, bit, count);
4738 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
4739 ext4_free_blks_set(sb, desc, blk_free_count);
4740 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
4741 ext4_unlock_group(sb, block_group);
4742 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
4743
4744 if (sbi->s_log_groups_per_flex) {
4745 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4746 atomic_add(blocks_freed,
4747 &sbi->s_flex_groups[flex_group].free_blocks);
4748 }
4749
4750 ext4_mb_unload_buddy(&e4b);
4751
4752 /* We dirtied the bitmap block */
4753 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4754 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4755
4756 /* And the group descriptor block */
4757 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4758 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4759 if (!err)
4760 err = ret;
4761
4762error_return:
4763 brelse(bitmap_bh);
4764 ext4_std_error(sb, err);
4765 return;
4766}
4767
4768/**
4707 * ext4_trim_extent -- function to TRIM one single free extent in the group 4769 * ext4_trim_extent -- function to TRIM one single free extent in the group
4708 * @sb: super block for the file system 4770 * @sb: super block for the file system
4709 * @start: starting block of the free extent in the alloc. group 4771 * @start: starting block of the free extent in the alloc. group
@@ -4715,11 +4777,10 @@ error_return:
4715 * one will allocate those blocks, mark it as used in buddy bitmap. This must 4777 * one will allocate those blocks, mark it as used in buddy bitmap. This must
4716 * be called under the group lock. 4778 * be called under the group lock.
4717 */ 4779 */
4718static int ext4_trim_extent(struct super_block *sb, int start, int count, 4780static void ext4_trim_extent(struct super_block *sb, int start, int count,
4719 ext4_group_t group, struct ext4_buddy *e4b) 4781 ext4_group_t group, struct ext4_buddy *e4b)
4720{ 4782{
4721 struct ext4_free_extent ex; 4783 struct ext4_free_extent ex;
4722 int ret = 0;
4723 4784
4724 assert_spin_locked(ext4_group_lock_ptr(sb, group)); 4785 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4725 4786
@@ -4733,12 +4794,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4733 */ 4794 */
4734 mb_mark_used(e4b, &ex); 4795 mb_mark_used(e4b, &ex);
4735 ext4_unlock_group(sb, group); 4796 ext4_unlock_group(sb, group);
4736 4797 ext4_issue_discard(sb, group, start, count);
4737 ret = ext4_issue_discard(sb, group, start, count);
4738
4739 ext4_lock_group(sb, group); 4798 ext4_lock_group(sb, group);
4740 mb_free_blocks(NULL, e4b, start, ex.fe_len); 4799 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4741 return ret;
4742} 4800}
4743 4801
4744/** 4802/**
@@ -4760,21 +4818,26 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4760 * the group buddy bitmap. This is done until whole group is scanned. 4818 * the group buddy bitmap. This is done until whole group is scanned.
4761 */ 4819 */
4762static ext4_grpblk_t 4820static ext4_grpblk_t
4763ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, 4821ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4764 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) 4822 ext4_grpblk_t start, ext4_grpblk_t max,
4823 ext4_grpblk_t minblocks)
4765{ 4824{
4766 void *bitmap; 4825 void *bitmap;
4767 ext4_grpblk_t next, count = 0; 4826 ext4_grpblk_t next, count = 0;
4768 ext4_group_t group; 4827 struct ext4_buddy e4b;
4769 int ret = 0; 4828 int ret;
4770 4829
4771 BUG_ON(e4b == NULL); 4830 ret = ext4_mb_load_buddy(sb, group, &e4b);
4831 if (ret) {
4832 ext4_error(sb, "Error in loading buddy "
4833 "information for %u", group);
4834 return ret;
4835 }
4836 bitmap = e4b.bd_bitmap;
4772 4837
4773 bitmap = e4b->bd_bitmap;
4774 group = e4b->bd_group;
4775 start = (e4b->bd_info->bb_first_free > start) ?
4776 e4b->bd_info->bb_first_free : start;
4777 ext4_lock_group(sb, group); 4838 ext4_lock_group(sb, group);
4839 start = (e4b.bd_info->bb_first_free > start) ?
4840 e4b.bd_info->bb_first_free : start;
4778 4841
4779 while (start < max) { 4842 while (start < max) {
4780 start = mb_find_next_zero_bit(bitmap, max, start); 4843 start = mb_find_next_zero_bit(bitmap, max, start);
@@ -4783,10 +4846,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4783 next = mb_find_next_bit(bitmap, max, start); 4846 next = mb_find_next_bit(bitmap, max, start);
4784 4847
4785 if ((next - start) >= minblocks) { 4848 if ((next - start) >= minblocks) {
4786 ret = ext4_trim_extent(sb, start, 4849 ext4_trim_extent(sb, start,
4787 next - start, group, e4b); 4850 next - start, group, &e4b);
4788 if (ret < 0)
4789 break;
4790 count += next - start; 4851 count += next - start;
4791 } 4852 }
4792 start = next + 1; 4853 start = next + 1;
@@ -4802,17 +4863,15 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4802 ext4_lock_group(sb, group); 4863 ext4_lock_group(sb, group);
4803 } 4864 }
4804 4865
4805 if ((e4b->bd_info->bb_free - count) < minblocks) 4866 if ((e4b.bd_info->bb_free - count) < minblocks)
4806 break; 4867 break;
4807 } 4868 }
4808 ext4_unlock_group(sb, group); 4869 ext4_unlock_group(sb, group);
4870 ext4_mb_unload_buddy(&e4b);
4809 4871
4810 ext4_debug("trimmed %d blocks in the group %d\n", 4872 ext4_debug("trimmed %d blocks in the group %d\n",
4811 count, group); 4873 count, group);
4812 4874
4813 if (ret < 0)
4814 count = ret;
4815
4816 return count; 4875 return count;
4817} 4876}
4818 4877
@@ -4830,11 +4889,11 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4830 */ 4889 */
4831int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 4890int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4832{ 4891{
4833 struct ext4_buddy e4b; 4892 struct ext4_group_info *grp;
4834 ext4_group_t first_group, last_group; 4893 ext4_group_t first_group, last_group;
4835 ext4_group_t group, ngroups = ext4_get_groups_count(sb); 4894 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4836 ext4_grpblk_t cnt = 0, first_block, last_block; 4895 ext4_grpblk_t cnt = 0, first_block, last_block;
4837 uint64_t start, len, minlen, trimmed; 4896 uint64_t start, len, minlen, trimmed = 0;
4838 ext4_fsblk_t first_data_blk = 4897 ext4_fsblk_t first_data_blk =
4839 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 4898 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4840 int ret = 0; 4899 int ret = 0;
@@ -4842,7 +4901,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4842 start = range->start >> sb->s_blocksize_bits; 4901 start = range->start >> sb->s_blocksize_bits;
4843 len = range->len >> sb->s_blocksize_bits; 4902 len = range->len >> sb->s_blocksize_bits;
4844 minlen = range->minlen >> sb->s_blocksize_bits; 4903 minlen = range->minlen >> sb->s_blocksize_bits;
4845 trimmed = 0;
4846 4904
4847 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) 4905 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4848 return -EINVAL; 4906 return -EINVAL;
@@ -4863,11 +4921,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4863 return -EINVAL; 4921 return -EINVAL;
4864 4922
4865 for (group = first_group; group <= last_group; group++) { 4923 for (group = first_group; group <= last_group; group++) {
4866 ret = ext4_mb_load_buddy(sb, group, &e4b); 4924 grp = ext4_get_group_info(sb, group);
4867 if (ret) { 4925 /* We only do this if the grp has never been initialized */
4868 ext4_error(sb, "Error in loading buddy " 4926 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
4869 "information for %u", group); 4927 ret = ext4_mb_init_group(sb, group);
4870 break; 4928 if (ret)
4929 break;
4871 } 4930 }
4872 4931
4873 /* 4932 /*
@@ -4880,16 +4939,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4880 last_block = first_block + len; 4939 last_block = first_block + len;
4881 len -= last_block - first_block; 4940 len -= last_block - first_block;
4882 4941
4883 if (e4b.bd_info->bb_free >= minlen) { 4942 if (grp->bb_free >= minlen) {
4884 cnt = ext4_trim_all_free(sb, &e4b, first_block, 4943 cnt = ext4_trim_all_free(sb, group, first_block,
4885 last_block, minlen); 4944 last_block, minlen);
4886 if (cnt < 0) { 4945 if (cnt < 0) {
4887 ret = cnt; 4946 ret = cnt;
4888 ext4_mb_unload_buddy(&e4b);
4889 break; 4947 break;
4890 } 4948 }
4891 } 4949 }
4892 ext4_mb_unload_buddy(&e4b);
4893 trimmed += cnt; 4950 trimmed += cnt;
4894 first_block = 0; 4951 first_block = 0;
4895 } 4952 }
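
The reworked ext4_trim_fs() above is the kernel-side handler reached through the FITRIM ioctl. Purely as a usage illustration (not part of this patch), a userspace caller that trims a mounted ext4 filesystem might look roughly like the sketch below; the mount-point argument is a placeholder:

/* Hypothetical FITRIM caller; exercises ext4_trim_fs() via the VFS ioctl. */
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>			/* FITRIM, struct fstrim_range */

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.len = ULLONG_MAX;		/* consider the whole filesystem */
	range.minlen = 0;		/* no minimum free-extent length */

	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		close(fd);
		return 1;
	}

	/* On return the kernel stores the number of bytes it discarded. */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}
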
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 22bd4d7f289b..20b5e7bfebd1 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -193,11 +193,6 @@ struct ext4_allocation_context {
193 __u8 ac_op; /* operation, for history only */ 193 __u8 ac_op; /* operation, for history only */
194 struct page *ac_bitmap_page; 194 struct page *ac_bitmap_page;
195 struct page *ac_buddy_page; 195 struct page *ac_buddy_page;
196 /*
197 * pointer to the held semaphore upon successful
198 * block allocation
199 */
200 struct rw_semaphore *alloc_semp;
201 struct ext4_prealloc_space *ac_pa; 196 struct ext4_prealloc_space *ac_pa;
202 struct ext4_locality_group *ac_lg; 197 struct ext4_locality_group *ac_lg;
203}; 198};
@@ -215,7 +210,6 @@ struct ext4_buddy {
215 struct super_block *bd_sb; 210 struct super_block *bd_sb;
216 __u16 bd_blkbits; 211 __u16 bd_blkbits;
217 ext4_group_t bd_group; 212 ext4_group_t bd_group;
218 struct rw_semaphore *alloc_semp;
219}; 213};
220#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) 214#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
221#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) 215#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 92816b4e0f16..b57b98fb44d1 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
376 * We have the extent map built with the tmp inode. 376 * We have the extent map built with the tmp inode.
377 * Now copy the i_data across 377 * Now copy the i_data across
378 */ 378 */
379 ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); 379 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); 380 memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
381 381
382 /* 382 /*
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 000000000000..9bdef3f537c5
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,351 @@
1#include <linux/fs.h>
2#include <linux/random.h>
3#include <linux/buffer_head.h>
4#include <linux/utsname.h>
5#include <linux/kthread.h>
6
7#include "ext4.h"
8
9/*
10 * Write the MMP block using WRITE_SYNC to try to get the block on-disk
11 * faster.
12 */
13static int write_mmp_block(struct buffer_head *bh)
14{
15 mark_buffer_dirty(bh);
16 lock_buffer(bh);
17 bh->b_end_io = end_buffer_write_sync;
18 get_bh(bh);
19 submit_bh(WRITE_SYNC, bh);
20 wait_on_buffer(bh);
21 if (unlikely(!buffer_uptodate(bh)))
22 return 1;
23
24 return 0;
25}
26
27/*
28 * Read the MMP block. It _must_ be read from disk and hence we clear the
29 * uptodate flag on the buffer.
30 */
31static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
32 ext4_fsblk_t mmp_block)
33{
34 struct mmp_struct *mmp;
35
36 if (*bh)
37 clear_buffer_uptodate(*bh);
38
39 /* This would be sb_bread(sb, mmp_block), except we need to be sure
40 * that the MD RAID device cache has been bypassed, and that the read
41 * is not blocked in the elevator. */
42 if (!*bh)
43 *bh = sb_getblk(sb, mmp_block);
44 if (*bh) {
45 get_bh(*bh);
46 lock_buffer(*bh);
47 (*bh)->b_end_io = end_buffer_read_sync;
48 submit_bh(READ_SYNC, *bh);
49 wait_on_buffer(*bh);
50 if (!buffer_uptodate(*bh)) {
51 brelse(*bh);
52 *bh = NULL;
53 }
54 }
55 if (!*bh) {
56 ext4_warning(sb, "Error while reading MMP block %llu",
57 mmp_block);
58 return -EIO;
59 }
60
61 mmp = (struct mmp_struct *)((*bh)->b_data);
62 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
63 return -EINVAL;
64
65 return 0;
66}
67
68/*
69 * Dump as much information as possible to help the admin.
70 */
71void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
72 const char *function, unsigned int line, const char *msg)
73{
74 __ext4_warning(sb, function, line, msg);
75 __ext4_warning(sb, function, line,
76 "MMP failure info: last update time: %llu, last update "
77 "node: %s, last update device: %s\n",
78 (long long unsigned int) le64_to_cpu(mmp->mmp_time),
79 mmp->mmp_nodename, mmp->mmp_bdevname);
80}
81
82/*
83 * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
84 */
85static int kmmpd(void *data)
86{
87 struct super_block *sb = ((struct mmpd_data *) data)->sb;
88 struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
89 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
90 struct mmp_struct *mmp;
91 ext4_fsblk_t mmp_block;
92 u32 seq = 0;
93 unsigned long failed_writes = 0;
94 int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
95 unsigned mmp_check_interval;
96 unsigned long last_update_time;
97 unsigned long diff;
98 int retval;
99
100 mmp_block = le64_to_cpu(es->s_mmp_block);
101 mmp = (struct mmp_struct *)(bh->b_data);
102 mmp->mmp_time = cpu_to_le64(get_seconds());
103 /*
104 * Start with the higher mmp_check_interval and reduce it if
105 * the MMP block is being updated on time.
106 */
107 mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
108 EXT4_MMP_MIN_CHECK_INTERVAL);
109 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
110 bdevname(bh->b_bdev, mmp->mmp_bdevname);
111
112 memcpy(mmp->mmp_nodename, init_utsname()->sysname,
113 sizeof(mmp->mmp_nodename));
114
115 while (!kthread_should_stop()) {
116 if (++seq > EXT4_MMP_SEQ_MAX)
117 seq = 1;
118
119 mmp->mmp_seq = cpu_to_le32(seq);
120 mmp->mmp_time = cpu_to_le64(get_seconds());
121 last_update_time = jiffies;
122
123 retval = write_mmp_block(bh);
124 /*
125 * Don't spew too many error messages. Print one every
126 * (s_mmp_update_interval * 60) seconds.
127 */
128 if (retval && (failed_writes % 60) == 0) {
129 ext4_error(sb, "Error writing to MMP block");
130 failed_writes++;
131 }
132
133 if (!(le32_to_cpu(es->s_feature_incompat) &
134 EXT4_FEATURE_INCOMPAT_MMP)) {
135 ext4_warning(sb, "kmmpd being stopped since MMP feature"
136 " has been disabled.");
137 EXT4_SB(sb)->s_mmp_tsk = NULL;
138 goto failed;
139 }
140
141 if (sb->s_flags & MS_RDONLY) {
142 ext4_warning(sb, "kmmpd being stopped since filesystem "
143 "has been remounted as readonly.");
144 EXT4_SB(sb)->s_mmp_tsk = NULL;
145 goto failed;
146 }
147
148 diff = jiffies - last_update_time;
149 if (diff < mmp_update_interval * HZ)
150 schedule_timeout_interruptible(mmp_update_interval *
151 HZ - diff);
152
153 /*
154 * We need to make sure that more than mmp_check_interval
155 * seconds have not passed since writing. If that has happened
156 * we need to check if the MMP block is as we left it.
157 */
158 diff = jiffies - last_update_time;
159 if (diff > mmp_check_interval * HZ) {
160 struct buffer_head *bh_check = NULL;
161 struct mmp_struct *mmp_check;
162
163 retval = read_mmp_block(sb, &bh_check, mmp_block);
164 if (retval) {
165 ext4_error(sb, "error reading MMP data: %d",
166 retval);
167
168 EXT4_SB(sb)->s_mmp_tsk = NULL;
169 goto failed;
170 }
171
172 mmp_check = (struct mmp_struct *)(bh_check->b_data);
173 if (mmp->mmp_seq != mmp_check->mmp_seq ||
174 memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
175 sizeof(mmp->mmp_nodename))) {
176 dump_mmp_msg(sb, mmp_check,
177 "Error while updating MMP info. "
178 "The filesystem seems to have been"
179 " multiply mounted.");
180 ext4_error(sb, "abort");
181 goto failed;
182 }
183 put_bh(bh_check);
184 }
185
186 /*
187 * Adjust the mmp_check_interval depending on how much time
188 * it took for the MMP block to be written.
189 */
190 mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
191 EXT4_MMP_MAX_CHECK_INTERVAL),
192 EXT4_MMP_MIN_CHECK_INTERVAL);
193 mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
194 }
195
196 /*
197 * Unmount seems to be clean.
198 */
199 mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
200 mmp->mmp_time = cpu_to_le64(get_seconds());
201
202 retval = write_mmp_block(bh);
203
204failed:
205 kfree(data);
206 brelse(bh);
207 return retval;
208}
209
210/*
211 * Get a random new sequence number but make sure it is not greater than
212 * EXT4_MMP_SEQ_MAX.
213 */
214static unsigned int mmp_new_seq(void)
215{
216 u32 new_seq;
217
218 do {
219 get_random_bytes(&new_seq, sizeof(u32));
220 } while (new_seq > EXT4_MMP_SEQ_MAX);
221
222 return new_seq;
223}
224
225/*
226 * Protect the filesystem from being mounted more than once.
227 */
228int ext4_multi_mount_protect(struct super_block *sb,
229 ext4_fsblk_t mmp_block)
230{
231 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
232 struct buffer_head *bh = NULL;
233 struct mmp_struct *mmp = NULL;
234 struct mmpd_data *mmpd_data;
235 u32 seq;
236 unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
237 unsigned int wait_time = 0;
238 int retval;
239
240 if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
241 mmp_block >= ext4_blocks_count(es)) {
242 ext4_warning(sb, "Invalid MMP block in superblock");
243 goto failed;
244 }
245
246 retval = read_mmp_block(sb, &bh, mmp_block);
247 if (retval)
248 goto failed;
249
250 mmp = (struct mmp_struct *)(bh->b_data);
251
252 if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
253 mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
254
255 /*
256 * If check_interval in MMP block is larger, use that instead of
257 * update_interval from the superblock.
258 */
259 if (mmp->mmp_check_interval > mmp_check_interval)
260 mmp_check_interval = mmp->mmp_check_interval;
261
262 seq = le32_to_cpu(mmp->mmp_seq);
263 if (seq == EXT4_MMP_SEQ_CLEAN)
264 goto skip;
265
266 if (seq == EXT4_MMP_SEQ_FSCK) {
267 dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
268 goto failed;
269 }
270
271 wait_time = min(mmp_check_interval * 2 + 1,
272 mmp_check_interval + 60);
273
274 /* Print MMP interval if more than 20 secs. */
275 if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
276 ext4_warning(sb, "MMP interval %u higher than expected, please"
277 " wait.\n", wait_time * 2);
278
279 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
280 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
281 goto failed;
282 }
283
284 retval = read_mmp_block(sb, &bh, mmp_block);
285 if (retval)
286 goto failed;
287 mmp = (struct mmp_struct *)(bh->b_data);
288 if (seq != le32_to_cpu(mmp->mmp_seq)) {
289 dump_mmp_msg(sb, mmp,
290 "Device is already active on another node.");
291 goto failed;
292 }
293
294skip:
295 /*
296 * write a new random sequence number.
297 */
298 mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
299
300 retval = write_mmp_block(bh);
301 if (retval)
302 goto failed;
303
304 /*
305 * wait for MMP interval and check mmp_seq.
306 */
307 if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
308 ext4_warning(sb, "MMP startup interrupted, failing mount\n");
309 goto failed;
310 }
311
312 retval = read_mmp_block(sb, &bh, mmp_block);
313 if (retval)
314 goto failed;
315 mmp = (struct mmp_struct *)(bh->b_data);
316 if (seq != le32_to_cpu(mmp->mmp_seq)) {
317 dump_mmp_msg(sb, mmp,
318 "Device is already active on another node.");
319 goto failed;
320 }
321
322 mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
323 if (!mmpd_data) {
324 ext4_warning(sb, "not enough memory for mmpd_data");
325 goto failed;
326 }
327 mmpd_data->sb = sb;
328 mmpd_data->bh = bh;
329
330 /*
331 * Start a kernel thread to update the MMP block periodically.
332 */
333 EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
334 bdevname(bh->b_bdev,
335 mmp->mmp_bdevname));
336 if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
337 EXT4_SB(sb)->s_mmp_tsk = NULL;
338 kfree(mmpd_data);
339 ext4_warning(sb, "Unable to create kmmpd thread for %s.",
340 sb->s_id);
341 goto failed;
342 }
343
344 return 0;
345
346failed:
347 brelse(bh);
348 return 1;
349}
350
351
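
The new mmp.c above implements multiple-mount protection: at mount time the node samples the on-disk MMP sequence, waits one check interval, and re-reads it to detect a live kmmpd on another node before publishing its own random sequence. The following is a simplified, self-contained model of that handshake; every name in it (read_seq, write_seq, SEQ_CLEAN, the sleep call, the module layout) is a stand-in invented for illustration, not an ext4 or kernel API:

/* Toy single-process model of the ext4_multi_mount_protect() handshake. */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

#define SEQ_CLEAN 0u			/* stand-in for the "cleanly unmounted" marker */

static unsigned int on_disk_seq = SEQ_CLEAN;	/* models the mmp_seq field */

static unsigned int read_seq(void)	{ return on_disk_seq; }
static void write_seq(unsigned int s)	{ on_disk_seq = s; }

/* Returns 0 when the device looks unused, 1 when another node owns it. */
static int mmp_handshake(unsigned int check_interval)
{
	unsigned int seq = read_seq();

	if (seq != SEQ_CLEAN) {
		/* A running kmmpd elsewhere would bump the sequence while we wait. */
		sleep(check_interval);
		if (read_seq() != seq)
			return 1;
	}

	/* Publish our own random sequence, wait, and verify nobody overwrote it. */
	seq = (unsigned int)rand() + 1;
	write_seq(seq);
	sleep(check_interval);
	return read_seq() != seq;
}

int main(void)
{
	srand((unsigned int)time(NULL));
	printf("device busy: %d\n", mmp_handshake(1));
	return 0;
}
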
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b9f3e7862f13..2b8304bf3c50 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
876 * It needs to call wait_on_page_writeback() to wait for the 876 * It needs to call wait_on_page_writeback() to wait for the
877 * writeback of the page. 877 * writeback of the page.
878 */ 878 */
879 if (PageWriteback(page)) 879 wait_on_page_writeback(page);
880 wait_on_page_writeback(page);
881 880
882 /* Release old bh and drop refs */ 881 /* Release old bh and drop refs */
883 try_to_release_page(page, 0); 882 try_to_release_page(page, 0);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 67fd0b025858..b754b7721f51 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1413 frame->at = entries; 1413 frame->at = entries;
1414 frame->bh = bh; 1414 frame->bh = bh;
1415 bh = bh2; 1415 bh = bh2;
1416
1417 ext4_handle_dirty_metadata(handle, dir, frame->bh);
1418 ext4_handle_dirty_metadata(handle, dir, bh);
1419
1416 de = do_split(handle,dir, &bh, frame, &hinfo, &retval); 1420 de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1417 dx_release (frames); 1421 if (!de) {
1418 if (!(de)) 1422 /*
1423 * Even if the block split failed, we have to properly write
1424 * out all the changes we did so far. Otherwise we can end up
1425 * with a corrupted filesystem.
1426 */
1427 ext4_mark_inode_dirty(handle, dir);
1428 dx_release(frames);
1419 return retval; 1429 return retval;
1430 }
1431 dx_release(frames);
1420 1432
1421 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1433 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1422 brelse(bh); 1434 brelse(bh);
@@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir,
2240 handle_t *handle; 2252 handle_t *handle;
2241 struct inode *inode; 2253 struct inode *inode;
2242 int l, err, retries = 0; 2254 int l, err, retries = 0;
2255 int credits;
2243 2256
2244 l = strlen(symname)+1; 2257 l = strlen(symname)+1;
2245 if (l > dir->i_sb->s_blocksize) 2258 if (l > dir->i_sb->s_blocksize)
@@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir,
2247 2260
2248 dquot_initialize(dir); 2261 dquot_initialize(dir);
2249 2262
2263 if (l > EXT4_N_BLOCKS * 4) {
2264 /*
2265 * For non-fast symlinks, we just allocate inode and put it on
2266 * orphan list in the first transaction => we need bitmap,
2267 * group descriptor, sb, inode block, quota blocks.
2268 */
2269 credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2270 } else {
2271 /*
2272 * Fast symlink. We have to add entry to directory
2273 * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
2274 * allocate new inode (bitmap, group descriptor, inode block,
2275 * quota blocks, sb is already counted in previous macros).
2276 */
2277 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2278 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
2279 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
2280 }
2250retry: 2281retry:
2251 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2282 handle = ext4_journal_start(dir, credits);
2252 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2253 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2254 if (IS_ERR(handle)) 2283 if (IS_ERR(handle))
2255 return PTR_ERR(handle); 2284 return PTR_ERR(handle);
2256 2285
@@ -2263,21 +2292,44 @@ retry:
2263 if (IS_ERR(inode)) 2292 if (IS_ERR(inode))
2264 goto out_stop; 2293 goto out_stop;
2265 2294
2266 if (l > sizeof(EXT4_I(inode)->i_data)) { 2295 if (l > EXT4_N_BLOCKS * 4) {
2267 inode->i_op = &ext4_symlink_inode_operations; 2296 inode->i_op = &ext4_symlink_inode_operations;
2268 ext4_set_aops(inode); 2297 ext4_set_aops(inode);
2269 /* 2298 /*
2270 * page_symlink() calls into ext4_prepare/commit_write. 2299 * We cannot call page_symlink() with transaction started
2271 * We have a transaction open. All is sweetness. It also sets 2300 * because it calls into ext4_write_begin() which can wait
2272 * i_size in generic_commit_write(). 2301 * for transaction commit if we are running out of space
2302 * and thus we deadlock. So we have to stop the transaction now
2303 * and restart it once the symlink contents are written.
2304 *
2305 * To keep the fs consistent in case of a crash, we have to put the inode
2306 * on the orphan list in the meantime.
2273 */ 2307 */
2308 drop_nlink(inode);
2309 err = ext4_orphan_add(handle, inode);
2310 ext4_journal_stop(handle);
2311 if (err)
2312 goto err_drop_inode;
2274 err = __page_symlink(inode, symname, l, 1); 2313 err = __page_symlink(inode, symname, l, 1);
2314 if (err)
2315 goto err_drop_inode;
2316 /*
2317 * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
2318 * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
2319 */
2320 handle = ext4_journal_start(dir,
2321 EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2322 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
2323 if (IS_ERR(handle)) {
2324 err = PTR_ERR(handle);
2325 goto err_drop_inode;
2326 }
2327 inc_nlink(inode);
2328 err = ext4_orphan_del(handle, inode);
2275 if (err) { 2329 if (err) {
2330 ext4_journal_stop(handle);
2276 clear_nlink(inode); 2331 clear_nlink(inode);
2277 unlock_new_inode(inode); 2332 goto err_drop_inode;
2278 ext4_mark_inode_dirty(handle, inode);
2279 iput(inode);
2280 goto out_stop;
2281 } 2333 }
2282 } else { 2334 } else {
2283 /* clear the extent format for fast symlink */ 2335 /* clear the extent format for fast symlink */
@@ -2293,6 +2345,10 @@ out_stop:
2293 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 2345 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2294 goto retry; 2346 goto retry;
2295 return err; 2347 return err;
2348err_drop_inode:
2349 unlock_new_inode(inode);
2350 iput(inode);
2351 return err;
2296} 2352}
2297 2353
2298static int ext4_link(struct dentry *old_dentry, 2354static int ext4_link(struct dentry *old_dentry,
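
The comment block added to ext4_symlink() above explains why long (non-fast) symlinks are now created in two transactions with the inode parked on the orphan list in between. Purely as a reading aid, here is a toy model of that ordering; the journal_* and orphan_* helpers are invented stubs, not ext4 functions:

/* Toy outline of the two-transaction scheme for long (non-fast) symlinks. */
#include <stdio.h>

static void journal_start(const char *what)	{ printf("start txn: %s\n", what); }
static void journal_stop(void)			{ printf("stop txn\n"); }
static void orphan_add(void)			{ printf("inode added to orphan list\n"); }
static void orphan_del(void)			{ printf("inode removed from orphan list\n"); }

static void create_long_symlink(void)
{
	/* Txn 1: allocate the inode and park it on the orphan list, so a crash
	 * before the directory entry exists lets recovery free it. */
	journal_start("allocate inode");
	orphan_add();
	journal_stop();

	/* The symlink body is written with no transaction held, avoiding the
	 * wait-for-commit deadlock described in the comment above. */
	printf("write symlink target via the page cache\n");

	/* Txn 2: take the inode off the orphan list and link it into the dir. */
	journal_start("add directory entry");
	orphan_del();
	printf("create directory entry, mark inode dirty\n");
	journal_stop();
}

int main(void)
{
	create_long_symlink();
	return 0;
}
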
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b6dbd056fcb1..7bb8f76d470a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error)
203 for (i = 0; i < io_end->num_io_pages; i++) { 203 for (i = 0; i < io_end->num_io_pages; i++) {
204 struct page *page = io_end->pages[i]->p_page; 204 struct page *page = io_end->pages[i]->p_page;
205 struct buffer_head *bh, *head; 205 struct buffer_head *bh, *head;
206 int partial_write = 0; 206 loff_t offset;
207 loff_t io_end_offset;
207 208
208 head = page_buffers(page); 209 if (error) {
209 if (error)
210 SetPageError(page); 210 SetPageError(page);
211 BUG_ON(!head); 211 set_bit(AS_EIO, &page->mapping->flags);
212 if (head->b_size != PAGE_CACHE_SIZE) { 212 head = page_buffers(page);
213 loff_t offset; 213 BUG_ON(!head);
214 loff_t io_end_offset = io_end->offset + io_end->size; 214
215 io_end_offset = io_end->offset + io_end->size;
215 216
216 offset = (sector_t) page->index << PAGE_CACHE_SHIFT; 217 offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
217 bh = head; 218 bh = head;
218 do { 219 do {
219 if ((offset >= io_end->offset) && 220 if ((offset >= io_end->offset) &&
220 (offset+bh->b_size <= io_end_offset)) { 221 (offset+bh->b_size <= io_end_offset))
221 if (error) 222 buffer_io_error(bh);
222 buffer_io_error(bh); 223
223
224 }
225 if (buffer_delay(bh))
226 partial_write = 1;
227 else if (!buffer_mapped(bh))
228 clear_buffer_dirty(bh);
229 else if (buffer_dirty(bh))
230 partial_write = 1;
231 offset += bh->b_size; 224 offset += bh->b_size;
232 bh = bh->b_this_page; 225 bh = bh->b_this_page;
233 } while (bh != head); 226 } while (bh != head);
234 } 227 }
235 228
236 /*
237 * If this is a partial write which happened to make
238 * all buffers uptodate then we can optimize away a
239 * bogus readpage() for the next read(). Here we
240 * 'discover' whether the page went uptodate as a
241 * result of this (potentially partial) write.
242 */
243 if (!partial_write)
244 SetPageUptodate(page);
245
246 put_io_page(io_end->pages[i]); 229 put_io_page(io_end->pages[i]);
247 } 230 }
248 io_end->num_io_pages = 0; 231 io_end->num_io_pages = 0;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8553dfb310af..cc5c157aa11d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -38,6 +38,7 @@
38#include <linux/ctype.h> 38#include <linux/ctype.h>
39#include <linux/log2.h> 39#include <linux/log2.h>
40#include <linux/crc16.h> 40#include <linux/crc16.h>
41#include <linux/cleancache.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42 43
43#include <linux/kthread.h> 44#include <linux/kthread.h>
@@ -75,11 +76,27 @@ static void ext4_write_super(struct super_block *sb);
75static int ext4_freeze(struct super_block *sb); 76static int ext4_freeze(struct super_block *sb);
76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, 77static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
77 const char *dev_name, void *data); 78 const char *dev_name, void *data);
79static inline int ext2_feature_set_ok(struct super_block *sb);
80static inline int ext3_feature_set_ok(struct super_block *sb);
78static int ext4_feature_set_ok(struct super_block *sb, int readonly); 81static int ext4_feature_set_ok(struct super_block *sb, int readonly);
79static void ext4_destroy_lazyinit_thread(void); 82static void ext4_destroy_lazyinit_thread(void);
80static void ext4_unregister_li_request(struct super_block *sb); 83static void ext4_unregister_li_request(struct super_block *sb);
81static void ext4_clear_request_list(void); 84static void ext4_clear_request_list(void);
82 85
86#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
87static struct file_system_type ext2_fs_type = {
88 .owner = THIS_MODULE,
89 .name = "ext2",
90 .mount = ext4_mount,
91 .kill_sb = kill_block_super,
92 .fs_flags = FS_REQUIRES_DEV,
93};
94#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
95#else
96#define IS_EXT2_SB(sb) (0)
97#endif
98
99
83#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 100#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
84static struct file_system_type ext3_fs_type = { 101static struct file_system_type ext3_fs_type = {
85 .owner = THIS_MODULE, 102 .owner = THIS_MODULE,
@@ -806,6 +823,8 @@ static void ext4_put_super(struct super_block *sb)
806 invalidate_bdev(sbi->journal_bdev); 823 invalidate_bdev(sbi->journal_bdev);
807 ext4_blkdev_remove(sbi); 824 ext4_blkdev_remove(sbi);
808 } 825 }
826 if (sbi->s_mmp_tsk)
827 kthread_stop(sbi->s_mmp_tsk);
809 sb->s_fs_info = NULL; 828 sb->s_fs_info = NULL;
810 /* 829 /*
811 * Now that we are completely done shutting down the 830 * Now that we are completely done shutting down the
@@ -1096,7 +1115,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1096 1115
1097 if (!test_opt(sb, INIT_INODE_TABLE)) 1116 if (!test_opt(sb, INIT_INODE_TABLE))
1098 seq_puts(seq, ",noinit_inode_table"); 1117 seq_puts(seq, ",noinit_inode_table");
1099 else if (sbi->s_li_wait_mult) 1118 else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
1100 seq_printf(seq, ",init_inode_table=%u", 1119 seq_printf(seq, ",init_inode_table=%u",
1101 (unsigned) sbi->s_li_wait_mult); 1120 (unsigned) sbi->s_li_wait_mult);
1102 1121
@@ -1187,9 +1206,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
1187 const char *data, size_t len, loff_t off); 1206 const char *data, size_t len, loff_t off);
1188 1207
1189static const struct dquot_operations ext4_quota_operations = { 1208static const struct dquot_operations ext4_quota_operations = {
1190#ifdef CONFIG_QUOTA
1191 .get_reserved_space = ext4_get_reserved_space, 1209 .get_reserved_space = ext4_get_reserved_space,
1192#endif
1193 .write_dquot = ext4_write_dquot, 1210 .write_dquot = ext4_write_dquot,
1194 .acquire_dquot = ext4_acquire_dquot, 1211 .acquire_dquot = ext4_acquire_dquot,
1195 .release_dquot = ext4_release_dquot, 1212 .release_dquot = ext4_release_dquot,
@@ -1900,7 +1917,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1900 ext4_msg(sb, KERN_WARNING, 1917 ext4_msg(sb, KERN_WARNING,
1901 "warning: mounting fs with errors, " 1918 "warning: mounting fs with errors, "
1902 "running e2fsck is recommended"); 1919 "running e2fsck is recommended");
1903 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && 1920 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
1904 le16_to_cpu(es->s_mnt_count) >= 1921 le16_to_cpu(es->s_mnt_count) >=
1905 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) 1922 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
1906 ext4_msg(sb, KERN_WARNING, 1923 ext4_msg(sb, KERN_WARNING,
@@ -1932,6 +1949,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
1932 EXT4_INODES_PER_GROUP(sb), 1949 EXT4_INODES_PER_GROUP(sb),
1933 sbi->s_mount_opt, sbi->s_mount_opt2); 1950 sbi->s_mount_opt, sbi->s_mount_opt2);
1934 1951
1952 cleancache_init_fs(sb);
1935 return res; 1953 return res;
1936} 1954}
1937 1955
@@ -2425,6 +2443,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
2425 EXT4_SB(sb)->s_sectors_written_start) >> 1))); 2443 EXT4_SB(sb)->s_sectors_written_start) >> 1)));
2426} 2444}
2427 2445
2446static ssize_t extent_cache_hits_show(struct ext4_attr *a,
2447 struct ext4_sb_info *sbi, char *buf)
2448{
2449 return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
2450}
2451
2452static ssize_t extent_cache_misses_show(struct ext4_attr *a,
2453 struct ext4_sb_info *sbi, char *buf)
2454{
2455 return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
2456}
2457
2428static ssize_t inode_readahead_blks_store(struct ext4_attr *a, 2458static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2429 struct ext4_sb_info *sbi, 2459 struct ext4_sb_info *sbi,
2430 const char *buf, size_t count) 2460 const char *buf, size_t count)
@@ -2482,6 +2512,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2482EXT4_RO_ATTR(delayed_allocation_blocks); 2512EXT4_RO_ATTR(delayed_allocation_blocks);
2483EXT4_RO_ATTR(session_write_kbytes); 2513EXT4_RO_ATTR(session_write_kbytes);
2484EXT4_RO_ATTR(lifetime_write_kbytes); 2514EXT4_RO_ATTR(lifetime_write_kbytes);
2515EXT4_RO_ATTR(extent_cache_hits);
2516EXT4_RO_ATTR(extent_cache_misses);
2485EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, 2517EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
2486 inode_readahead_blks_store, s_inode_readahead_blks); 2518 inode_readahead_blks_store, s_inode_readahead_blks);
2487EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); 2519EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2497,6 +2529,8 @@ static struct attribute *ext4_attrs[] = {
2497 ATTR_LIST(delayed_allocation_blocks), 2529 ATTR_LIST(delayed_allocation_blocks),
2498 ATTR_LIST(session_write_kbytes), 2530 ATTR_LIST(session_write_kbytes),
2499 ATTR_LIST(lifetime_write_kbytes), 2531 ATTR_LIST(lifetime_write_kbytes),
2532 ATTR_LIST(extent_cache_hits),
2533 ATTR_LIST(extent_cache_misses),
2500 ATTR_LIST(inode_readahead_blks), 2534 ATTR_LIST(inode_readahead_blks),
2501 ATTR_LIST(inode_goal), 2535 ATTR_LIST(inode_goal),
2502 ATTR_LIST(mb_stats), 2536 ATTR_LIST(mb_stats),
@@ -2659,12 +2693,6 @@ static void print_daily_error_info(unsigned long arg)
2659 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ 2693 mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
2660} 2694}
2661 2695
2662static void ext4_lazyinode_timeout(unsigned long data)
2663{
2664 struct task_struct *p = (struct task_struct *)data;
2665 wake_up_process(p);
2666}
2667
2668/* Find next suitable group and run ext4_init_inode_table */ 2696/* Find next suitable group and run ext4_init_inode_table */
2669static int ext4_run_li_request(struct ext4_li_request *elr) 2697static int ext4_run_li_request(struct ext4_li_request *elr)
2670{ 2698{
@@ -2696,11 +2724,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
2696 ret = ext4_init_inode_table(sb, group, 2724 ret = ext4_init_inode_table(sb, group,
2697 elr->lr_timeout ? 0 : 1); 2725 elr->lr_timeout ? 0 : 1);
2698 if (elr->lr_timeout == 0) { 2726 if (elr->lr_timeout == 0) {
2699 timeout = jiffies - timeout; 2727 timeout = (jiffies - timeout) *
2700 if (elr->lr_sbi->s_li_wait_mult) 2728 elr->lr_sbi->s_li_wait_mult;
2701 timeout *= elr->lr_sbi->s_li_wait_mult;
2702 else
2703 timeout *= 20;
2704 elr->lr_timeout = timeout; 2729 elr->lr_timeout = timeout;
2705 } 2730 }
2706 elr->lr_next_sched = jiffies + elr->lr_timeout; 2731 elr->lr_next_sched = jiffies + elr->lr_timeout;
@@ -2712,7 +2737,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
2712 2737
2713/* 2738/*
2714 * Remove lr_request from the list_request and free the 2739 * Remove lr_request from the list_request and free the
2715 * request tructure. Should be called with li_list_mtx held 2740 * request structure. Should be called with li_list_mtx held
2716 */ 2741 */
2717static void ext4_remove_li_request(struct ext4_li_request *elr) 2742static void ext4_remove_li_request(struct ext4_li_request *elr)
2718{ 2743{
@@ -2730,14 +2755,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr)
2730 2755
2731static void ext4_unregister_li_request(struct super_block *sb) 2756static void ext4_unregister_li_request(struct super_block *sb)
2732{ 2757{
2733 struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request; 2758 mutex_lock(&ext4_li_mtx);
2734 2759 if (!ext4_li_info) {
2735 if (!ext4_li_info) 2760 mutex_unlock(&ext4_li_mtx);
2736 return; 2761 return;
2762 }
2737 2763
2738 mutex_lock(&ext4_li_info->li_list_mtx); 2764 mutex_lock(&ext4_li_info->li_list_mtx);
2739 ext4_remove_li_request(elr); 2765 ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
2740 mutex_unlock(&ext4_li_info->li_list_mtx); 2766 mutex_unlock(&ext4_li_info->li_list_mtx);
2767 mutex_unlock(&ext4_li_mtx);
2741} 2768}
2742 2769
2743static struct task_struct *ext4_lazyinit_task; 2770static struct task_struct *ext4_lazyinit_task;
@@ -2756,17 +2783,10 @@ static int ext4_lazyinit_thread(void *arg)
2756 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; 2783 struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
2757 struct list_head *pos, *n; 2784 struct list_head *pos, *n;
2758 struct ext4_li_request *elr; 2785 struct ext4_li_request *elr;
2759 unsigned long next_wakeup; 2786 unsigned long next_wakeup, cur;
2760 DEFINE_WAIT(wait);
2761 2787
2762 BUG_ON(NULL == eli); 2788 BUG_ON(NULL == eli);
2763 2789
2764 eli->li_timer.data = (unsigned long)current;
2765 eli->li_timer.function = ext4_lazyinode_timeout;
2766
2767 eli->li_task = current;
2768 wake_up(&eli->li_wait_task);
2769
2770cont_thread: 2790cont_thread:
2771 while (true) { 2791 while (true) {
2772 next_wakeup = MAX_JIFFY_OFFSET; 2792 next_wakeup = MAX_JIFFY_OFFSET;
@@ -2797,19 +2817,15 @@ cont_thread:
2797 if (freezing(current)) 2817 if (freezing(current))
2798 refrigerator(); 2818 refrigerator();
2799 2819
2800 if ((time_after_eq(jiffies, next_wakeup)) || 2820 cur = jiffies;
2821 if ((time_after_eq(cur, next_wakeup)) ||
2801 (MAX_JIFFY_OFFSET == next_wakeup)) { 2822 (MAX_JIFFY_OFFSET == next_wakeup)) {
2802 cond_resched(); 2823 cond_resched();
2803 continue; 2824 continue;
2804 } 2825 }
2805 2826
2806 eli->li_timer.expires = next_wakeup; 2827 schedule_timeout_interruptible(next_wakeup - cur);
2807 add_timer(&eli->li_timer); 2828
2808 prepare_to_wait(&eli->li_wait_daemon, &wait,
2809 TASK_INTERRUPTIBLE);
2810 if (time_before(jiffies, next_wakeup))
2811 schedule();
2812 finish_wait(&eli->li_wait_daemon, &wait);
2813 if (kthread_should_stop()) { 2829 if (kthread_should_stop()) {
2814 ext4_clear_request_list(); 2830 ext4_clear_request_list();
2815 goto exit_thread; 2831 goto exit_thread;
@@ -2833,12 +2849,7 @@ exit_thread:
2833 goto cont_thread; 2849 goto cont_thread;
2834 } 2850 }
2835 mutex_unlock(&eli->li_list_mtx); 2851 mutex_unlock(&eli->li_list_mtx);
2836 del_timer_sync(&ext4_li_info->li_timer);
2837 eli->li_task = NULL;
2838 wake_up(&eli->li_wait_task);
2839
2840 kfree(ext4_li_info); 2852 kfree(ext4_li_info);
2841 ext4_lazyinit_task = NULL;
2842 ext4_li_info = NULL; 2853 ext4_li_info = NULL;
2843 mutex_unlock(&ext4_li_mtx); 2854 mutex_unlock(&ext4_li_mtx);
2844 2855
@@ -2866,7 +2877,6 @@ static int ext4_run_lazyinit_thread(void)
2866 if (IS_ERR(ext4_lazyinit_task)) { 2877 if (IS_ERR(ext4_lazyinit_task)) {
2867 int err = PTR_ERR(ext4_lazyinit_task); 2878 int err = PTR_ERR(ext4_lazyinit_task);
2868 ext4_clear_request_list(); 2879 ext4_clear_request_list();
2869 del_timer_sync(&ext4_li_info->li_timer);
2870 kfree(ext4_li_info); 2880 kfree(ext4_li_info);
2871 ext4_li_info = NULL; 2881 ext4_li_info = NULL;
2872 printk(KERN_CRIT "EXT4: error %d creating inode table " 2882 printk(KERN_CRIT "EXT4: error %d creating inode table "
@@ -2875,8 +2885,6 @@ static int ext4_run_lazyinit_thread(void)
2875 return err; 2885 return err;
2876 } 2886 }
2877 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING; 2887 ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
2878
2879 wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
2880 return 0; 2888 return 0;
2881} 2889}
2882 2890
@@ -2911,13 +2919,9 @@ static int ext4_li_info_new(void)
2911 if (!eli) 2919 if (!eli)
2912 return -ENOMEM; 2920 return -ENOMEM;
2913 2921
2914 eli->li_task = NULL;
2915 INIT_LIST_HEAD(&eli->li_request_list); 2922 INIT_LIST_HEAD(&eli->li_request_list);
2916 mutex_init(&eli->li_list_mtx); 2923 mutex_init(&eli->li_list_mtx);
2917 2924
2918 init_waitqueue_head(&eli->li_wait_daemon);
2919 init_waitqueue_head(&eli->li_wait_task);
2920 init_timer(&eli->li_timer);
2921 eli->li_state |= EXT4_LAZYINIT_QUIT; 2925 eli->li_state |= EXT4_LAZYINIT_QUIT;
2922 2926
2923 ext4_li_info = eli; 2927 ext4_li_info = eli;
@@ -2960,20 +2964,19 @@ static int ext4_register_li_request(struct super_block *sb,
2960 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; 2964 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
2961 int ret = 0; 2965 int ret = 0;
2962 2966
2963 if (sbi->s_li_request != NULL) 2967 if (sbi->s_li_request != NULL) {
2968 /*
2969 * Reset timeout so it can be computed again, because
2970 * s_li_wait_mult might have changed.
2971 */
2972 sbi->s_li_request->lr_timeout = 0;
2964 return 0; 2973 return 0;
2974 }
2965 2975
2966 if (first_not_zeroed == ngroups || 2976 if (first_not_zeroed == ngroups ||
2967 (sb->s_flags & MS_RDONLY) || 2977 (sb->s_flags & MS_RDONLY) ||
2968 !test_opt(sb, INIT_INODE_TABLE)) { 2978 !test_opt(sb, INIT_INODE_TABLE))
2969 sbi->s_li_request = NULL;
2970 return 0; 2979 return 0;
2971 }
2972
2973 if (first_not_zeroed == ngroups) {
2974 sbi->s_li_request = NULL;
2975 return 0;
2976 }
2977 2980
2978 elr = ext4_li_request_new(sb, first_not_zeroed); 2981 elr = ext4_li_request_new(sb, first_not_zeroed);
2979 if (!elr) 2982 if (!elr)
@@ -3166,6 +3169,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3166 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 3169 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3167 set_opt(sb, DELALLOC); 3170 set_opt(sb, DELALLOC);
3168 3171
3172 /*
3173 * set default s_li_wait_mult for lazyinit, for the case there is
3174 * no mount option specified.
3175 */
3176 sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
3177
3169 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, 3178 if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
3170 &journal_devnum, &journal_ioprio, NULL, 0)) { 3179 &journal_devnum, &journal_ioprio, NULL, 0)) {
3171 ext4_msg(sb, KERN_WARNING, 3180 ext4_msg(sb, KERN_WARNING,
@@ -3187,6 +3196,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3187 "feature flags set on rev 0 fs, " 3196 "feature flags set on rev 0 fs, "
3188 "running e2fsck is recommended"); 3197 "running e2fsck is recommended");
3189 3198
3199 if (IS_EXT2_SB(sb)) {
3200 if (ext2_feature_set_ok(sb))
3201 ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
3202 "using the ext4 subsystem");
3203 else {
3204 ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
3205 "to feature incompatibilities");
3206 goto failed_mount;
3207 }
3208 }
3209
3210 if (IS_EXT3_SB(sb)) {
3211 if (ext3_feature_set_ok(sb))
3212 ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
3213 "using the ext4 subsystem");
3214 else {
3215 ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
3216 "to feature incompatibilities");
3217 goto failed_mount;
3218 }
3219 }
3220
3190 /* 3221 /*
3191 * Check feature flags regardless of the revision level, since we 3222 * Check feature flags regardless of the revision level, since we
3192 * previously didn't change the revision level when setting the flags, 3223 * previously didn't change the revision level when setting the flags,
@@ -3459,6 +3490,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3459 EXT4_HAS_INCOMPAT_FEATURE(sb, 3490 EXT4_HAS_INCOMPAT_FEATURE(sb,
3460 EXT4_FEATURE_INCOMPAT_RECOVER)); 3491 EXT4_FEATURE_INCOMPAT_RECOVER));
3461 3492
3493 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
3494 !(sb->s_flags & MS_RDONLY))
3495 if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
3496 goto failed_mount3;
3497
3462 /* 3498 /*
3463 * The first inode we look at is the journal inode. Don't try 3499 * The first inode we look at is the journal inode. Don't try
3464 * root first: it may be modified in the journal! 3500 * root first: it may be modified in the journal!
@@ -3474,7 +3510,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3474 goto failed_mount_wq; 3510 goto failed_mount_wq;
3475 } else { 3511 } else {
3476 clear_opt(sb, DATA_FLAGS); 3512 clear_opt(sb, DATA_FLAGS);
3477 set_opt(sb, WRITEBACK_DATA);
3478 sbi->s_journal = NULL; 3513 sbi->s_journal = NULL;
3479 needs_recovery = 0; 3514 needs_recovery = 0;
3480 goto no_journal; 3515 goto no_journal;
@@ -3707,6 +3742,8 @@ failed_mount3:
3707 percpu_counter_destroy(&sbi->s_freeinodes_counter); 3742 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3708 percpu_counter_destroy(&sbi->s_dirs_counter); 3743 percpu_counter_destroy(&sbi->s_dirs_counter);
3709 percpu_counter_destroy(&sbi->s_dirtyblocks_counter); 3744 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3745 if (sbi->s_mmp_tsk)
3746 kthread_stop(sbi->s_mmp_tsk);
3710failed_mount2: 3747failed_mount2:
3711 for (i = 0; i < db_count; i++) 3748 for (i = 0; i < db_count; i++)
3712 brelse(sbi->s_group_desc[i]); 3749 brelse(sbi->s_group_desc[i]);
@@ -4242,7 +4279,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4242 int enable_quota = 0; 4279 int enable_quota = 0;
4243 ext4_group_t g; 4280 ext4_group_t g;
4244 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 4281 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
4245 int err; 4282 int err = 0;
4246#ifdef CONFIG_QUOTA 4283#ifdef CONFIG_QUOTA
4247 int i; 4284 int i;
4248#endif 4285#endif
@@ -4368,6 +4405,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4368 goto restore_opts; 4405 goto restore_opts;
4369 if (!ext4_setup_super(sb, es, 0)) 4406 if (!ext4_setup_super(sb, es, 0))
4370 sb->s_flags &= ~MS_RDONLY; 4407 sb->s_flags &= ~MS_RDONLY;
4408 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
4409 EXT4_FEATURE_INCOMPAT_MMP))
4410 if (ext4_multi_mount_protect(sb,
4411 le64_to_cpu(es->s_mmp_block))) {
4412 err = -EROFS;
4413 goto restore_opts;
4414 }
4371 enable_quota = 1; 4415 enable_quota = 1;
4372 } 4416 }
4373 } 4417 }
@@ -4432,6 +4476,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4432 struct ext4_sb_info *sbi = EXT4_SB(sb); 4476 struct ext4_sb_info *sbi = EXT4_SB(sb);
4433 struct ext4_super_block *es = sbi->s_es; 4477 struct ext4_super_block *es = sbi->s_es;
4434 u64 fsid; 4478 u64 fsid;
4479 s64 bfree;
4435 4480
4436 if (test_opt(sb, MINIX_DF)) { 4481 if (test_opt(sb, MINIX_DF)) {
4437 sbi->s_overhead_last = 0; 4482 sbi->s_overhead_last = 0;
@@ -4475,8 +4520,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
4475 buf->f_type = EXT4_SUPER_MAGIC; 4520 buf->f_type = EXT4_SUPER_MAGIC;
4476 buf->f_bsize = sb->s_blocksize; 4521 buf->f_bsize = sb->s_blocksize;
4477 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 4522 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
4478 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 4523 bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
4479 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 4524 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
4525 /* prevent underflow in case little free space is available */
4526 buf->f_bfree = max_t(s64, bfree, 0);
4480 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 4527 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
4481 if (buf->f_bfree < ext4_r_blocks_count(es)) 4528 if (buf->f_bfree < ext4_r_blocks_count(es))
4482 buf->f_bavail = 0; 4529 buf->f_bavail = 0;
@@ -4652,6 +4699,9 @@ static int ext4_quota_off(struct super_block *sb, int type)
4652 if (test_opt(sb, DELALLOC)) 4699 if (test_opt(sb, DELALLOC))
4653 sync_filesystem(sb); 4700 sync_filesystem(sb);
4654 4701
4702 if (!inode)
4703 goto out;
4704
4655 /* Update modification times of quota files when userspace can 4705 /* Update modification times of quota files when userspace can
4656 * start looking at them */ 4706 * start looking at them */
4657 handle = ext4_journal_start(inode, 1); 4707 handle = ext4_journal_start(inode, 1);
@@ -4772,14 +4822,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
4772} 4822}
4773 4823
4774#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4824#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
4775static struct file_system_type ext2_fs_type = {
4776 .owner = THIS_MODULE,
4777 .name = "ext2",
4778 .mount = ext4_mount,
4779 .kill_sb = kill_block_super,
4780 .fs_flags = FS_REQUIRES_DEV,
4781};
4782
4783static inline void register_as_ext2(void) 4825static inline void register_as_ext2(void)
4784{ 4826{
4785 int err = register_filesystem(&ext2_fs_type); 4827 int err = register_filesystem(&ext2_fs_type);
@@ -4792,10 +4834,22 @@ static inline void unregister_as_ext2(void)
4792{ 4834{
4793 unregister_filesystem(&ext2_fs_type); 4835 unregister_filesystem(&ext2_fs_type);
4794} 4836}
4837
4838static inline int ext2_feature_set_ok(struct super_block *sb)
4839{
4840 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
4841 return 0;
4842 if (sb->s_flags & MS_RDONLY)
4843 return 1;
4844 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
4845 return 0;
4846 return 1;
4847}
4795MODULE_ALIAS("ext2"); 4848MODULE_ALIAS("ext2");
4796#else 4849#else
4797static inline void register_as_ext2(void) { } 4850static inline void register_as_ext2(void) { }
4798static inline void unregister_as_ext2(void) { } 4851static inline void unregister_as_ext2(void) { }
4852static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
4799#endif 4853#endif
4800 4854
4801#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 4855#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
@@ -4811,10 +4865,24 @@ static inline void unregister_as_ext3(void)
4811{ 4865{
4812 unregister_filesystem(&ext3_fs_type); 4866 unregister_filesystem(&ext3_fs_type);
4813} 4867}
4868
4869static inline int ext3_feature_set_ok(struct super_block *sb)
4870{
4871 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
4872 return 0;
4873 if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
4874 return 0;
4875 if (sb->s_flags & MS_RDONLY)
4876 return 1;
4877 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
4878 return 0;
4879 return 1;
4880}
4814MODULE_ALIAS("ext3"); 4881MODULE_ALIAS("ext3");
4815#else 4882#else
4816static inline void register_as_ext3(void) { } 4883static inline void register_as_ext3(void) { }
4817static inline void unregister_as_ext3(void) { } 4884static inline void unregister_as_ext3(void) { }
4885static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
4818#endif 4886#endif
4819 4887
4820static struct file_system_type ext4_fs_type = { 4888static struct file_system_type ext4_fs_type = {
@@ -4898,8 +4966,8 @@ static int __init ext4_init_fs(void)
4898 err = init_inodecache(); 4966 err = init_inodecache();
4899 if (err) 4967 if (err)
4900 goto out1; 4968 goto out1;
4901 register_as_ext2();
4902 register_as_ext3(); 4969 register_as_ext3();
4970 register_as_ext2();
4903 err = register_filesystem(&ext4_fs_type); 4971 err = register_filesystem(&ext4_fs_type);
4904 if (err) 4972 if (err)
4905 goto out; 4973 goto out;
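Note: the new ext2_feature_set_ok()/ext3_feature_set_ok() helpers let the ext4 driver refuse its ext2/ext3 compatibility aliases when the on-disk feature set needs ext4 semantics. A minimal sketch of how a mount path might consult the ext2 helper (the error value and surrounding context are assumptions, not taken from this patch):

	if (!ext2_feature_set_ok(sb)) {
		ext4_msg(sb, KERN_ERR,
			 "couldn't mount as ext2 due to feature incompatibilities");
		return -EINVAL;	/* assumed error path; real code uses its own cleanup */
	}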
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b545ca1c459c..c757adc97250 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -820,8 +820,8 @@ inserted:
820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 820 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 821 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
822 822
823 block = ext4_new_meta_blocks(handle, inode, 823 block = ext4_new_meta_blocks(handle, inode, goal, 0,
824 goal, NULL, &error); 824 NULL, &error);
825 if (error) 825 if (error)
826 goto cleanup; 826 goto cleanup;
827 827
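Note: ext4_new_meta_blocks() gains an allocation-flags argument here (passed as 0 by the xattr code). After this series its prototype should look roughly as follows; treat the exact parameter names as assumptions:

	ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
					  ext4_fsblk_t goal, unsigned int flags,
					  unsigned long *count, int *errp);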
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 3b222dafd15b..be15437c272e 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -326,6 +326,8 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
326 struct fat_slot_info sinfo; 326 struct fat_slot_info sinfo;
327 int err; 327 int err;
328 328
329 dentry_unhash(dentry);
330
329 lock_super(sb); 331 lock_super(sb);
330 /* 332 /*
331 * Check whether the directory is not in use, then check 333 * Check whether the directory is not in use, then check
@@ -457,6 +459,9 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
457 old_inode = old_dentry->d_inode; 459 old_inode = old_dentry->d_inode;
458 new_inode = new_dentry->d_inode; 460 new_inode = new_dentry->d_inode;
459 461
462 if (new_inode && S_ISDIR(new_inode->i_mode))
463 dentry_unhash(new_dentry);
464
460 err = fat_scan(old_dir, old_name, &old_sinfo); 465 err = fat_scan(old_dir, old_name, &old_sinfo);
461 if (err) { 466 if (err) {
462 err = -EIO; 467 err = -EIO;
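Note: the same two-line pattern recurs in the directory operations of the filesystems below: rmdir unhashes its own dentry up front, and rename unhashes the victim dentry only when it is a directory. In skeleton form (function and helper names are placeholders):

	static int example_rmdir(struct inode *dir, struct dentry *dentry)
	{
		dentry_unhash(dentry);
		/* ...filesystem-specific emptiness check and removal... */
		return 0;
	}

	static int example_rename(struct inode *old_dir, struct dentry *old_dentry,
				  struct inode *new_dir, struct dentry *new_dentry)
	{
		if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
			dentry_unhash(new_dentry);
		/* ...filesystem-specific rename of old_dentry to new_dentry... */
		return 0;
	}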
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 20b4ea53fdc4..c61a6789f36c 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -824,6 +824,8 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
824 struct fat_slot_info sinfo; 824 struct fat_slot_info sinfo;
825 int err; 825 int err;
826 826
827 dentry_unhash(dentry);
828
827 lock_super(sb); 829 lock_super(sb);
828 830
829 err = fat_dir_empty(inode); 831 err = fat_dir_empty(inode);
@@ -931,6 +933,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
931 int err, is_dir, update_dotdot, corrupt = 0; 933 int err, is_dir, update_dotdot, corrupt = 0;
932 struct super_block *sb = old_dir->i_sb; 934 struct super_block *sb = old_dir->i_sb;
933 935
936 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
937 dentry_unhash(new_dentry);
938
934 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; 939 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
935 old_inode = old_dentry->d_inode; 940 old_inode = old_dentry->d_inode;
936 new_inode = new_dentry->d_inode; 941 new_inode = new_dentry->d_inode;
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 48a18f184d50..30afdfa7aec7 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -33,8 +33,6 @@ void fscache_enqueue_operation(struct fscache_operation *op)
33 _enter("{OBJ%x OP%x,%u}", 33 _enter("{OBJ%x OP%x,%u}",
34 op->object->debug_id, op->debug_id, atomic_read(&op->usage)); 34 op->object->debug_id, op->debug_id, atomic_read(&op->usage));
35 35
36 fscache_set_op_state(op, "EnQ");
37
38 ASSERT(list_empty(&op->pend_link)); 36 ASSERT(list_empty(&op->pend_link));
39 ASSERT(op->processor != NULL); 37 ASSERT(op->processor != NULL);
40 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); 38 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
@@ -66,8 +64,6 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
66static void fscache_run_op(struct fscache_object *object, 64static void fscache_run_op(struct fscache_object *object,
67 struct fscache_operation *op) 65 struct fscache_operation *op)
68{ 66{
69 fscache_set_op_state(op, "Run");
70
71 object->n_in_progress++; 67 object->n_in_progress++;
72 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) 68 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
73 wake_up_bit(&op->flags, FSCACHE_OP_WAITING); 69 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -88,8 +84,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
88 84
89 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); 85 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
90 86
91 fscache_set_op_state(op, "SubmitX");
92
93 spin_lock(&object->lock); 87 spin_lock(&object->lock);
94 ASSERTCMP(object->n_ops, >=, object->n_in_progress); 88 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
95 ASSERTCMP(object->n_ops, >=, object->n_exclusive); 89 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -194,8 +188,6 @@ int fscache_submit_op(struct fscache_object *object,
194 188
195 ASSERTCMP(atomic_read(&op->usage), >, 0); 189 ASSERTCMP(atomic_read(&op->usage), >, 0);
196 190
197 fscache_set_op_state(op, "Submit");
198
199 spin_lock(&object->lock); 191 spin_lock(&object->lock);
200 ASSERTCMP(object->n_ops, >=, object->n_in_progress); 192 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
201 ASSERTCMP(object->n_ops, >=, object->n_exclusive); 193 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -335,8 +327,6 @@ void fscache_put_operation(struct fscache_operation *op)
335 if (!atomic_dec_and_test(&op->usage)) 327 if (!atomic_dec_and_test(&op->usage))
336 return; 328 return;
337 329
338 fscache_set_op_state(op, "Put");
339
340 _debug("PUT OP"); 330 _debug("PUT OP");
341 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) 331 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
342 BUG(); 332 BUG();
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 41c441c2058d..a2a5d19ece6a 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -155,11 +155,9 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
155 fscache_stat(&fscache_n_attr_changed_calls); 155 fscache_stat(&fscache_n_attr_changed_calls);
156 156
157 if (fscache_object_is_active(object)) { 157 if (fscache_object_is_active(object)) {
158 fscache_set_op_state(op, "CallFS");
159 fscache_stat(&fscache_n_cop_attr_changed); 158 fscache_stat(&fscache_n_cop_attr_changed);
160 ret = object->cache->ops->attr_changed(object); 159 ret = object->cache->ops->attr_changed(object);
161 fscache_stat_d(&fscache_n_cop_attr_changed); 160 fscache_stat_d(&fscache_n_cop_attr_changed);
162 fscache_set_op_state(op, "Done");
163 if (ret < 0) 161 if (ret < 0)
164 fscache_abort_object(object); 162 fscache_abort_object(object);
165 } 163 }
@@ -190,7 +188,6 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
190 188
191 fscache_operation_init(op, fscache_attr_changed_op, NULL); 189 fscache_operation_init(op, fscache_attr_changed_op, NULL);
192 op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); 190 op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
193 fscache_set_op_name(op, "Attr");
194 191
195 spin_lock(&cookie->lock); 192 spin_lock(&cookie->lock);
196 193
@@ -257,7 +254,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
257 op->context = context; 254 op->context = context;
258 op->start_time = jiffies; 255 op->start_time = jiffies;
259 INIT_LIST_HEAD(&op->to_do); 256 INIT_LIST_HEAD(&op->to_do);
260 fscache_set_op_name(&op->op, "Retr");
261 return op; 257 return op;
262} 258}
263 259
@@ -368,7 +364,6 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
368 _leave(" = -ENOMEM"); 364 _leave(" = -ENOMEM");
369 return -ENOMEM; 365 return -ENOMEM;
370 } 366 }
371 fscache_set_op_name(&op->op, "RetrRA1");
372 367
373 spin_lock(&cookie->lock); 368 spin_lock(&cookie->lock);
374 369
@@ -487,7 +482,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
487 op = fscache_alloc_retrieval(mapping, end_io_func, context); 482 op = fscache_alloc_retrieval(mapping, end_io_func, context);
488 if (!op) 483 if (!op)
489 return -ENOMEM; 484 return -ENOMEM;
490 fscache_set_op_name(&op->op, "RetrRAN");
491 485
492 spin_lock(&cookie->lock); 486 spin_lock(&cookie->lock);
493 487
@@ -589,7 +583,6 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
589 op = fscache_alloc_retrieval(page->mapping, NULL, NULL); 583 op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
590 if (!op) 584 if (!op)
591 return -ENOMEM; 585 return -ENOMEM;
592 fscache_set_op_name(&op->op, "RetrAL1");
593 586
594 spin_lock(&cookie->lock); 587 spin_lock(&cookie->lock);
595 588
@@ -662,8 +655,6 @@ static void fscache_write_op(struct fscache_operation *_op)
662 655
663 _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); 656 _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
664 657
665 fscache_set_op_state(&op->op, "GetPage");
666
667 spin_lock(&object->lock); 658 spin_lock(&object->lock);
668 cookie = object->cookie; 659 cookie = object->cookie;
669 660
@@ -698,15 +689,12 @@ static void fscache_write_op(struct fscache_operation *_op)
698 spin_unlock(&cookie->stores_lock); 689 spin_unlock(&cookie->stores_lock);
699 spin_unlock(&object->lock); 690 spin_unlock(&object->lock);
700 691
701 fscache_set_op_state(&op->op, "Store");
702 fscache_stat(&fscache_n_store_pages); 692 fscache_stat(&fscache_n_store_pages);
703 fscache_stat(&fscache_n_cop_write_page); 693 fscache_stat(&fscache_n_cop_write_page);
704 ret = object->cache->ops->write_page(op, page); 694 ret = object->cache->ops->write_page(op, page);
705 fscache_stat_d(&fscache_n_cop_write_page); 695 fscache_stat_d(&fscache_n_cop_write_page);
706 fscache_set_op_state(&op->op, "EndWrite");
707 fscache_end_page_write(object, page); 696 fscache_end_page_write(object, page);
708 if (ret < 0) { 697 if (ret < 0) {
709 fscache_set_op_state(&op->op, "Abort");
710 fscache_abort_object(object); 698 fscache_abort_object(object);
711 } else { 699 } else {
712 fscache_enqueue_operation(&op->op); 700 fscache_enqueue_operation(&op->op);
@@ -778,7 +766,6 @@ int __fscache_write_page(struct fscache_cookie *cookie,
778 fscache_operation_init(&op->op, fscache_write_op, 766 fscache_operation_init(&op->op, fscache_write_op,
779 fscache_release_write_op); 767 fscache_release_write_op);
780 op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); 768 op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
781 fscache_set_op_name(&op->op, "Write1");
782 769
783 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 770 ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
784 if (ret < 0) 771 if (ret < 0)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b32eb29a4e6f..0d0e3faddcfa 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -667,6 +667,8 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
667 if (IS_ERR(req)) 667 if (IS_ERR(req))
668 return PTR_ERR(req); 668 return PTR_ERR(req);
669 669
670 dentry_unhash(entry);
671
670 req->in.h.opcode = FUSE_RMDIR; 672 req->in.h.opcode = FUSE_RMDIR;
671 req->in.h.nodeid = get_node_id(dir); 673 req->in.h.nodeid = get_node_id(dir);
672 req->in.numargs = 1; 674 req->in.numargs = 1;
@@ -691,6 +693,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
691 struct fuse_rename_in inarg; 693 struct fuse_rename_in inarg;
692 struct fuse_conn *fc = get_fuse_conn(olddir); 694 struct fuse_conn *fc = get_fuse_conn(olddir);
693 struct fuse_req *req = fuse_get_req(fc); 695 struct fuse_req *req = fuse_get_req(fc);
696
697 if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode))
698 dentry_unhash(newent);
699
694 if (IS_ERR(req)) 700 if (IS_ERR(req))
695 return PTR_ERR(req); 701 return PTR_ERR(req);
696 702
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a2a6abbccc07..2792a790e50b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1346,11 +1346,14 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1346} 1346}
1347 1347
1348 1348
1349static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 1349static int gfs2_shrink_glock_memory(struct shrinker *shrink,
1350 struct shrink_control *sc)
1350{ 1351{
1351 struct gfs2_glock *gl; 1352 struct gfs2_glock *gl;
1352 int may_demote; 1353 int may_demote;
1353 int nr_skipped = 0; 1354 int nr_skipped = 0;
1355 int nr = sc->nr_to_scan;
1356 gfp_t gfp_mask = sc->gfp_mask;
1354 LIST_HEAD(skipped); 1357 LIST_HEAD(skipped);
1355 1358
1356 if (nr == 0) 1359 if (nr == 0)
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e23d9864c418..42e8d23bc047 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -38,6 +38,7 @@
38 38
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/mm.h>
41#include <linux/spinlock.h> 42#include <linux/spinlock.h>
42#include <linux/completion.h> 43#include <linux/completion.h>
43#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
@@ -77,19 +78,20 @@ static LIST_HEAD(qd_lru_list);
77static atomic_t qd_lru_count = ATOMIC_INIT(0); 78static atomic_t qd_lru_count = ATOMIC_INIT(0);
78static DEFINE_SPINLOCK(qd_lru_lock); 79static DEFINE_SPINLOCK(qd_lru_lock);
79 80
80int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 81int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc)
81{ 82{
82 struct gfs2_quota_data *qd; 83 struct gfs2_quota_data *qd;
83 struct gfs2_sbd *sdp; 84 struct gfs2_sbd *sdp;
85 int nr_to_scan = sc->nr_to_scan;
84 86
85 if (nr == 0) 87 if (nr_to_scan == 0)
86 goto out; 88 goto out;
87 89
88 if (!(gfp_mask & __GFP_FS)) 90 if (!(sc->gfp_mask & __GFP_FS))
89 return -1; 91 return -1;
90 92
91 spin_lock(&qd_lru_lock); 93 spin_lock(&qd_lru_lock);
92 while (nr && !list_empty(&qd_lru_list)) { 94 while (nr_to_scan && !list_empty(&qd_lru_list)) {
93 qd = list_entry(qd_lru_list.next, 95 qd = list_entry(qd_lru_list.next,
94 struct gfs2_quota_data, qd_reclaim); 96 struct gfs2_quota_data, qd_reclaim);
95 sdp = qd->qd_gl->gl_sbd; 97 sdp = qd->qd_gl->gl_sbd;
@@ -110,7 +112,7 @@ int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
110 spin_unlock(&qd_lru_lock); 112 spin_unlock(&qd_lru_lock);
111 kmem_cache_free(gfs2_quotad_cachep, qd); 113 kmem_cache_free(gfs2_quotad_cachep, qd);
112 spin_lock(&qd_lru_lock); 114 spin_lock(&qd_lru_lock);
113 nr--; 115 nr_to_scan--;
114 } 116 }
115 spin_unlock(&qd_lru_lock); 117 spin_unlock(&qd_lru_lock);
116 118
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index e7d236ca48bd..90bf1c302a98 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -12,6 +12,7 @@
12 12
13struct gfs2_inode; 13struct gfs2_inode;
14struct gfs2_sbd; 14struct gfs2_sbd;
15struct shrink_control;
15 16
16#define NO_QUOTA_CHANGE ((u32)-1) 17#define NO_QUOTA_CHANGE ((u32)-1)
17 18
@@ -51,7 +52,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
51 return ret; 52 return ret;
52} 53}
53 54
54extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask); 55extern int gfs2_shrink_qd_memory(struct shrinker *shrink,
56 struct shrink_control *sc);
55extern const struct quotactl_ops gfs2_quotactl_ops; 57extern const struct quotactl_ops gfs2_quotactl_ops;
56 58
57#endif /* __QUOTA_DOT_H__ */ 59#endif /* __QUOTA_DOT_H__ */
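Note: the gfs2 shrinkers above (and the other shrinkers touched by this series) are converted from the old (nr, gfp_mask) callback arguments to a single struct shrink_control. A minimal sketch of the converted callback shape, assuming the shrink_control fields used in this series (nr_to_scan, gfp_mask); the example_* helpers are hypothetical:

	static int example_shrink(struct shrinker *shrink, struct shrink_control *sc)
	{
		int nr_to_scan = sc->nr_to_scan;

		if (nr_to_scan == 0)
			return example_count_objects();	/* query: report cache size */
		if (!(sc->gfp_mask & __GFP_FS))
			return -1;			/* cannot recurse into the FS */
		return example_scan_and_free(nr_to_scan);
	}

	static struct shrinker example_shrinker = {
		.shrink	= example_shrink,
		.seeks	= DEFAULT_SEEKS,
	};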
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index b4d70b13be92..1cb70cdba2c1 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -253,6 +253,9 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
253 struct inode *inode = dentry->d_inode; 253 struct inode *inode = dentry->d_inode;
254 int res; 254 int res;
255 255
256 if (S_ISDIR(inode->i_mode))
257 dentry_unhash(dentry);
258
256 if (S_ISDIR(inode->i_mode) && inode->i_size != 2) 259 if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
257 return -ENOTEMPTY; 260 return -ENOTEMPTY;
258 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); 261 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
@@ -283,6 +286,9 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
283 286
284 /* Unlink destination if it already exists */ 287 /* Unlink destination if it already exists */
285 if (new_dentry->d_inode) { 288 if (new_dentry->d_inode) {
289 if (S_ISDIR(new_dentry->d_inode->i_mode))
290 dentry_unhash(new_dentry);
291
286 res = hfs_remove(new_dir, new_dentry); 292 res = hfs_remove(new_dir, new_dentry);
287 if (res) 293 if (res)
288 return res; 294 return res;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 4df5059c25da..b28835091dd0 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -370,6 +370,8 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
370 struct inode *inode = dentry->d_inode; 370 struct inode *inode = dentry->d_inode;
371 int res; 371 int res;
372 372
373 dentry_unhash(dentry);
374
373 if (inode->i_size != 2) 375 if (inode->i_size != 2)
374 return -ENOTEMPTY; 376 return -ENOTEMPTY;
375 377
@@ -467,10 +469,12 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
467 469
468 /* Unlink destination if it already exists */ 470 /* Unlink destination if it already exists */
469 if (new_dentry->d_inode) { 471 if (new_dentry->d_inode) {
470 if (S_ISDIR(new_dentry->d_inode->i_mode)) 472 if (S_ISDIR(new_dentry->d_inode->i_mode)) {
473 dentry_unhash(new_dentry);
471 res = hfsplus_rmdir(new_dir, new_dentry); 474 res = hfsplus_rmdir(new_dir, new_dentry);
472 else 475 } else {
473 res = hfsplus_unlink(new_dir, new_dentry); 476 res = hfsplus_unlink(new_dir, new_dentry);
477 }
474 if (res) 478 if (res)
475 return res; 479 return res;
476 } 480 }
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2638c834ed28..e6816b9e6903 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -683,6 +683,8 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
683 char *file; 683 char *file;
684 int err; 684 int err;
685 685
686 dentry_unhash(dentry);
687
686 if ((file = dentry_name(dentry)) == NULL) 688 if ((file = dentry_name(dentry)) == NULL)
687 return -ENOMEM; 689 return -ENOMEM;
688 err = do_rmdir(file); 690 err = do_rmdir(file);
@@ -736,6 +738,9 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
736 char *from_name, *to_name; 738 char *from_name, *to_name;
737 int err; 739 int err;
738 740
741 if (to->d_inode && S_ISDIR(to->d_inode->i_mode))
742 dentry_unhash(to);
743
739 if ((from_name = dentry_name(from)) == NULL) 744 if ((from_name = dentry_name(from)) == NULL)
740 return -ENOMEM; 745 return -ENOMEM;
741 if ((to_name = dentry_name(to)) == NULL) { 746 if ((to_name = dentry_name(to)) == NULL) {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 1f05839c27a7..ff0ce21c0867 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -395,7 +395,6 @@ again:
395 395
396 dentry_unhash(dentry); 396 dentry_unhash(dentry);
397 if (!d_unhashed(dentry)) { 397 if (!d_unhashed(dentry)) {
398 dput(dentry);
399 hpfs_unlock(dir->i_sb); 398 hpfs_unlock(dir->i_sb);
400 return -ENOSPC; 399 return -ENOSPC;
401 } 400 }
@@ -403,7 +402,6 @@ again:
403 !S_ISREG(inode->i_mode) || 402 !S_ISREG(inode->i_mode) ||
404 get_write_access(inode)) { 403 get_write_access(inode)) {
405 d_rehash(dentry); 404 d_rehash(dentry);
406 dput(dentry);
407 } else { 405 } else {
408 struct iattr newattrs; 406 struct iattr newattrs;
409 /*printk("HPFS: truncating file before delete.\n");*/ 407 /*printk("HPFS: truncating file before delete.\n");*/
@@ -411,7 +409,6 @@ again:
411 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; 409 newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
412 err = notify_change(dentry, &newattrs); 410 err = notify_change(dentry, &newattrs);
413 put_write_access(inode); 411 put_write_access(inode);
414 dput(dentry);
415 if (!err) 412 if (!err)
416 goto again; 413 goto again;
417 } 414 }
@@ -442,6 +439,8 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
442 int err; 439 int err;
443 int r; 440 int r;
444 441
442 dentry_unhash(dentry);
443
445 hpfs_adjust_length(name, &len); 444 hpfs_adjust_length(name, &len);
446 hpfs_lock(dir->i_sb); 445 hpfs_lock(dir->i_sb);
447 err = -ENOENT; 446 err = -ENOENT;
@@ -535,6 +534,10 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
535 struct buffer_head *bh; 534 struct buffer_head *bh;
536 struct fnode *fnode; 535 struct fnode *fnode;
537 int err; 536 int err;
537
538 if (new_inode && S_ISDIR(new_inode->i_mode))
539 dentry_unhash(new_dentry);
540
538 if ((err = hpfs_chk_name(new_name, &new_len))) return err; 541 if ((err = hpfs_chk_name(new_name, &new_len))) return err;
539 err = 0; 542 err = 0;
540 hpfs_adjust_length(old_name, &old_len); 543 hpfs_adjust_length(old_name, &old_len);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b9eeb1cd03ff..7aafeb8fa300 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
412 pgoff = offset >> PAGE_SHIFT; 412 pgoff = offset >> PAGE_SHIFT;
413 413
414 i_size_write(inode, offset); 414 i_size_write(inode, offset);
415 spin_lock(&mapping->i_mmap_lock); 415 mutex_lock(&mapping->i_mmap_mutex);
416 if (!prio_tree_empty(&mapping->i_mmap)) 416 if (!prio_tree_empty(&mapping->i_mmap))
417 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); 417 hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
418 spin_unlock(&mapping->i_mmap_lock); 418 mutex_unlock(&mapping->i_mmap_mutex);
419 truncate_hugepages(inode, offset); 419 truncate_hugepages(inode, offset);
420 return 0; 420 return 0;
421} 421}
@@ -921,7 +921,8 @@ static int can_do_hugetlb_shm(void)
921 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); 921 return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
922} 922}
923 923
924struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, 924struct file *hugetlb_file_setup(const char *name, size_t size,
925 vm_flags_t acctflag,
925 struct user_struct **user, int creat_flags) 926 struct user_struct **user, int creat_flags)
926{ 927{
927 int error = -ENOMEM; 928 int error = -ENOMEM;
diff --git a/fs/inode.c b/fs/inode.c
index 05f4fa521325..990d284877a1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -326,12 +326,11 @@ void address_space_init_once(struct address_space *mapping)
326 memset(mapping, 0, sizeof(*mapping)); 326 memset(mapping, 0, sizeof(*mapping));
327 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); 327 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
328 spin_lock_init(&mapping->tree_lock); 328 spin_lock_init(&mapping->tree_lock);
329 spin_lock_init(&mapping->i_mmap_lock); 329 mutex_init(&mapping->i_mmap_mutex);
330 INIT_LIST_HEAD(&mapping->private_list); 330 INIT_LIST_HEAD(&mapping->private_list);
331 spin_lock_init(&mapping->private_lock); 331 spin_lock_init(&mapping->private_lock);
332 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); 332 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
333 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); 333 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
334 mutex_init(&mapping->unmap_mutex);
335} 334}
336EXPORT_SYMBOL(address_space_init_once); 335EXPORT_SYMBOL(address_space_init_once);
337 336
@@ -752,8 +751,12 @@ static void prune_icache(int nr_to_scan)
752 * This function is passed the number of inodes to scan, and it returns the 751 * This function is passed the number of inodes to scan, and it returns the
753 * total number of remaining possibly-reclaimable inodes. 752 * total number of remaining possibly-reclaimable inodes.
754 */ 753 */
755static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 754static int shrink_icache_memory(struct shrinker *shrink,
755 struct shrink_control *sc)
756{ 756{
757 int nr = sc->nr_to_scan;
758 gfp_t gfp_mask = sc->gfp_mask;
759
757 if (nr) { 760 if (nr) {
758 /* 761 /*
759 * Nasty deadlock avoidance. We may hold various FS locks, 762 * Nasty deadlock avoidance. We may hold various FS locks,
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 29148a81c783..7f21cf3aaf92 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal,
219 ret = err; 219 ret = err;
220 spin_lock(&journal->j_list_lock); 220 spin_lock(&journal->j_list_lock);
221 J_ASSERT(jinode->i_transaction == commit_transaction); 221 J_ASSERT(jinode->i_transaction == commit_transaction);
222 commit_transaction->t_flushed_data_blocks = 1;
223 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 222 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
224 smp_mb__after_clear_bit(); 223 smp_mb__after_clear_bit();
225 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 224 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -672,12 +671,16 @@ start_journal_io:
672 err = 0; 671 err = 0;
673 } 672 }
674 673
674 write_lock(&journal->j_state_lock);
675 J_ASSERT(commit_transaction->t_state == T_COMMIT);
676 commit_transaction->t_state = T_COMMIT_DFLUSH;
677 write_unlock(&journal->j_state_lock);
675 /* 678 /*
676 * If the journal is not located on the file system device, 679 * If the journal is not located on the file system device,
677 * then we must flush the file system device before we issue 680 * then we must flush the file system device before we issue
678 * the commit record 681 * the commit record
679 */ 682 */
680 if (commit_transaction->t_flushed_data_blocks && 683 if (commit_transaction->t_need_data_flush &&
681 (journal->j_fs_dev != journal->j_dev) && 684 (journal->j_fs_dev != journal->j_dev) &&
682 (journal->j_flags & JBD2_BARRIER)) 685 (journal->j_flags & JBD2_BARRIER))
683 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); 686 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
@@ -754,8 +757,13 @@ wait_for_iobuf:
754 required. */ 757 required. */
755 JBUFFER_TRACE(jh, "file as BJ_Forget"); 758 JBUFFER_TRACE(jh, "file as BJ_Forget");
756 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); 759 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
757 /* Wake up any transactions which were waiting for this 760 /*
758 IO to complete */ 761 * Wake up any transactions which were waiting for this IO to
762 * complete. The barrier must be here so that changes by
763 * jbd2_journal_file_buffer() take effect before wake_up_bit()
764 * does the waitqueue check.
765 */
766 smp_mb();
759 wake_up_bit(&bh->b_state, BH_Unshadow); 767 wake_up_bit(&bh->b_state, BH_Unshadow);
760 JBUFFER_TRACE(jh, "brelse shadowed buffer"); 768 JBUFFER_TRACE(jh, "brelse shadowed buffer");
761 __brelse(bh); 769 __brelse(bh);
@@ -794,6 +802,10 @@ wait_for_iobuf:
794 jbd2_journal_abort(journal, err); 802 jbd2_journal_abort(journal, err);
795 803
796 jbd_debug(3, "JBD: commit phase 5\n"); 804 jbd_debug(3, "JBD: commit phase 5\n");
805 write_lock(&journal->j_state_lock);
806 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
807 commit_transaction->t_state = T_COMMIT_JFLUSH;
808 write_unlock(&journal->j_state_lock);
797 809
798 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 810 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
799 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 811 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -949,7 +961,7 @@ restart_loop:
949 961
950 jbd_debug(3, "JBD: commit phase 7\n"); 962 jbd_debug(3, "JBD: commit phase 7\n");
951 963
952 J_ASSERT(commit_transaction->t_state == T_COMMIT); 964 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
953 965
954 commit_transaction->t_start = jiffies; 966 commit_transaction->t_start = jiffies;
955 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, 967 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e0ec3db1c395..9a7826990304 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -479,9 +479,12 @@ int __jbd2_log_space_left(journal_t *journal)
479int __jbd2_log_start_commit(journal_t *journal, tid_t target) 479int __jbd2_log_start_commit(journal_t *journal, tid_t target)
480{ 480{
481 /* 481 /*
482 * Are we already doing a recent enough commit? 482 * The only transaction we can possibly wait upon is the
483 * currently running transaction (if it exists). Otherwise,
484 * the target tid must be an old one.
483 */ 485 */
484 if (!tid_geq(journal->j_commit_request, target)) { 486 if (journal->j_running_transaction &&
487 journal->j_running_transaction->t_tid == target) {
485 /* 488 /*
486 * We want a new commit: OK, mark the request and wakeup the 489 * We want a new commit: OK, mark the request and wakeup the
487 * commit thread. We do _not_ do the commit ourselves. 490 * commit thread. We do _not_ do the commit ourselves.
@@ -493,7 +496,15 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
493 journal->j_commit_sequence); 496 journal->j_commit_sequence);
494 wake_up(&journal->j_wait_commit); 497 wake_up(&journal->j_wait_commit);
495 return 1; 498 return 1;
496 } 499 } else if (!tid_geq(journal->j_commit_request, target))
500 /* This should never happen, but if it does, preserve
501 the evidence before kjournald goes into a loop and
502 increments j_commit_sequence beyond all recognition. */
503 WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
504 journal->j_commit_request,
505 journal->j_commit_sequence,
506 target, journal->j_running_transaction ?
507 journal->j_running_transaction->t_tid : 0);
497 return 0; 508 return 0;
498} 509}
499 510
@@ -577,6 +588,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
577} 588}
578 589
579/* 590/*
591 * Return 1 if a given transaction has not yet sent barrier request
592 * connected with a transaction commit. If 0 is returned, transaction
593 * may or may not have sent the barrier. Used to avoid sending barrier
594 * twice in common cases.
595 */
596int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
597{
598 int ret = 0;
599 transaction_t *commit_trans;
600
601 if (!(journal->j_flags & JBD2_BARRIER))
602 return 0;
603 read_lock(&journal->j_state_lock);
604 /* Transaction already committed? */
605 if (tid_geq(journal->j_commit_sequence, tid))
606 goto out;
607 commit_trans = journal->j_committing_transaction;
608 if (!commit_trans || commit_trans->t_tid != tid) {
609 ret = 1;
610 goto out;
611 }
612 /*
613 * Transaction is being committed and we already proceeded to
614 * submitting a flush to fs partition?
615 */
616 if (journal->j_fs_dev != journal->j_dev) {
617 if (!commit_trans->t_need_data_flush ||
618 commit_trans->t_state >= T_COMMIT_DFLUSH)
619 goto out;
620 } else {
621 if (commit_trans->t_state >= T_COMMIT_JFLUSH)
622 goto out;
623 }
624 ret = 1;
625out:
626 read_unlock(&journal->j_state_lock);
627 return ret;
628}
629EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
630
631/*
580 * Wait for a specified commit to complete. 632 * Wait for a specified commit to complete.
581 * The caller may not hold the journal lock. 633 * The caller may not hold the journal lock.
582 */ 634 */
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 05fa77a23711..3eec82d32fd4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
82 */ 82 */
83 83
84/* 84/*
 85 * Update transaction's maximum wait time, if debugging is enabled. 85 * Update transaction's maximum wait time, if debugging is enabled.
86 * 86 *
87 * In order for t_max_wait to be reliable, it must be protected by a 87 * In order for t_max_wait to be reliable, it must be protected by a
88 * lock. But doing so will mean that start_this_handle() can not be 88 * lock. But doing so will mean that start_this_handle() can not be
@@ -91,11 +91,10 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
91 * means that maximum wait time reported by the jbd2_run_stats 91 * means that maximum wait time reported by the jbd2_run_stats
92 * tracepoint will always be zero. 92 * tracepoint will always be zero.
93 */ 93 */
94static inline void update_t_max_wait(transaction_t *transaction) 94static inline void update_t_max_wait(transaction_t *transaction,
95 unsigned long ts)
95{ 96{
96#ifdef CONFIG_JBD2_DEBUG 97#ifdef CONFIG_JBD2_DEBUG
97 unsigned long ts = jiffies;
98
99 if (jbd2_journal_enable_debug && 98 if (jbd2_journal_enable_debug &&
100 time_after(transaction->t_start, ts)) { 99 time_after(transaction->t_start, ts)) {
101 ts = jbd2_time_diff(ts, transaction->t_start); 100 ts = jbd2_time_diff(ts, transaction->t_start);
@@ -121,6 +120,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
121 tid_t tid; 120 tid_t tid;
122 int needed, need_to_start; 121 int needed, need_to_start;
123 int nblocks = handle->h_buffer_credits; 122 int nblocks = handle->h_buffer_credits;
123 unsigned long ts = jiffies;
124 124
125 if (nblocks > journal->j_max_transaction_buffers) { 125 if (nblocks > journal->j_max_transaction_buffers) {
126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -271,7 +271,7 @@ repeat:
271 /* OK, account for the buffers that this operation expects to 271 /* OK, account for the buffers that this operation expects to
272 * use and add the handle to the running transaction. 272 * use and add the handle to the running transaction.
273 */ 273 */
274 update_t_max_wait(transaction); 274 update_t_max_wait(transaction, ts);
275 handle->h_transaction = transaction; 275 handle->h_transaction = transaction;
276 atomic_inc(&transaction->t_updates); 276 atomic_inc(&transaction->t_updates);
277 atomic_inc(&transaction->t_handle_count); 277 atomic_inc(&transaction->t_handle_count);
@@ -316,7 +316,8 @@ static handle_t *new_handle(int nblocks)
316 * This function is visible to journal users (like ext3fs), so is not 316 * This function is visible to journal users (like ext3fs), so is not
317 * called with the journal already locked. 317 * called with the journal already locked.
318 * 318 *
319 * Return a pointer to a newly allocated handle, or NULL on failure 319 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
320 * on failure.
320 */ 321 */
321handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) 322handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
322{ 323{
@@ -921,8 +922,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
921 */ 922 */
922 JBUFFER_TRACE(jh, "cancelling revoke"); 923 JBUFFER_TRACE(jh, "cancelling revoke");
923 jbd2_journal_cancel_revoke(handle, jh); 924 jbd2_journal_cancel_revoke(handle, jh);
924 jbd2_journal_put_journal_head(jh);
925out: 925out:
926 jbd2_journal_put_journal_head(jh);
926 return err; 927 return err;
927} 928}
928 929
@@ -2147,6 +2148,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
2147 jinode->i_next_transaction == transaction) 2148 jinode->i_next_transaction == transaction)
2148 goto done; 2149 goto done;
2149 2150
2151 /*
2152 * We only ever set this variable to 1 so the test is safe. Since
2153 * t_need_data_flush is likely to be set, we do the test to save some
2154 * cacheline bouncing
2155 */
2156 if (!transaction->t_need_data_flush)
2157 transaction->t_need_data_flush = 1;
2150 /* On some different transaction's list - should be 2158 /* On some different transaction's list - should be
2151 * the committing one */ 2159 * the committing one */
2152 if (jinode->i_transaction) { 2160 if (jinode->i_transaction) {
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 82faddd1f321..05f73328b28b 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -609,6 +609,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
609 int ret; 609 int ret;
610 uint32_t now = get_seconds(); 610 uint32_t now = get_seconds();
611 611
612 dentry_unhash(dentry);
613
612 for (fd = f->dents ; fd; fd = fd->next) { 614 for (fd = f->dents ; fd; fd = fd->next) {
613 if (fd->ino) 615 if (fd->ino)
614 return -ENOTEMPTY; 616 return -ENOTEMPTY;
@@ -784,6 +786,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
784 uint8_t type; 786 uint8_t type;
785 uint32_t now; 787 uint32_t now;
786 788
789 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
790 dentry_unhash(new_dentry);
791
787 /* The VFS will check for us and prevent trying to rename a 792 /* The VFS will check for us and prevent trying to rename a
788 * file over a directory and vice versa, but if it's a directory, 793 * file over a directory and vice versa, but if it's a directory,
789 * the VFS can't check whether the victim is empty. The filesystem 794 * the VFS can't check whether the victim is empty. The filesystem
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index eaaf2b511e89..865df16a6cf3 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -360,6 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
360 360
361 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); 361 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
362 362
363 dentry_unhash(dentry);
364
363 /* Init inode for quota operations. */ 365 /* Init inode for quota operations. */
364 dquot_initialize(dip); 366 dquot_initialize(dip);
365 dquot_initialize(ip); 367 dquot_initialize(ip);
@@ -1095,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1095 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, 1097 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
1096 new_dentry->d_name.name); 1098 new_dentry->d_name.name);
1097 1099
1100 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1101 dentry_unhash(new_dentry);
1102
1098 dquot_initialize(old_dir); 1103 dquot_initialize(old_dir);
1099 dquot_initialize(new_dir); 1104 dquot_initialize(new_dir);
1100 1105
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9ed89d1663f8..f34c9cde9e94 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -273,6 +273,8 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
273{ 273{
274 struct inode *inode = dentry->d_inode; 274 struct inode *inode = dentry->d_inode;
275 275
276 dentry_unhash(dentry);
277
276 if (!logfs_empty_dir(inode)) 278 if (!logfs_empty_dir(inode))
277 return -ENOTEMPTY; 279 return -ENOTEMPTY;
278 280
@@ -622,6 +624,9 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
622 loff_t pos; 624 loff_t pos;
623 int err; 625 int err;
624 626
627 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
628 dentry_unhash(new_dentry);
629
625 /* 1. locate source dd */ 630 /* 1. locate source dd */
626 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); 631 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
627 if (err) 632 if (err)
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 2f174be06555..8c32ef3ba88e 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -90,7 +90,8 @@ static DEFINE_SPINLOCK(mb_cache_spinlock);
90 * What the mbcache registers as to get shrunk dynamically. 90 * What the mbcache registers as to get shrunk dynamically.
91 */ 91 */
92 92
93static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); 93static int mb_cache_shrink_fn(struct shrinker *shrink,
94 struct shrink_control *sc);
94 95
95static struct shrinker mb_cache_shrinker = { 96static struct shrinker mb_cache_shrinker = {
96 .shrink = mb_cache_shrink_fn, 97 .shrink = mb_cache_shrink_fn,
@@ -156,18 +157,19 @@ forget:
156 * gets low. 157 * gets low.
157 * 158 *
158 * @shrink: (ignored) 159 * @shrink: (ignored)
159 * @nr_to_scan: Number of objects to scan 160 * @sc: shrink_control passed from reclaim
160 * @gfp_mask: (ignored)
161 * 161 *
162 * Returns the number of objects which are present in the cache. 162 * Returns the number of objects which are present in the cache.
163 */ 163 */
164static int 164static int
165mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 165mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
166{ 166{
167 LIST_HEAD(free_list); 167 LIST_HEAD(free_list);
168 struct mb_cache *cache; 168 struct mb_cache *cache;
169 struct mb_cache_entry *entry, *tmp; 169 struct mb_cache_entry *entry, *tmp;
170 int count = 0; 170 int count = 0;
171 int nr_to_scan = sc->nr_to_scan;
172 gfp_t gfp_mask = sc->gfp_mask;
171 173
172 mb_debug("trying to free %d entries", nr_to_scan); 174 mb_debug("trying to free %d entries", nr_to_scan);
173 spin_lock(&mb_cache_spinlock); 175 spin_lock(&mb_cache_spinlock);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 6e6777f1b4b2..f60aed8db9c4 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -168,6 +168,8 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
168 struct inode * inode = dentry->d_inode; 168 struct inode * inode = dentry->d_inode;
169 int err = -ENOTEMPTY; 169 int err = -ENOTEMPTY;
170 170
171 dentry_unhash(dentry);
172
171 if (minix_empty_dir(inode)) { 173 if (minix_empty_dir(inode)) {
172 err = minix_unlink(dir, dentry); 174 err = minix_unlink(dir, dentry);
173 if (!err) { 175 if (!err) {
@@ -190,6 +192,9 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
190 struct minix_dir_entry * old_de; 192 struct minix_dir_entry * old_de;
191 int err = -ENOENT; 193 int err = -ENOENT;
192 194
195 if (new_inode && S_ISDIR(new_inode->i_mode))
196 dentry_unhash(new_dentry);
197
193 old_de = minix_find_entry(old_dentry, &old_page); 198 old_de = minix_find_entry(old_dentry, &old_page);
194 if (!old_de) 199 if (!old_de)
195 goto out; 200 goto out;
diff --git a/fs/mpage.c b/fs/mpage.c
index 0afc809e46e0..fdfae9fa98cd 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -27,6 +27,7 @@
27#include <linux/writeback.h> 27#include <linux/writeback.h>
28#include <linux/backing-dev.h> 28#include <linux/backing-dev.h>
29#include <linux/pagevec.h> 29#include <linux/pagevec.h>
30#include <linux/cleancache.h>
30 31
31/* 32/*
32 * I/O completion handler for multipage BIOs. 33 * I/O completion handler for multipage BIOs.
@@ -271,6 +272,12 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
271 SetPageMappedToDisk(page); 272 SetPageMappedToDisk(page);
272 } 273 }
273 274
275 if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
276 cleancache_get_page(page) == 0) {
277 SetPageUptodate(page);
278 goto confused;
279 }
280
274 /* 281 /*
275 * This page will go to BIO. Do we need to send this BIO off first? 282 * This page will go to BIO. Do we need to send this BIO off first?
276 */ 283 */
diff --git a/fs/namei.c b/fs/namei.c
index 6ff858c049c0..2358b326b221 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -391,79 +391,28 @@ void path_put(struct path *path)
391} 391}
392EXPORT_SYMBOL(path_put); 392EXPORT_SYMBOL(path_put);
393 393
394/** 394/*
395 * nameidata_drop_rcu - drop this nameidata out of rcu-walk
396 * @nd: nameidata pathwalk data to drop
397 * Returns: 0 on success, -ECHILD on failure
398 *
399 * Path walking has 2 modes, rcu-walk and ref-walk (see 395 * Path walking has 2 modes, rcu-walk and ref-walk (see
400 * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt 396 * Documentation/filesystems/path-lookup.txt). In situations when we can't
401 * to drop out of rcu-walk mode and take normal reference counts on dentries 397 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
402 * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take 398 * normal reference counts on dentries and vfsmounts to transition to rcu-walk
403 * refcounts at the last known good point before rcu-walk got stuck, so 399 * mode. Refcounts are grabbed at the last known good point before rcu-walk
404 * ref-walk may continue from there. If this is not successful (eg. a seqcount 400 * got stuck, so ref-walk may continue from there. If this is not successful
405 * has changed), then failure is returned and path walk restarts from the 401 * (eg. a seqcount has changed), then failure is returned and it's up to caller
406 * beginning in ref-walk mode. 402 * to restart the path walk from the beginning in ref-walk mode.
407 *
408 * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
409 * ref-walk. Must be called from rcu-walk context.
410 */ 403 */
411static int nameidata_drop_rcu(struct nameidata *nd)
412{
413 struct fs_struct *fs = current->fs;
414 struct dentry *dentry = nd->path.dentry;
415 int want_root = 0;
416
417 BUG_ON(!(nd->flags & LOOKUP_RCU));
418 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
419 want_root = 1;
420 spin_lock(&fs->lock);
421 if (nd->root.mnt != fs->root.mnt ||
422 nd->root.dentry != fs->root.dentry)
423 goto err_root;
424 }
425 spin_lock(&dentry->d_lock);
426 if (!__d_rcu_to_refcount(dentry, nd->seq))
427 goto err;
428 BUG_ON(nd->inode != dentry->d_inode);
429 spin_unlock(&dentry->d_lock);
430 if (want_root) {
431 path_get(&nd->root);
432 spin_unlock(&fs->lock);
433 }
434 mntget(nd->path.mnt);
435
436 rcu_read_unlock();
437 br_read_unlock(vfsmount_lock);
438 nd->flags &= ~LOOKUP_RCU;
439 return 0;
440err:
441 spin_unlock(&dentry->d_lock);
442err_root:
443 if (want_root)
444 spin_unlock(&fs->lock);
445 return -ECHILD;
446}
447
448/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
449static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
450{
451 if (nd->flags & LOOKUP_RCU)
452 return nameidata_drop_rcu(nd);
453 return 0;
454}
455 404
456/** 405/**
457 * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk 406 * unlazy_walk - try to switch to ref-walk mode.
458 * @nd: nameidata pathwalk data to drop 407 * @nd: nameidata pathwalk data
459 * @dentry: dentry to drop 408 * @dentry: child of nd->path.dentry or NULL
460 * Returns: 0 on success, -ECHILD on failure 409 * Returns: 0 on success, -ECHILD on failure
461 * 410 *
462 * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root, 411 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
463 * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on 412 * for ref-walk mode. @dentry must be a path found by a do_lookup call on
464 * @nd. Must be called from rcu-walk context. 413 * @nd or NULL. Must be called from rcu-walk context.
465 */ 414 */
466static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry) 415static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
467{ 416{
468 struct fs_struct *fs = current->fs; 417 struct fs_struct *fs = current->fs;
469 struct dentry *parent = nd->path.dentry; 418 struct dentry *parent = nd->path.dentry;
@@ -478,18 +427,25 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
478 goto err_root; 427 goto err_root;
479 } 428 }
480 spin_lock(&parent->d_lock); 429 spin_lock(&parent->d_lock);
481 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 430 if (!dentry) {
482 if (!__d_rcu_to_refcount(dentry, nd->seq)) 431 if (!__d_rcu_to_refcount(parent, nd->seq))
483 goto err; 432 goto err_parent;
484 /* 433 BUG_ON(nd->inode != parent->d_inode);
485 * If the sequence check on the child dentry passed, then the child has 434 } else {
486 * not been removed from its parent. This means the parent dentry must 435 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
487 * be valid and able to take a reference at this point. 436 if (!__d_rcu_to_refcount(dentry, nd->seq))
488 */ 437 goto err_child;
489 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent); 438 /*
490 BUG_ON(!parent->d_count); 439 * If the sequence check on the child dentry passed, then
491 parent->d_count++; 440 * the child has not been removed from its parent. This
492 spin_unlock(&dentry->d_lock); 441 * means the parent dentry must be valid and able to take
442 * a reference at this point.
443 */
444 BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
445 BUG_ON(!parent->d_count);
446 parent->d_count++;
447 spin_unlock(&dentry->d_lock);
448 }
493 spin_unlock(&parent->d_lock); 449 spin_unlock(&parent->d_lock);
494 if (want_root) { 450 if (want_root) {
495 path_get(&nd->root); 451 path_get(&nd->root);
@@ -501,8 +457,10 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
501 br_read_unlock(vfsmount_lock); 457 br_read_unlock(vfsmount_lock);
502 nd->flags &= ~LOOKUP_RCU; 458 nd->flags &= ~LOOKUP_RCU;
503 return 0; 459 return 0;
504err: 460
461err_child:
505 spin_unlock(&dentry->d_lock); 462 spin_unlock(&dentry->d_lock);
463err_parent:
506 spin_unlock(&parent->d_lock); 464 spin_unlock(&parent->d_lock);
507err_root: 465err_root:
508 if (want_root) 466 if (want_root)
@@ -510,59 +468,6 @@ err_root:
510 return -ECHILD; 468 return -ECHILD;
511} 469}
512 470
513/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
514static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
515{
516 if (nd->flags & LOOKUP_RCU) {
517 if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
518 nd->flags &= ~LOOKUP_RCU;
519 if (!(nd->flags & LOOKUP_ROOT))
520 nd->root.mnt = NULL;
521 rcu_read_unlock();
522 br_read_unlock(vfsmount_lock);
523 return -ECHILD;
524 }
525 }
526 return 0;
527}
528
529/**
530 * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
531 * @nd: nameidata pathwalk data to drop
532 * Returns: 0 on success, -ECHILD on failure
533 *
534 * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
535 * nd->path should be the final element of the lookup, so nd->root is discarded.
536 * Must be called from rcu-walk context.
537 */
538static int nameidata_drop_rcu_last(struct nameidata *nd)
539{
540 struct dentry *dentry = nd->path.dentry;
541
542 BUG_ON(!(nd->flags & LOOKUP_RCU));
543 nd->flags &= ~LOOKUP_RCU;
544 if (!(nd->flags & LOOKUP_ROOT))
545 nd->root.mnt = NULL;
546 spin_lock(&dentry->d_lock);
547 if (!__d_rcu_to_refcount(dentry, nd->seq))
548 goto err_unlock;
549 BUG_ON(nd->inode != dentry->d_inode);
550 spin_unlock(&dentry->d_lock);
551
552 mntget(nd->path.mnt);
553
554 rcu_read_unlock();
555 br_read_unlock(vfsmount_lock);
556
557 return 0;
558
559err_unlock:
560 spin_unlock(&dentry->d_lock);
561 rcu_read_unlock();
562 br_read_unlock(vfsmount_lock);
563 return -ECHILD;
564}
565
566/** 471/**
567 * release_open_intent - free up open intent resources 472 * release_open_intent - free up open intent resources
568 * @nd: pointer to nameidata 473 * @nd: pointer to nameidata
@@ -606,26 +511,39 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
606 return dentry; 511 return dentry;
607} 512}
608 513
609/* 514/**
610 * handle_reval_path - force revalidation of a dentry 515 * complete_walk - successful completion of path walk
611 * 516 * @nd: pointer nameidata
612 * In some situations the path walking code will trust dentries without
613 * revalidating them. This causes problems for filesystems that depend on
614 * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
615 * (which indicates that it's possible for the dentry to go stale), force
616 * a d_revalidate call before proceeding.
617 * 517 *
618 * Returns 0 if the revalidation was successful. If the revalidation fails, 518 * If we had been in RCU mode, drop out of it and legitimize nd->path.
619 * either return the error returned by d_revalidate or -ESTALE if the 519 * Revalidate the final result, unless we'd already done that during
620 * revalidation it just returned 0. If d_revalidate returns 0, we attempt to 520 * the path walk or the filesystem doesn't ask for it. Return 0 on
621 * invalidate the dentry. It's up to the caller to handle putting references 521 * success, -error on failure. In case of failure caller does not
622 * to the path if necessary. 522 * need to drop nd->path.
623 */ 523 */
624static inline int handle_reval_path(struct nameidata *nd) 524static int complete_walk(struct nameidata *nd)
625{ 525{
626 struct dentry *dentry = nd->path.dentry; 526 struct dentry *dentry = nd->path.dentry;
627 int status; 527 int status;
628 528
529 if (nd->flags & LOOKUP_RCU) {
530 nd->flags &= ~LOOKUP_RCU;
531 if (!(nd->flags & LOOKUP_ROOT))
532 nd->root.mnt = NULL;
533 spin_lock(&dentry->d_lock);
534 if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
535 spin_unlock(&dentry->d_lock);
536 rcu_read_unlock();
537 br_read_unlock(vfsmount_lock);
538 return -ECHILD;
539 }
540 BUG_ON(nd->inode != dentry->d_inode);
541 spin_unlock(&dentry->d_lock);
542 mntget(nd->path.mnt);
543 rcu_read_unlock();
544 br_read_unlock(vfsmount_lock);
545 }
546
629 if (likely(!(nd->flags & LOOKUP_JUMPED))) 547 if (likely(!(nd->flags & LOOKUP_JUMPED)))
630 return 0; 548 return 0;
631 549
@@ -643,6 +561,7 @@ static inline int handle_reval_path(struct nameidata *nd)
643 if (!status) 561 if (!status)
644 status = -ESTALE; 562 status = -ESTALE;
645 563
564 path_put(&nd->path);
646 return status; 565 return status;
647} 566}
648 567
@@ -1241,13 +1160,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1241 if (likely(__follow_mount_rcu(nd, path, inode, false))) 1160 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1242 return 0; 1161 return 0;
1243unlazy: 1162unlazy:
1244 if (dentry) { 1163 if (unlazy_walk(nd, dentry))
1245 if (nameidata_dentry_drop_rcu(nd, dentry)) 1164 return -ECHILD;
1246 return -ECHILD;
1247 } else {
1248 if (nameidata_drop_rcu(nd))
1249 return -ECHILD;
1250 }
1251 } else { 1165 } else {
1252 dentry = __d_lookup(parent, name); 1166 dentry = __d_lookup(parent, name);
1253 } 1167 }
@@ -1303,7 +1217,7 @@ static inline int may_lookup(struct nameidata *nd)
1303 int err = exec_permission(nd->inode, IPERM_FLAG_RCU); 1217 int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
1304 if (err != -ECHILD) 1218 if (err != -ECHILD)
1305 return err; 1219 return err;
1306 if (nameidata_drop_rcu(nd)) 1220 if (unlazy_walk(nd, NULL))
1307 return -ECHILD; 1221 return -ECHILD;
1308 } 1222 }
1309 return exec_permission(nd->inode, 0); 1223 return exec_permission(nd->inode, 0);
@@ -1357,8 +1271,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
1357 return -ENOENT; 1271 return -ENOENT;
1358 } 1272 }
1359 if (unlikely(inode->i_op->follow_link) && follow) { 1273 if (unlikely(inode->i_op->follow_link) && follow) {
1360 if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) 1274 if (nd->flags & LOOKUP_RCU) {
1361 return -ECHILD; 1275 if (unlikely(unlazy_walk(nd, path->dentry))) {
1276 terminate_walk(nd);
1277 return -ECHILD;
1278 }
1279 }
1362 BUG_ON(inode != path->dentry->d_inode); 1280 BUG_ON(inode != path->dentry->d_inode);
1363 return 1; 1281 return 1;
1364 } 1282 }
@@ -1657,18 +1575,8 @@ static int path_lookupat(int dfd, const char *name,
1657 } 1575 }
1658 } 1576 }
1659 1577
1660 if (nd->flags & LOOKUP_RCU) { 1578 if (!err)
1661 /* went all way through without dropping RCU */ 1579 err = complete_walk(nd);
1662 BUG_ON(err);
1663 if (nameidata_drop_rcu_last(nd))
1664 err = -ECHILD;
1665 }
1666
1667 if (!err) {
1668 err = handle_reval_path(nd);
1669 if (err)
1670 path_put(&nd->path);
1671 }
1672 1580
1673 if (!err && nd->flags & LOOKUP_DIRECTORY) { 1581 if (!err && nd->flags & LOOKUP_DIRECTORY) {
1674 if (!nd->inode->i_op->lookup) { 1582 if (!nd->inode->i_op->lookup) {
@@ -2134,13 +2042,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2134 return ERR_PTR(error); 2042 return ERR_PTR(error);
2135 /* fallthrough */ 2043 /* fallthrough */
2136 case LAST_ROOT: 2044 case LAST_ROOT:
2137 if (nd->flags & LOOKUP_RCU) { 2045 error = complete_walk(nd);
2138 if (nameidata_drop_rcu_last(nd))
2139 return ERR_PTR(-ECHILD);
2140 }
2141 error = handle_reval_path(nd);
2142 if (error) 2046 if (error)
2143 goto exit; 2047 return ERR_PTR(error);
2144 audit_inode(pathname, nd->path.dentry); 2048 audit_inode(pathname, nd->path.dentry);
2145 if (open_flag & O_CREAT) { 2049 if (open_flag & O_CREAT) {
2146 error = -EISDIR; 2050 error = -EISDIR;
@@ -2148,10 +2052,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2148 } 2052 }
2149 goto ok; 2053 goto ok;
2150 case LAST_BIND: 2054 case LAST_BIND:
2151 /* can't be RCU mode here */ 2055 error = complete_walk(nd);
2152 error = handle_reval_path(nd);
2153 if (error) 2056 if (error)
2154 goto exit; 2057 return ERR_PTR(error);
2155 audit_inode(pathname, dir); 2058 audit_inode(pathname, dir);
2156 goto ok; 2059 goto ok;
2157 } 2060 }
@@ -2170,10 +2073,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2170 if (error) /* symlink */ 2073 if (error) /* symlink */
2171 return NULL; 2074 return NULL;
2172 /* sayonara */ 2075 /* sayonara */
2173 if (nd->flags & LOOKUP_RCU) { 2076 error = complete_walk(nd);
2174 if (nameidata_drop_rcu_last(nd)) 2077 if (error)
2175 return ERR_PTR(-ECHILD); 2078 return ERR_PTR(-ECHILD);
2176 }
2177 2079
2178 error = -ENOTDIR; 2080 error = -ENOTDIR;
2179 if (nd->flags & LOOKUP_DIRECTORY) { 2081 if (nd->flags & LOOKUP_DIRECTORY) {
@@ -2185,11 +2087,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2185 } 2087 }
2186 2088
2187 /* create side of things */ 2089 /* create side of things */
2188 2090 error = complete_walk(nd);
2189 if (nd->flags & LOOKUP_RCU) { 2091 if (error)
2190 if (nameidata_drop_rcu_last(nd)) 2092 return ERR_PTR(error);
2191 return ERR_PTR(-ECHILD);
2192 }
2193 2093
2194 audit_inode(pathname, dir); 2094 audit_inode(pathname, dir);
2195 error = -EISDIR; 2095 error = -EISDIR;
@@ -2629,10 +2529,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
2629} 2529}
2630 2530
2631/* 2531/*
2632 * We try to drop the dentry early: we should have 2532 * The dentry_unhash() helper will try to drop the dentry early: we
2633 * a usage count of 2 if we're the only user of this 2533 * should have a usage count of 2 if we're the only user of this
2634 * dentry, and if that is true (possibly after pruning 2534 * dentry, and if that is true (possibly after pruning the dcache),
2635 * the dcache), then we drop the dentry now. 2535 * then we drop the dentry now.
2636 * 2536 *
2637 * A low-level filesystem can, if it chooses, legally 2537 * A low-level filesystem can, if it chooses, legally
2638 * do a 2538 * do a
@@ -2645,10 +2545,9 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
2645 */ 2545 */
2646void dentry_unhash(struct dentry *dentry) 2546void dentry_unhash(struct dentry *dentry)
2647{ 2547{
2648 dget(dentry);
2649 shrink_dcache_parent(dentry); 2548 shrink_dcache_parent(dentry);
2650 spin_lock(&dentry->d_lock); 2549 spin_lock(&dentry->d_lock);
2651 if (dentry->d_count == 2) 2550 if (dentry->d_count == 1)
2652 __d_drop(dentry); 2551 __d_drop(dentry);
2653 spin_unlock(&dentry->d_lock); 2552 spin_unlock(&dentry->d_lock);
2654} 2553}
@@ -2664,25 +2563,26 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2664 return -EPERM; 2563 return -EPERM;
2665 2564
2666 mutex_lock(&dentry->d_inode->i_mutex); 2565 mutex_lock(&dentry->d_inode->i_mutex);
2667 dentry_unhash(dentry); 2566
2567 error = -EBUSY;
2668 if (d_mountpoint(dentry)) 2568 if (d_mountpoint(dentry))
2669 error = -EBUSY; 2569 goto out;
2670 else { 2570
2671 error = security_inode_rmdir(dir, dentry); 2571 error = security_inode_rmdir(dir, dentry);
2672 if (!error) { 2572 if (error)
2673 error = dir->i_op->rmdir(dir, dentry); 2573 goto out;
2674 if (!error) { 2574
2675 dentry->d_inode->i_flags |= S_DEAD; 2575 error = dir->i_op->rmdir(dir, dentry);
2676 dont_mount(dentry); 2576 if (error)
2677 } 2577 goto out;
2678 } 2578
2679 } 2579 dentry->d_inode->i_flags |= S_DEAD;
2580 dont_mount(dentry);
2581
2582out:
2680 mutex_unlock(&dentry->d_inode->i_mutex); 2583 mutex_unlock(&dentry->d_inode->i_mutex);
2681 if (!error) { 2584 if (!error)
2682 d_delete(dentry); 2585 d_delete(dentry);
2683 }
2684 dput(dentry);
2685
2686 return error; 2586 return error;
2687} 2587}
2688 2588
@@ -3053,12 +2953,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
3053 * HOWEVER, it relies on the assumption that any object with ->lookup() 2953 * HOWEVER, it relies on the assumption that any object with ->lookup()
3054 * has no more than 1 dentry. If "hybrid" objects will ever appear, 2954 * has no more than 1 dentry. If "hybrid" objects will ever appear,
3055 * we'd better make sure that there's no link(2) for them. 2955 * we'd better make sure that there's no link(2) for them.
3056 * d) some filesystems don't support opened-but-unlinked directories, 2956 * d) conversion from fhandle to dentry may come in the wrong moment - when
3057 * either because of layout or because they are not ready to deal with
3058 * all cases correctly. The latter will be fixed (taking this sort of
3059 * stuff into VFS), but the former is not going away. Solution: the same
3060 * trick as in rmdir().
3061 * e) conversion from fhandle to dentry may come in the wrong moment - when
3062 * we are removing the target. Solution: we will have to grab ->i_mutex 2957 * we are removing the target. Solution: we will have to grab ->i_mutex
3063 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on 2958 * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
3064 * ->i_mutex on parents, which works but leads to some truly excessive 2959 * ->i_mutex on parents, which works but leads to some truly excessive
@@ -3068,7 +2963,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3068 struct inode *new_dir, struct dentry *new_dentry) 2963 struct inode *new_dir, struct dentry *new_dentry)
3069{ 2964{
3070 int error = 0; 2965 int error = 0;
3071 struct inode *target; 2966 struct inode *target = new_dentry->d_inode;
3072 2967
3073 /* 2968 /*
3074 * If we are going to change the parent - check write permissions, 2969 * If we are going to change the parent - check write permissions,
@@ -3084,26 +2979,24 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3084 if (error) 2979 if (error)
3085 return error; 2980 return error;
3086 2981
3087 target = new_dentry->d_inode;
3088 if (target) 2982 if (target)
3089 mutex_lock(&target->i_mutex); 2983 mutex_lock(&target->i_mutex);
3090 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 2984
3091 error = -EBUSY; 2985 error = -EBUSY;
3092 else { 2986 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
3093 if (target) 2987 goto out;
3094 dentry_unhash(new_dentry); 2988
3095 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2989 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
3096 } 2990 if (error)
2991 goto out;
2992
3097 if (target) { 2993 if (target) {
3098 if (!error) { 2994 target->i_flags |= S_DEAD;
3099 target->i_flags |= S_DEAD; 2995 dont_mount(new_dentry);
3100 dont_mount(new_dentry);
3101 }
3102 mutex_unlock(&target->i_mutex);
3103 if (d_unhashed(new_dentry))
3104 d_rehash(new_dentry);
3105 dput(new_dentry);
3106 } 2996 }
2997out:
2998 if (target)
2999 mutex_unlock(&target->i_mutex);
3107 if (!error) 3000 if (!error)
3108 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3001 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
3109 d_move(old_dentry,new_dentry); 3002 d_move(old_dentry,new_dentry);
@@ -3113,7 +3006,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3113static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, 3006static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
3114 struct inode *new_dir, struct dentry *new_dentry) 3007 struct inode *new_dir, struct dentry *new_dentry)
3115{ 3008{
3116 struct inode *target; 3009 struct inode *target = new_dentry->d_inode;
3117 int error; 3010 int error;
3118 3011
3119 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); 3012 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -3121,19 +3014,22 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
3121 return error; 3014 return error;
3122 3015
3123 dget(new_dentry); 3016 dget(new_dentry);
3124 target = new_dentry->d_inode;
3125 if (target) 3017 if (target)
3126 mutex_lock(&target->i_mutex); 3018 mutex_lock(&target->i_mutex);
3019
3020 error = -EBUSY;
3127 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 3021 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
3128 error = -EBUSY; 3022 goto out;
3129 else 3023
3130 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 3024 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
3131 if (!error) { 3025 if (error)
3132 if (target) 3026 goto out;
3133 dont_mount(new_dentry); 3027
3134 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3028 if (target)
3135 d_move(old_dentry, new_dentry); 3029 dont_mount(new_dentry);
3136 } 3030 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
3031 d_move(old_dentry, new_dentry);
3032out:
3137 if (target) 3033 if (target)
3138 mutex_unlock(&target->i_mutex); 3034 mutex_unlock(&target->i_mutex);
3139 dput(new_dentry); 3035 dput(new_dentry);
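The namei.c hunks above fold the RCU-drop and ->d_revalidate steps into complete_walk(), take the dget() out of dentry_unhash() (so its d_count check drops from 2 to 1), and stop calling dentry_unhash() from vfs_rmdir()/vfs_rename_dir(); filesystems that still want the victim dentry dropped early now do it themselves, which is exactly what the per-filesystem hunks below add. A minimal sketch of that caller-side pattern, with hypothetical names:

/*
 * Sketch only: the "examplefs_*" names are hypothetical and not part of
 * this series; the point is that ->rmdir() now calls dentry_unhash()
 * itself instead of relying on the VFS.
 */
static int examplefs_rmdir(struct inode *dir, struct dentry *dentry)
{
	dentry_unhash(dentry);		/* the VFS no longer does this for us */

	if (!examplefs_dir_is_empty(dentry->d_inode))
		return -ENOTEMPTY;

	return examplefs_remove_entry(dir, dentry);
}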
diff --git a/fs/namespace.c b/fs/namespace.c
index d99bcf59e4c2..fe59bd145d21 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1695,7 +1695,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1695 1695
1696static int flags_to_propagation_type(int flags) 1696static int flags_to_propagation_type(int flags)
1697{ 1697{
1698 int type = flags & ~MS_REC; 1698 int type = flags & ~(MS_REC | MS_SILENT);
1699 1699
1700 /* Fail if any non-propagation flags are set */ 1700 /* Fail if any non-propagation flags are set */
1701 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 1701 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
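The one-line namespace.c change makes flags_to_propagation_type() ignore MS_SILENT alongside MS_REC, so a propagation-change mount(2) call that also carries MS_SILENT is no longer rejected for having "non-propagation flags". A hedged userspace illustration (the wrapper and its target path are assumptions):

/* Assumes <sys/mount.h> definitions; before this change the extra
 * MS_SILENT bit made the call fail with EINVAL. */
#include <sys/mount.h>

static int make_shared(const char *dir)
{
	return mount(NULL, dir, NULL, MS_SHARED | MS_SILENT, NULL);
}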
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index f6946bb5cb55..e3e646b06404 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1033,6 +1033,8 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
1033 DPRINTK("ncp_rmdir: removing %s/%s\n", 1033 DPRINTK("ncp_rmdir: removing %s/%s\n",
1034 dentry->d_parent->d_name.name, dentry->d_name.name); 1034 dentry->d_parent->d_name.name, dentry->d_name.name);
1035 1035
1036 dentry_unhash(dentry);
1037
1036 error = -EBUSY; 1038 error = -EBUSY;
1037 if (!d_unhashed(dentry)) 1039 if (!d_unhashed(dentry))
1038 goto out; 1040 goto out;
@@ -1139,6 +1141,9 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
1139 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1141 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1140 new_dentry->d_parent->d_name.name, new_dentry->d_name.name); 1142 new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
1141 1143
1144 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1145 dentry_unhash(new_dentry);
1146
1142 ncp_age_dentry(server, old_dentry); 1147 ncp_age_dentry(server, old_dentry);
1143 ncp_age_dentry(server, new_dentry); 1148 ncp_age_dentry(server, new_dentry);
1144 1149
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 0250e4ce4893..202f370526a7 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -461,7 +461,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
461#endif 461#endif
462 struct ncp_entry_info finfo; 462 struct ncp_entry_info finfo;
463 463
464 data.wdog_pid = NULL; 464 memset(&data, 0, sizeof(data));
465 server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL); 465 server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL);
466 if (!server) 466 if (!server)
467 return -ENOMEM; 467 return -ENOMEM;
@@ -496,7 +496,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
496 struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data; 496 struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data;
497 497
498 data.flags = md->flags; 498 data.flags = md->flags;
499 data.int_flags = 0;
500 data.mounted_uid = md->mounted_uid; 499 data.mounted_uid = md->mounted_uid;
501 data.wdog_pid = find_get_pid(md->wdog_pid); 500 data.wdog_pid = find_get_pid(md->wdog_pid);
502 data.ncp_fd = md->ncp_fd; 501 data.ncp_fd = md->ncp_fd;
@@ -507,7 +506,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
507 data.file_mode = md->file_mode; 506 data.file_mode = md->file_mode;
508 data.dir_mode = md->dir_mode; 507 data.dir_mode = md->dir_mode;
509 data.info_fd = -1; 508 data.info_fd = -1;
510 data.mounted_vol[0] = 0;
511 } 509 }
512 break; 510 break;
513 default: 511 default:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7237672216c8..424e47773a84 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2042,11 +2042,14 @@ static void nfs_access_free_list(struct list_head *head)
2042 } 2042 }
2043} 2043}
2044 2044
2045int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 2045int nfs_access_cache_shrinker(struct shrinker *shrink,
2046 struct shrink_control *sc)
2046{ 2047{
2047 LIST_HEAD(head); 2048 LIST_HEAD(head);
2048 struct nfs_inode *nfsi, *next; 2049 struct nfs_inode *nfsi, *next;
2049 struct nfs_access_entry *cache; 2050 struct nfs_access_entry *cache;
2051 int nr_to_scan = sc->nr_to_scan;
2052 gfp_t gfp_mask = sc->gfp_mask;
2050 2053
2051 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) 2054 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
2052 return (nr_to_scan == 0) ? 0 : -1; 2055 return (nr_to_scan == 0) ? 0 : -1;
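This hunk is the first of several in the section (quota dqcache, XFS buftarg, XFS inode reclaim, XFS quota) converting shrinkers to the new callback signature: instead of receiving nr_to_scan and gfp_mask as separate arguments, the callback gets a struct shrink_control carrying both. A minimal sketch of the converted shape, assuming a hypothetical cache with its own prune/count helpers:

/*
 * Sketch of the new shrinker callback shape; example_prune(),
 * example_count() and the cache behind them are hypothetical.
 */
static int example_cache_shrink(struct shrinker *shrink,
				struct shrink_control *sc)
{
	if (!(sc->gfp_mask & __GFP_FS))
		return -1;			/* cannot reclaim in this context */

	if (sc->nr_to_scan)
		example_prune(sc->nr_to_scan);

	return example_count();		/* objects left in the cache */
}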
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ce118ce885dd..2df6ca7b5898 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -234,7 +234,7 @@ extern int nfs_init_client(struct nfs_client *clp,
234 234
235/* dir.c */ 235/* dir.c */
236extern int nfs_access_cache_shrinker(struct shrinker *shrink, 236extern int nfs_access_cache_shrinker(struct shrinker *shrink,
237 int nr_to_scan, gfp_t gfp_mask); 237 struct shrink_control *sc);
238 238
239/* inode.c */ 239/* inode.c */
240extern struct workqueue_struct *nfsiod_workqueue; 240extern struct workqueue_struct *nfsiod_workqueue;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 546849b3e88f..1102a5fbb744 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -334,6 +334,8 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
334 struct nilfs_transaction_info ti; 334 struct nilfs_transaction_info ti;
335 int err; 335 int err;
336 336
337 dentry_unhash(dentry);
338
337 err = nilfs_transaction_begin(dir->i_sb, &ti, 0); 339 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
338 if (err) 340 if (err)
339 return err; 341 return err;
@@ -369,6 +371,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
369 struct nilfs_transaction_info ti; 371 struct nilfs_transaction_info ti;
370 int err; 372 int err;
371 373
374 if (new_inode && S_ISDIR(new_inode->i_mode))
375 dentry_unhash(new_dentry);
376
372 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); 377 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
373 if (unlikely(err)) 378 if (unlikely(err))
374 return err; 379 return err;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 823bc35334e0..cdbaf5e97308 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/cleancache.h>
44 45
45#define CREATE_TRACE_POINTS 46#define CREATE_TRACE_POINTS
46#include "ocfs2_trace.h" 47#include "ocfs2_trace.h"
@@ -2352,6 +2353,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2352 mlog_errno(status); 2353 mlog_errno(status);
2353 goto bail; 2354 goto bail;
2354 } 2355 }
2356 cleancache_init_shared_fs((char *)&uuid_net_key, sb);
2355 2357
2356bail: 2358bail:
2357 return status; 2359 return status;
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index de4ff29f1e05..c368360c35a1 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -240,8 +240,12 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry)
240 struct inode *inode = dentry->d_inode; 240 struct inode *inode = dentry->d_inode;
241 int ret; 241 int ret;
242 242
243 if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode)) 243
244 return -ENOTEMPTY; 244 if (S_ISDIR(inode->i_mode)) {
245 dentry_unhash(dentry);
246 if (!omfs_dir_is_empty(inode))
247 return -ENOTEMPTY;
248 }
245 249
246 ret = omfs_delete_entry(dentry); 250 ret = omfs_delete_entry(dentry);
247 if (ret) 251 if (ret)
@@ -378,6 +382,9 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
378 int err; 382 int err;
379 383
380 if (new_inode) { 384 if (new_inode) {
385 if (S_ISDIR(new_inode->i_mode))
386 dentry_unhash(new_dentry);
387
381 /* overwriting existing file/dir */ 388 /* overwriting existing file/dir */
382 err = omfs_remove(new_dir, new_dentry); 389 err = omfs_remove(new_dir, new_dentry);
383 if (err) 390 if (err)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index d545e97d99c3..8ed4d3433199 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -255,7 +255,11 @@ ssize_t part_discard_alignment_show(struct device *dev,
255 struct device_attribute *attr, char *buf) 255 struct device_attribute *attr, char *buf)
256{ 256{
257 struct hd_struct *p = dev_to_part(dev); 257 struct hd_struct *p = dev_to_part(dev);
258 return sprintf(buf, "%u\n", p->discard_alignment); 258 struct gendisk *disk = dev_to_disk(dev);
259
260 return sprintf(buf, "%u\n",
261 queue_limit_discard_alignment(&disk->queue->limits,
262 p->start_sect));
259} 263}
260 264
261ssize_t part_stat_show(struct device *dev, 265ssize_t part_stat_show(struct device *dev,
@@ -449,8 +453,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
449 p->start_sect = start; 453 p->start_sect = start;
450 p->alignment_offset = 454 p->alignment_offset =
451 queue_limit_alignment_offset(&disk->queue->limits, start); 455 queue_limit_alignment_offset(&disk->queue->limits, start);
452 p->discard_alignment =
453 queue_limit_discard_alignment(&disk->queue->limits, start);
454 p->nr_sects = len; 456 p->nr_sects = len;
455 p->partno = partno; 457 p->partno = partno;
456 p->policy = get_disk_ro(disk); 458 p->policy = get_disk_ro(disk);
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index df434c5f28fb..c1c729335924 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -20,6 +20,7 @@ proc-y += stat.o
20proc-y += uptime.o 20proc-y += uptime.o
21proc-y += version.o 21proc-y += version.o
22proc-y += softirqs.o 22proc-y += softirqs.o
23proc-y += namespaces.o
23proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 24proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
24proc-$(CONFIG_NET) += proc_net.o 25proc-$(CONFIG_NET) += proc_net.o
25proc-$(CONFIG_PROC_KCORE) += kcore.o 26proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dfa532730e55..dc8bca72b002 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -600,7 +600,7 @@ static int proc_fd_access_allowed(struct inode *inode)
600 return allowed; 600 return allowed;
601} 601}
602 602
603static int proc_setattr(struct dentry *dentry, struct iattr *attr) 603int proc_setattr(struct dentry *dentry, struct iattr *attr)
604{ 604{
605 int error; 605 int error;
606 struct inode *inode = dentry->d_inode; 606 struct inode *inode = dentry->d_inode;
@@ -1736,8 +1736,7 @@ static int task_dumpable(struct task_struct *task)
1736 return 0; 1736 return 0;
1737} 1737}
1738 1738
1739 1739struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1740static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1741{ 1740{
1742 struct inode * inode; 1741 struct inode * inode;
1743 struct proc_inode *ei; 1742 struct proc_inode *ei;
@@ -1779,7 +1778,7 @@ out_unlock:
1779 return NULL; 1778 return NULL;
1780} 1779}
1781 1780
1782static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 1781int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1783{ 1782{
1784 struct inode *inode = dentry->d_inode; 1783 struct inode *inode = dentry->d_inode;
1785 struct task_struct *task; 1784 struct task_struct *task;
@@ -1820,7 +1819,7 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
1820 * made this apply to all per process world readable and executable 1819 * made this apply to all per process world readable and executable
1821 * directories. 1820 * directories.
1822 */ 1821 */
1823static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) 1822int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1824{ 1823{
1825 struct inode *inode; 1824 struct inode *inode;
1826 struct task_struct *task; 1825 struct task_struct *task;
@@ -1862,7 +1861,7 @@ static int pid_delete_dentry(const struct dentry * dentry)
1862 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; 1861 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
1863} 1862}
1864 1863
1865static const struct dentry_operations pid_dentry_operations = 1864const struct dentry_operations pid_dentry_operations =
1866{ 1865{
1867 .d_revalidate = pid_revalidate, 1866 .d_revalidate = pid_revalidate,
1868 .d_delete = pid_delete_dentry, 1867 .d_delete = pid_delete_dentry,
@@ -1870,9 +1869,6 @@ static const struct dentry_operations pid_dentry_operations =
1870 1869
1871/* Lookups */ 1870/* Lookups */
1872 1871
1873typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
1874 struct task_struct *, const void *);
1875
1876/* 1872/*
1877 * Fill a directory entry. 1873 * Fill a directory entry.
1878 * 1874 *
@@ -1885,8 +1881,8 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
1885 * reported by readdir in sync with the inode numbers reported 1881 * reported by readdir in sync with the inode numbers reported
1886 * by stat. 1882 * by stat.
1887 */ 1883 */
1888static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 1884int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
1889 char *name, int len, 1885 const char *name, int len,
1890 instantiate_t instantiate, struct task_struct *task, const void *ptr) 1886 instantiate_t instantiate, struct task_struct *task, const void *ptr)
1891{ 1887{
1892 struct dentry *child, *dir = filp->f_path.dentry; 1888 struct dentry *child, *dir = filp->f_path.dentry;
@@ -2820,6 +2816,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2820 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), 2816 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
2821 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2817 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2822 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 2818 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2819 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
2823#ifdef CONFIG_NET 2820#ifdef CONFIG_NET
2824 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), 2821 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
2825#endif 2822#endif
@@ -3168,6 +3165,7 @@ out_no_task:
3168static const struct pid_entry tid_base_stuff[] = { 3165static const struct pid_entry tid_base_stuff[] = {
3169 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 3166 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
3170 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 3167 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
3168 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
3171 REG("environ", S_IRUSR, proc_environ_operations), 3169 REG("environ", S_IRUSR, proc_environ_operations),
3172 INF("auxv", S_IRUSR, proc_pid_auxv), 3170 INF("auxv", S_IRUSR, proc_pid_auxv),
3173 ONE("status", S_IRUGO, proc_pid_status), 3171 ONE("status", S_IRUGO, proc_pid_status),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index f1281339b6fa..f1637f17c37c 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -674,6 +674,7 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
674 } 674 }
675 return ent; 675 return ent;
676} 676}
677EXPORT_SYMBOL(proc_mkdir_mode);
677 678
678struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, 679struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
679 struct proc_dir_entry *parent) 680 struct proc_dir_entry *parent)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d15aa1b1cc8f..74b48cfa1bb2 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -28,6 +28,7 @@ static void proc_evict_inode(struct inode *inode)
28{ 28{
29 struct proc_dir_entry *de; 29 struct proc_dir_entry *de;
30 struct ctl_table_header *head; 30 struct ctl_table_header *head;
31 const struct proc_ns_operations *ns_ops;
31 32
32 truncate_inode_pages(&inode->i_data, 0); 33 truncate_inode_pages(&inode->i_data, 0);
33 end_writeback(inode); 34 end_writeback(inode);
@@ -44,6 +45,10 @@ static void proc_evict_inode(struct inode *inode)
44 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); 45 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
45 sysctl_head_put(head); 46 sysctl_head_put(head);
46 } 47 }
48 /* Release any associated namespace */
49 ns_ops = PROC_I(inode)->ns_ops;
50 if (ns_ops && ns_ops->put)
51 ns_ops->put(PROC_I(inode)->ns);
47} 52}
48 53
49static struct kmem_cache * proc_inode_cachep; 54static struct kmem_cache * proc_inode_cachep;
@@ -62,6 +67,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
62 ei->pde = NULL; 67 ei->pde = NULL;
63 ei->sysctl = NULL; 68 ei->sysctl = NULL;
64 ei->sysctl_entry = NULL; 69 ei->sysctl_entry = NULL;
70 ei->ns = NULL;
71 ei->ns_ops = NULL;
65 inode = &ei->vfs_inode; 72 inode = &ei->vfs_inode;
66 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 73 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
67 return inode; 74 return inode;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c03e8d3a3a5b..7838e5cfec14 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,6 +61,14 @@ extern const struct file_operations proc_pagemap_operations;
61extern const struct file_operations proc_net_operations; 61extern const struct file_operations proc_net_operations;
62extern const struct inode_operations proc_net_inode_operations; 62extern const struct inode_operations proc_net_inode_operations;
63 63
64struct proc_maps_private {
65 struct pid *pid;
66 struct task_struct *task;
67#ifdef CONFIG_MMU
68 struct vm_area_struct *tail_vma;
69#endif
70};
71
64void proc_init_inodecache(void); 72void proc_init_inodecache(void);
65 73
66static inline struct pid *proc_pid(struct inode *inode) 74static inline struct pid *proc_pid(struct inode *inode)
@@ -119,3 +127,21 @@ struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
119 */ 127 */
120int proc_readdir(struct file *, void *, filldir_t); 128int proc_readdir(struct file *, void *, filldir_t);
121struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); 129struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
130
131
132
133/* Lookups */
134typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
135 struct task_struct *, const void *);
136int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
137 const char *name, int len,
138 instantiate_t instantiate, struct task_struct *task, const void *ptr);
139int pid_revalidate(struct dentry *dentry, struct nameidata *nd);
140struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task);
141extern const struct dentry_operations pid_dentry_operations;
142int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
143int proc_setattr(struct dentry *dentry, struct iattr *attr);
144
145extern const struct inode_operations proc_ns_dir_inode_operations;
146extern const struct file_operations proc_ns_dir_operations;
147
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
new file mode 100644
index 000000000000..781dec5bd682
--- /dev/null
+++ b/fs/proc/namespaces.c
@@ -0,0 +1,198 @@
1#include <linux/proc_fs.h>
2#include <linux/nsproxy.h>
3#include <linux/sched.h>
4#include <linux/ptrace.h>
5#include <linux/fs_struct.h>
6#include <linux/mount.h>
7#include <linux/path.h>
8#include <linux/namei.h>
9#include <linux/file.h>
10#include <linux/utsname.h>
11#include <net/net_namespace.h>
12#include <linux/mnt_namespace.h>
13#include <linux/ipc_namespace.h>
14#include <linux/pid_namespace.h>
15#include "internal.h"
16
17
18static const struct proc_ns_operations *ns_entries[] = {
19#ifdef CONFIG_NET_NS
20 &netns_operations,
21#endif
22#ifdef CONFIG_UTS_NS
23 &utsns_operations,
24#endif
25#ifdef CONFIG_IPC_NS
26 &ipcns_operations,
27#endif
28};
29
30static const struct file_operations ns_file_operations = {
31 .llseek = no_llseek,
32};
33
34static struct dentry *proc_ns_instantiate(struct inode *dir,
35 struct dentry *dentry, struct task_struct *task, const void *ptr)
36{
37 const struct proc_ns_operations *ns_ops = ptr;
38 struct inode *inode;
39 struct proc_inode *ei;
40 struct dentry *error = ERR_PTR(-ENOENT);
41
42 inode = proc_pid_make_inode(dir->i_sb, task);
43 if (!inode)
44 goto out;
45
46 ei = PROC_I(inode);
47 inode->i_mode = S_IFREG|S_IRUSR;
48 inode->i_fop = &ns_file_operations;
49 ei->ns_ops = ns_ops;
50 ei->ns = ns_ops->get(task);
51 if (!ei->ns)
52 goto out_iput;
53
54 dentry->d_op = &pid_dentry_operations;
55 d_add(dentry, inode);
56 /* Close the race of the process dying before we return the dentry */
57 if (pid_revalidate(dentry, NULL))
58 error = NULL;
59out:
60 return error;
61out_iput:
62 iput(inode);
63 goto out;
64}
65
66static int proc_ns_fill_cache(struct file *filp, void *dirent,
67 filldir_t filldir, struct task_struct *task,
68 const struct proc_ns_operations *ops)
69{
70 return proc_fill_cache(filp, dirent, filldir,
71 ops->name, strlen(ops->name),
72 proc_ns_instantiate, task, ops);
73}
74
75static int proc_ns_dir_readdir(struct file *filp, void *dirent,
76 filldir_t filldir)
77{
78 int i;
79 struct dentry *dentry = filp->f_path.dentry;
80 struct inode *inode = dentry->d_inode;
81 struct task_struct *task = get_proc_task(inode);
82 const struct proc_ns_operations **entry, **last;
83 ino_t ino;
84 int ret;
85
86 ret = -ENOENT;
87 if (!task)
88 goto out_no_task;
89
90 ret = -EPERM;
91 if (!ptrace_may_access(task, PTRACE_MODE_READ))
92 goto out;
93
94 ret = 0;
95 i = filp->f_pos;
96 switch (i) {
97 case 0:
98 ino = inode->i_ino;
99 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
100 goto out;
101 i++;
102 filp->f_pos++;
103 /* fall through */
104 case 1:
105 ino = parent_ino(dentry);
106 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
107 goto out;
108 i++;
109 filp->f_pos++;
110 /* fall through */
111 default:
112 i -= 2;
113 if (i >= ARRAY_SIZE(ns_entries)) {
114 ret = 1;
115 goto out;
116 }
117 entry = ns_entries + i;
118 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
119 while (entry <= last) {
120 if (proc_ns_fill_cache(filp, dirent, filldir,
121 task, *entry) < 0)
122 goto out;
123 filp->f_pos++;
124 entry++;
125 }
126 }
127
128 ret = 1;
129out:
130 put_task_struct(task);
131out_no_task:
132 return ret;
133}
134
135const struct file_operations proc_ns_dir_operations = {
136 .read = generic_read_dir,
137 .readdir = proc_ns_dir_readdir,
138};
139
140static struct dentry *proc_ns_dir_lookup(struct inode *dir,
141 struct dentry *dentry, struct nameidata *nd)
142{
143 struct dentry *error;
144 struct task_struct *task = get_proc_task(dir);
145 const struct proc_ns_operations **entry, **last;
146 unsigned int len = dentry->d_name.len;
147
148 error = ERR_PTR(-ENOENT);
149
150 if (!task)
151 goto out_no_task;
152
153 error = ERR_PTR(-EPERM);
154 if (!ptrace_may_access(task, PTRACE_MODE_READ))
155 goto out;
156
157 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
158 for (entry = ns_entries; entry <= last; entry++) {
159 if (strlen((*entry)->name) != len)
160 continue;
161 if (!memcmp(dentry->d_name.name, (*entry)->name, len))
162 break;
163 }
164 error = ERR_PTR(-ENOENT);
165 if (entry > last)
166 goto out;
167
168 error = proc_ns_instantiate(dir, dentry, task, *entry);
169out:
170 put_task_struct(task);
171out_no_task:
172 return error;
173}
174
175const struct inode_operations proc_ns_dir_inode_operations = {
176 .lookup = proc_ns_dir_lookup,
177 .getattr = pid_getattr,
178 .setattr = proc_setattr,
179};
180
181struct file *proc_ns_fget(int fd)
182{
183 struct file *file;
184
185 file = fget(fd);
186 if (!file)
187 return ERR_PTR(-EBADF);
188
189 if (file->f_op != &ns_file_operations)
190 goto out_invalid;
191
192 return file;
193
194out_invalid:
195 fput(file);
196 return ERR_PTR(-EINVAL);
197}
198
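The new namespaces.c gives each task a /proc/<pid>/ns directory with one entry per namespace type compiled in; opening an entry pins the namespace through ns_ops->get(), and proc_ns_fget() lets other kernel code take a user-supplied fd and verify it really refers to one of these files. A hedged sketch of a consumer (the fd argument, the error handling and the use made of ei->ns are assumptions, not part of this patch):

/* Sketch of a proc_ns_fget() consumer; only proc_ns_fget() itself is
 * introduced here, the surrounding code is illustrative. */
static int example_use_ns_fd(int fd)
{
	struct file *file = proc_ns_fget(fd);
	struct proc_inode *ei;

	if (IS_ERR(file))
		return PTR_ERR(file);

	ei = PROC_I(file->f_dentry->d_inode);
	/* ei->ns and ei->ns_ops now identify the namespace behind the fd */

	fput(file);
	return 0;
}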
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 318d8654989b..db15935fa757 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -211,7 +211,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
211{ 211{
212 struct mm_struct *mm = vma->vm_mm; 212 struct mm_struct *mm = vma->vm_mm;
213 struct file *file = vma->vm_file; 213 struct file *file = vma->vm_file;
214 int flags = vma->vm_flags; 214 vm_flags_t flags = vma->vm_flags;
215 unsigned long ino = 0; 215 unsigned long ino = 0;
216 unsigned long long pgoff = 0; 216 unsigned long long pgoff = 0;
217 unsigned long start, end; 217 unsigned long start, end;
@@ -858,7 +858,192 @@ const struct file_operations proc_pagemap_operations = {
858#endif /* CONFIG_PROC_PAGE_MONITOR */ 858#endif /* CONFIG_PROC_PAGE_MONITOR */
859 859
860#ifdef CONFIG_NUMA 860#ifdef CONFIG_NUMA
861extern int show_numa_map(struct seq_file *m, void *v); 861
862struct numa_maps {
863 struct vm_area_struct *vma;
864 unsigned long pages;
865 unsigned long anon;
866 unsigned long active;
867 unsigned long writeback;
868 unsigned long mapcount_max;
869 unsigned long dirty;
870 unsigned long swapcache;
871 unsigned long node[MAX_NUMNODES];
872};
873
874struct numa_maps_private {
875 struct proc_maps_private proc_maps;
876 struct numa_maps md;
877};
878
879static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty)
880{
881 int count = page_mapcount(page);
882
883 md->pages++;
884 if (pte_dirty || PageDirty(page))
885 md->dirty++;
886
887 if (PageSwapCache(page))
888 md->swapcache++;
889
890 if (PageActive(page) || PageUnevictable(page))
891 md->active++;
892
893 if (PageWriteback(page))
894 md->writeback++;
895
896 if (PageAnon(page))
897 md->anon++;
898
899 if (count > md->mapcount_max)
900 md->mapcount_max = count;
901
902 md->node[page_to_nid(page)]++;
903}
904
905static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
906 unsigned long end, struct mm_walk *walk)
907{
908 struct numa_maps *md;
909 spinlock_t *ptl;
910 pte_t *orig_pte;
911 pte_t *pte;
912
913 md = walk->private;
914 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
915 do {
916 struct page *page;
917 int nid;
918
919 if (!pte_present(*pte))
920 continue;
921
922 page = vm_normal_page(md->vma, addr, *pte);
923 if (!page)
924 continue;
925
926 if (PageReserved(page))
927 continue;
928
929 nid = page_to_nid(page);
930 if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
931 continue;
932
933 gather_stats(page, md, pte_dirty(*pte));
934
935 } while (pte++, addr += PAGE_SIZE, addr != end);
936 pte_unmap_unlock(orig_pte, ptl);
937 return 0;
938}
939#ifdef CONFIG_HUGETLB_PAGE
940static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
941 unsigned long addr, unsigned long end, struct mm_walk *walk)
942{
943 struct numa_maps *md;
944 struct page *page;
945
946 if (pte_none(*pte))
947 return 0;
948
949 page = pte_page(*pte);
950 if (!page)
951 return 0;
952
953 md = walk->private;
954 gather_stats(page, md, pte_dirty(*pte));
955 return 0;
956}
957
958#else
959static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
960 unsigned long addr, unsigned long end, struct mm_walk *walk)
961{
962 return 0;
963}
964#endif
965
966/*
967 * Display pages allocated per node and memory policy via /proc.
968 */
969static int show_numa_map(struct seq_file *m, void *v)
970{
971 struct numa_maps_private *numa_priv = m->private;
972 struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
973 struct vm_area_struct *vma = v;
974 struct numa_maps *md = &numa_priv->md;
975 struct file *file = vma->vm_file;
976 struct mm_struct *mm = vma->vm_mm;
977 struct mm_walk walk = {};
978 struct mempolicy *pol;
979 int n;
980 char buffer[50];
981
982 if (!mm)
983 return 0;
984
985 /* Ensure we start with an empty set of numa_maps statistics. */
986 memset(md, 0, sizeof(*md));
987
988 md->vma = vma;
989
990 walk.hugetlb_entry = gather_hugetbl_stats;
991 walk.pmd_entry = gather_pte_stats;
992 walk.private = md;
993 walk.mm = mm;
994
995 pol = get_vma_policy(proc_priv->task, vma, vma->vm_start);
996 mpol_to_str(buffer, sizeof(buffer), pol, 0);
997 mpol_cond_put(pol);
998
999 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1000
1001 if (file) {
1002 seq_printf(m, " file=");
1003 seq_path(m, &file->f_path, "\n\t= ");
1004 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1005 seq_printf(m, " heap");
1006 } else if (vma->vm_start <= mm->start_stack &&
1007 vma->vm_end >= mm->start_stack) {
1008 seq_printf(m, " stack");
1009 }
1010
1011 walk_page_range(vma->vm_start, vma->vm_end, &walk);
1012
1013 if (!md->pages)
1014 goto out;
1015
1016 if (md->anon)
1017 seq_printf(m, " anon=%lu", md->anon);
1018
1019 if (md->dirty)
1020 seq_printf(m, " dirty=%lu", md->dirty);
1021
1022 if (md->pages != md->anon && md->pages != md->dirty)
1023 seq_printf(m, " mapped=%lu", md->pages);
1024
1025 if (md->mapcount_max > 1)
1026 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1027
1028 if (md->swapcache)
1029 seq_printf(m, " swapcache=%lu", md->swapcache);
1030
1031 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1032 seq_printf(m, " active=%lu", md->active);
1033
1034 if (md->writeback)
1035 seq_printf(m, " writeback=%lu", md->writeback);
1036
1037 for_each_node_state(n, N_HIGH_MEMORY)
1038 if (md->node[n])
1039 seq_printf(m, " N%d=%lu", n, md->node[n]);
1040out:
1041 seq_putc(m, '\n');
1042
1043 if (m->count < m->size)
1044 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
1045 return 0;
1046}
862 1047
863static const struct seq_operations proc_pid_numa_maps_op = { 1048static const struct seq_operations proc_pid_numa_maps_op = {
864 .start = m_start, 1049 .start = m_start,
@@ -869,7 +1054,20 @@ static const struct seq_operations proc_pid_numa_maps_op = {
869 1054
870static int numa_maps_open(struct inode *inode, struct file *file) 1055static int numa_maps_open(struct inode *inode, struct file *file)
871{ 1056{
872 return do_maps_open(inode, file, &proc_pid_numa_maps_op); 1057 struct numa_maps_private *priv;
1058 int ret = -ENOMEM;
1059 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
1060 if (priv) {
1061 priv->proc_maps.pid = proc_pid(inode);
1062 ret = seq_open(file, &proc_pid_numa_maps_op);
1063 if (!ret) {
1064 struct seq_file *m = file->private_data;
1065 m->private = priv;
1066 } else {
1067 kfree(priv);
1068 }
1069 }
1070 return ret;
873} 1071}
874 1072
875const struct file_operations proc_numa_maps_operations = { 1073const struct file_operations proc_numa_maps_operations = {
@@ -878,4 +1076,4 @@ const struct file_operations proc_numa_maps_operations = {
878 .llseek = seq_lseek, 1076 .llseek = seq_lseek,
879 .release = seq_release_private, 1077 .release = seq_release_private,
880}; 1078};
881#endif 1079#endif /* CONFIG_NUMA */
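With show_numa_map() now implemented here on top of walk_page_range(), each VMA line in /proc/<pid>/numa_maps starts with the mapping's start address and its mempolicy string, then appends only the counters that are non-zero. An illustrative line (path and values invented, matching the seq_printf format above):

7f53a4a00000 default file=/lib/x86_64-linux-gnu/libc-2.13.so mapped=114 mapmax=41 active=98 N0=60 N1=54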
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index d3c032f5fa0a..5b572c89e6c4 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -691,8 +691,11 @@ static void prune_dqcache(int count)
691 * This is called from kswapd when we think we need some 691 * This is called from kswapd when we think we need some
692 * more memory 692 * more memory
693 */ 693 */
694static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) 694static int shrink_dqcache_memory(struct shrinker *shrink,
695 struct shrink_control *sc)
695{ 696{
697 int nr = sc->nr_to_scan;
698
696 if (nr) { 699 if (nr) {
697 spin_lock(&dq_list_lock); 700 spin_lock(&dq_list_lock);
698 prune_dqcache(nr); 701 prune_dqcache(nr);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 118662690cdf..76c8164d5651 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -831,6 +831,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
831 INITIALIZE_PATH(path); 831 INITIALIZE_PATH(path);
832 struct reiserfs_dir_entry de; 832 struct reiserfs_dir_entry de;
833 833
834 dentry_unhash(dentry);
835
834 /* we will be doing 2 balancings and update 2 stat data, we change quotas 836 /* we will be doing 2 balancings and update 2 stat data, we change quotas
835 * of the owner of the directory and of the owner of the parent directory. 837 * of the owner of the directory and of the owner of the parent directory.
836 * The quota structure is possibly deleted only on last iput => outside 838 * The quota structure is possibly deleted only on last iput => outside
@@ -1225,6 +1227,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1225 unsigned long savelink = 1; 1227 unsigned long savelink = 1;
1226 struct timespec ctime; 1228 struct timespec ctime;
1227 1229
1230 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1231 dentry_unhash(new_dentry);
1232
1228 /* three balancings: (1) old name removal, (2) new name insertion 1233 /* three balancings: (1) old name removal, (2) new name insertion
1229 and (3) maybe "save" link insertion 1234 and (3) maybe "save" link insertion
1230 stat data updates: (1) old directory, 1235 stat data updates: (1) old directory,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 47d2a4498b03..50f1abccd1cd 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -105,7 +105,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
105 mutex_unlock(&dentry->d_inode->i_mutex); 105 mutex_unlock(&dentry->d_inode->i_mutex);
106 if (!error) 106 if (!error)
107 d_delete(dentry); 107 d_delete(dentry);
108 dput(dentry);
109 108
110 return error; 109 return error;
111} 110}
diff --git a/fs/splice.c b/fs/splice.c
index 50a5d978da16..aa866d309695 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -162,6 +162,14 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = {
162 .get = generic_pipe_buf_get, 162 .get = generic_pipe_buf_get,
163}; 163};
164 164
165static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
166{
167 smp_mb();
168 if (waitqueue_active(&pipe->wait))
169 wake_up_interruptible(&pipe->wait);
170 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
171}
172
165/** 173/**
166 * splice_to_pipe - fill passed data into a pipe 174 * splice_to_pipe - fill passed data into a pipe
167 * @pipe: pipe to fill 175 * @pipe: pipe to fill
@@ -247,12 +255,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
247 255
248 pipe_unlock(pipe); 256 pipe_unlock(pipe);
249 257
250 if (do_wakeup) { 258 if (do_wakeup)
251 smp_mb(); 259 wakeup_pipe_readers(pipe);
252 if (waitqueue_active(&pipe->wait))
253 wake_up_interruptible(&pipe->wait);
254 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
255 }
256 260
257 while (page_nr < spd_pages) 261 while (page_nr < spd_pages)
258 spd->spd_release(spd, page_nr++); 262 spd->spd_release(spd, page_nr++);
@@ -1892,12 +1896,9 @@ retry:
1892 /* 1896 /*
1893 * If we put data in the output pipe, wakeup any potential readers. 1897 * If we put data in the output pipe, wakeup any potential readers.
1894 */ 1898 */
1895 if (ret > 0) { 1899 if (ret > 0)
1896 smp_mb(); 1900 wakeup_pipe_readers(opipe);
1897 if (waitqueue_active(&opipe->wait)) 1901
1898 wake_up_interruptible(&opipe->wait);
1899 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1900 }
1901 if (input_wakeup) 1902 if (input_wakeup)
1902 wakeup_pipe_writers(ipipe); 1903 wakeup_pipe_writers(ipipe);
1903 1904
@@ -1976,12 +1977,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1976 /* 1977 /*
1977 * If we put data in the output pipe, wakeup any potential readers. 1978 * If we put data in the output pipe, wakeup any potential readers.
1978 */ 1979 */
1979 if (ret > 0) { 1980 if (ret > 0)
1980 smp_mb(); 1981 wakeup_pipe_readers(opipe);
1981 if (waitqueue_active(&opipe->wait))
1982 wake_up_interruptible(&opipe->wait);
1983 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1984 }
1985 1982
1986 return ret; 1983 return ret;
1987} 1984}
diff --git a/fs/super.c b/fs/super.c
index c04f7e0b7ed2..c75593953c52 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -31,6 +31,7 @@
31#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
33#include <linux/rculist_bl.h> 33#include <linux/rculist_bl.h>
34#include <linux/cleancache.h>
34#include "internal.h" 35#include "internal.h"
35 36
36 37
@@ -112,6 +113,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
112 s->s_maxbytes = MAX_NON_LFS; 113 s->s_maxbytes = MAX_NON_LFS;
113 s->s_op = &default_op; 114 s->s_op = &default_op;
114 s->s_time_gran = 1000000000; 115 s->s_time_gran = 1000000000;
116 s->cleancache_poolid = -1;
115 } 117 }
116out: 118out:
117 return s; 119 return s;
@@ -177,6 +179,7 @@ void deactivate_locked_super(struct super_block *s)
177{ 179{
178 struct file_system_type *fs = s->s_type; 180 struct file_system_type *fs = s->s_type;
179 if (atomic_dec_and_test(&s->s_active)) { 181 if (atomic_dec_and_test(&s->s_active)) {
182 cleancache_flush_fs(s);
180 fs->kill_sb(s); 183 fs->kill_sb(s);
181 /* 184 /*
182 * We need to call rcu_barrier so all the delayed rcu free 185 * We need to call rcu_barrier so all the delayed rcu free
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e474fbcf8bde..e2cc6756f3b1 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -196,6 +196,8 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
196 struct inode *inode = dentry->d_inode; 196 struct inode *inode = dentry->d_inode;
197 int err = -ENOTEMPTY; 197 int err = -ENOTEMPTY;
198 198
199 dentry_unhash(dentry);
200
199 if (sysv_empty_dir(inode)) { 201 if (sysv_empty_dir(inode)) {
200 err = sysv_unlink(dir, dentry); 202 err = sysv_unlink(dir, dentry);
201 if (!err) { 203 if (!err) {
@@ -222,6 +224,9 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
222 struct sysv_dir_entry * old_de; 224 struct sysv_dir_entry * old_de;
223 int err = -ENOENT; 225 int err = -ENOENT;
224 226
227 if (new_inode && S_ISDIR(new_inode->i_mode))
228 dentry_unhash(new_dentry);
229
225 old_de = sysv_find_entry(old_dentry, &old_page); 230 old_de = sysv_find_entry(old_dentry, &old_page);
226 if (!old_de) 231 if (!old_de)
227 goto out; 232 goto out;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ef5abd38f0bf..c2b80943560d 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -656,6 +656,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
656 struct ubifs_inode *dir_ui = ubifs_inode(dir); 656 struct ubifs_inode *dir_ui = ubifs_inode(dir);
657 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; 657 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
658 658
659 dentry_unhash(dentry);
660
659 /* 661 /*
660 * Budget request settings: deletion direntry, deletion inode and 662 * Budget request settings: deletion direntry, deletion inode and
661 * changing the parent inode. If budgeting fails, go ahead anyway 663 * changing the parent inode. If budgeting fails, go ahead anyway
@@ -976,6 +978,9 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
976 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; 978 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
977 struct timespec time; 979 struct timespec time;
978 980
981 if (new_inode && S_ISDIR(new_inode->i_mode))
982 dentry_unhash(new_dentry);
983
979 /* 984 /*
980 * Budget request settings: deletion direntry, new direntry, removing 985 * Budget request settings: deletion direntry, new direntry, removing
981 * the old inode, and changing old and new parent directory inodes. 986 * the old inode, and changing old and new parent directory inodes.
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f1dce848ef96..4d76594c2a8f 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -783,6 +783,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
783 struct fileIdentDesc *fi, cfi; 783 struct fileIdentDesc *fi, cfi;
784 struct kernel_lb_addr tloc; 784 struct kernel_lb_addr tloc;
785 785
786 dentry_unhash(dentry);
787
786 retval = -ENOENT; 788 retval = -ENOENT;
787 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 789 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
788 if (!fi) 790 if (!fi)
@@ -1081,6 +1083,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1081 struct kernel_lb_addr tloc; 1083 struct kernel_lb_addr tloc;
1082 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1084 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1083 1085
1086 if (new_inode && S_ISDIR(new_inode->i_mode))
1087 dentry_unhash(new_dentry);
1088
1084 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1089 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1085 if (ofi) { 1090 if (ofi) {
1086 if (ofibh.sbh != ofibh.ebh) 1091 if (ofibh.sbh != ofibh.ebh)
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 29309e25417f..953ebdfc5bf7 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -258,6 +258,8 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
258 struct inode * inode = dentry->d_inode; 258 struct inode * inode = dentry->d_inode;
259 int err= -ENOTEMPTY; 259 int err= -ENOTEMPTY;
260 260
261 dentry_unhash(dentry);
262
261 lock_ufs(dir->i_sb); 263 lock_ufs(dir->i_sb);
262 if (ufs_empty_dir (inode)) { 264 if (ufs_empty_dir (inode)) {
263 err = ufs_unlink(dir, dentry); 265 err = ufs_unlink(dir, dentry);
@@ -282,6 +284,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
282 struct ufs_dir_entry *old_de; 284 struct ufs_dir_entry *old_de;
283 int err = -ENOENT; 285 int err = -ENOENT;
284 286
287 if (new_inode && S_ISDIR(new_inode->i_mode))
288 dentry_unhash(new_dentry);
289
285 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 290 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
286 if (!old_de) 291 if (!old_de)
287 goto out; 292 goto out;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 52b2b5da566e..5e68099db2a5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1422,12 +1422,12 @@ restart:
1422int 1422int
1423xfs_buftarg_shrink( 1423xfs_buftarg_shrink(
1424 struct shrinker *shrink, 1424 struct shrinker *shrink,
1425 int nr_to_scan, 1425 struct shrink_control *sc)
1426 gfp_t mask)
1427{ 1426{
1428 struct xfs_buftarg *btp = container_of(shrink, 1427 struct xfs_buftarg *btp = container_of(shrink,
1429 struct xfs_buftarg, bt_shrinker); 1428 struct xfs_buftarg, bt_shrinker);
1430 struct xfs_buf *bp; 1429 struct xfs_buf *bp;
1430 int nr_to_scan = sc->nr_to_scan;
1431 LIST_HEAD(dispose); 1431 LIST_HEAD(dispose);
1432 1432
1433 if (!nr_to_scan) 1433 if (!nr_to_scan)
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index d61611c88012..244e797dae32 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -191,3 +191,32 @@ xfs_ioc_trim(
191 return -XFS_ERROR(EFAULT); 191 return -XFS_ERROR(EFAULT);
192 return 0; 192 return 0;
193} 193}
194
195int
196xfs_discard_extents(
197 struct xfs_mount *mp,
198 struct list_head *list)
199{
200 struct xfs_busy_extent *busyp;
201 int error = 0;
202
203 list_for_each_entry(busyp, list, list) {
204 trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
205 busyp->length);
206
207 error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
208 XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
209 XFS_FSB_TO_BB(mp, busyp->length),
210 GFP_NOFS, 0);
211 if (error && error != EOPNOTSUPP) {
212 xfs_info(mp,
213 "discard failed for extent [0x%llu,%u], error %d",
214 (unsigned long long)busyp->bno,
215 busyp->length,
216 error);
217 return error;
218 }
219 }
220
221 return 0;
222}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
index e82b6dd3e127..344879aea646 100644
--- a/fs/xfs/linux-2.6/xfs_discard.h
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -2,7 +2,9 @@
2#define XFS_DISCARD_H 1 2#define XFS_DISCARD_H 1
3 3
4struct fstrim_range; 4struct fstrim_range;
5struct list_head;
5 6
6extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); 7extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
8extern int xfs_discard_extents(struct xfs_mount *, struct list_head *);
7 9
8#endif /* XFS_DISCARD_H */ 10#endif /* XFS_DISCARD_H */
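xfs_discard_extents() walks a list of xfs_busy_extent entries and issues blkdev_issue_discard() for each one on the data device; it exists so that the new "discard" mount option (added in the xfs_super.c hunk below) has something to call once freed extents are known to be stable on disk. A hedged sketch of the expected call site (the busy_extents list and the completion context around it are assumptions; only xfs_discard_extents() is added here):

/* Sketch of a caller; the list and the surrounding context are assumed. */
if ((mp->m_flags & XFS_MOUNT_DISCARD) && !list_empty(&busy_extents))
	xfs_discard_extents(mp, &busy_extents);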
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index b0aa59e51fd0..98b9c91fcdf1 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -110,8 +110,10 @@ mempool_t *xfs_ioend_pool;
110#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ 110#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
111#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ 111#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
112#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ 112#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
113#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */ 113#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
114#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */ 114#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
115#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
116#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
115 117
116/* 118/*
117 * Table driven mount option parser. 119 * Table driven mount option parser.
@@ -355,6 +357,10 @@ xfs_parseargs(
355 mp->m_flags |= XFS_MOUNT_DELAYLOG; 357 mp->m_flags |= XFS_MOUNT_DELAYLOG;
356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 358 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 359 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
360 } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
361 mp->m_flags |= XFS_MOUNT_DISCARD;
362 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
363 mp->m_flags &= ~XFS_MOUNT_DISCARD;
358 } else if (!strcmp(this_char, "ihashsize")) { 364 } else if (!strcmp(this_char, "ihashsize")) {
359 xfs_warn(mp, 365 xfs_warn(mp,
360 "ihashsize no longer used, option is deprecated."); 366 "ihashsize no longer used, option is deprecated.");
@@ -388,6 +394,13 @@ xfs_parseargs(
388 return EINVAL; 394 return EINVAL;
389 } 395 }
390 396
397 if ((mp->m_flags & XFS_MOUNT_DISCARD) &&
398 !(mp->m_flags & XFS_MOUNT_DELAYLOG)) {
399 xfs_warn(mp,
400 "the discard option is incompatible with the nodelaylog option");
401 return EINVAL;
402 }
403
391#ifndef CONFIG_XFS_QUOTA 404#ifndef CONFIG_XFS_QUOTA
392 if (XFS_IS_QUOTA_RUNNING(mp)) { 405 if (XFS_IS_QUOTA_RUNNING(mp)) {
393 xfs_warn(mp, "quota support not available in this kernel."); 406 xfs_warn(mp, "quota support not available in this kernel.");
@@ -488,6 +501,7 @@ xfs_showargs(
488 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 501 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
489 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 502 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
490 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG }, 503 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
504 { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
491 { 0, NULL } 505 { 0, NULL }
492 }; 506 };
493 static struct proc_xfs_info xfs_info_unset[] = { 507 static struct proc_xfs_info xfs_info_unset[] = {
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index cb1bb2080e44..8ecad5ff9f9b 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -1032,13 +1032,14 @@ xfs_reclaim_inodes(
 static int
 xfs_reclaim_inode_shrink(
 	struct shrinker *shrink,
-	int nr_to_scan,
-	gfp_t gfp_mask)
+	struct shrink_control *sc)
 {
 	struct xfs_mount *mp;
 	struct xfs_perag *pag;
 	xfs_agnumber_t ag;
 	int reclaimable;
+	int nr_to_scan = sc->nr_to_scan;
+	gfp_t gfp_mask = sc->gfp_mask;
 
 	mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
 	if (nr_to_scan) {
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 69228aa8605a..b94dace4e785 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -60,7 +60,7 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
 
 STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t);
+STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
 
 static struct shrinker xfs_qm_shaker = {
 	.shrink = xfs_qm_shake,
@@ -2009,10 +2009,10 @@ xfs_qm_shake_freelist(
 STATIC int
 xfs_qm_shake(
 	struct shrinker *shrink,
-	int nr_to_scan,
-	gfp_t gfp_mask)
+	struct shrink_control *sc)
 {
 	int ndqused, nfree, n;
+	gfp_t gfp_mask = sc->gfp_mask;
 
 	if (!kmem_shake_allow(gfp_mask))
 		return 0;
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index da0a561ffba2..6530769a999b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,6 +187,9 @@ struct xfs_busy_extent {
 	xfs_agnumber_t agno;
 	xfs_agblock_t bno;
 	xfs_extlen_t length;
+	unsigned int flags;
+#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
+#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */
 };
 
 /*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index acdced86413c..95862bbff56b 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2469,7 +2469,7 @@ xfs_free_extent(
 
 	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
 	if (!error)
-		xfs_alloc_busy_insert(tp, args.agno, args.agbno, len);
+		xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0);
 error0:
 	xfs_perag_put(args.pag);
 	return error;
@@ -2480,7 +2480,8 @@ xfs_alloc_busy_insert(
 	struct xfs_trans *tp,
 	xfs_agnumber_t agno,
 	xfs_agblock_t bno,
-	xfs_extlen_t len)
+	xfs_extlen_t len,
+	unsigned int flags)
 {
 	struct xfs_busy_extent *new;
 	struct xfs_busy_extent *busyp;
@@ -2504,6 +2505,7 @@ xfs_alloc_busy_insert(
 	new->bno = bno;
 	new->length = len;
 	INIT_LIST_HEAD(&new->list);
+	new->flags = flags;
 
 	/* trace before insert to be able to see failed inserts */
 	trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
@@ -2609,6 +2611,18 @@ xfs_alloc_busy_update_extent(
 	xfs_agblock_t bend = bbno + busyp->length;
 
 	/*
+	 * This extent is currently being discarded. Give the thread
+	 * performing the discard a chance to mark the extent unbusy
+	 * and retry.
+	 */
+	if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) {
+		spin_unlock(&pag->pagb_lock);
+		delay(1);
+		spin_lock(&pag->pagb_lock);
+		return false;
+	}
+
+	/*
 	 * If there is a busy extent overlapping a user allocation, we have
 	 * no choice but to force the log and retry the search.
 	 *
@@ -2813,7 +2827,8 @@ restart:
 		 * If this is a metadata allocation, try to reuse the busy
 		 * extent instead of trimming the allocation.
 		 */
-		if (!args->userdata) {
+		if (!args->userdata &&
+		    !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) {
 			if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
 					busyp, fbno, flen,
 					false))
@@ -2979,10 +2994,16 @@ xfs_alloc_busy_clear_one(
 	kmem_free(busyp);
 }
 
+/*
+ * Remove all extents on the passed in list from the busy extents tree.
+ * If do_discard is set skip extents that need to be discarded, and mark
+ * these as undergoing a discard operation instead.
+ */
 void
 xfs_alloc_busy_clear(
 	struct xfs_mount *mp,
-	struct list_head *list)
+	struct list_head *list,
+	bool do_discard)
 {
 	struct xfs_busy_extent *busyp, *n;
 	struct xfs_perag *pag = NULL;
@@ -2999,7 +3020,11 @@ xfs_alloc_busy_clear(
 			agno = busyp->agno;
 		}
 
-		xfs_alloc_busy_clear_one(mp, pag, busyp);
+		if (do_discard && busyp->length &&
+		    !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD))
+			busyp->flags = XFS_ALLOC_BUSY_DISCARDED;
+		else
+			xfs_alloc_busy_clear_one(mp, pag, busyp);
 	}
 
 	if (pag) {
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 240ad288f2f9..2f52b924be79 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -137,10 +137,11 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
 #ifdef __KERNEL__
 void
 xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
-	xfs_agblock_t bno, xfs_extlen_t len);
+	xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
 
 void
-xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list);
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list,
+	bool do_discard);
 
 int
 xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 8b469d53599f..2b3518826a69 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -120,7 +120,8 @@ xfs_allocbt_free_block(
 	if (error)
 		return error;
 
-	xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+	xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+			      XFS_ALLOC_BUSY_SKIP_DISCARD);
 	xfs_trans_agbtree_delta(cur->bc_tp, -1);
 	return 0;
 }
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index fa00788de2f5..e546a33214c9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -89,36 +89,19 @@ xfs_bmap_add_attrfork_local(
89 int *flags); /* inode logging flags */ 89 int *flags); /* inode logging flags */
90 90
91/* 91/*
92 * Called by xfs_bmapi to update file extent records and the btree
93 * after allocating space (or doing a delayed allocation).
94 */
95STATIC int /* error */
96xfs_bmap_add_extent(
97 xfs_inode_t *ip, /* incore inode pointer */
98 xfs_extnum_t idx, /* extent number to update/insert */
99 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
100 xfs_bmbt_irec_t *new, /* new data to add to file extents */
101 xfs_fsblock_t *first, /* pointer to firstblock variable */
102 xfs_bmap_free_t *flist, /* list of extents to be freed */
103 int *logflagsp, /* inode logging flags */
104 int whichfork, /* data or attr fork */
105 int rsvd); /* OK to allocate reserved blocks */
106
107/*
108 * Called by xfs_bmap_add_extent to handle cases converting a delayed 92 * Called by xfs_bmap_add_extent to handle cases converting a delayed
109 * allocation to a real allocation. 93 * allocation to a real allocation.
110 */ 94 */
111STATIC int /* error */ 95STATIC int /* error */
112xfs_bmap_add_extent_delay_real( 96xfs_bmap_add_extent_delay_real(
113 xfs_inode_t *ip, /* incore inode pointer */ 97 xfs_inode_t *ip, /* incore inode pointer */
114 xfs_extnum_t idx, /* extent number to update/insert */ 98 xfs_extnum_t *idx, /* extent number to update/insert */
115 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 99 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
116 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 100 xfs_bmbt_irec_t *new, /* new data to add to file extents */
117 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ 101 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
118 xfs_fsblock_t *first, /* pointer to firstblock variable */ 102 xfs_fsblock_t *first, /* pointer to firstblock variable */
119 xfs_bmap_free_t *flist, /* list of extents to be freed */ 103 xfs_bmap_free_t *flist, /* list of extents to be freed */
120 int *logflagsp, /* inode logging flags */ 104 int *logflagsp); /* inode logging flags */
121 int rsvd); /* OK to allocate reserved blocks */
122 105
123/* 106/*
124 * Called by xfs_bmap_add_extent to handle cases converting a hole 107 * Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -127,10 +110,9 @@ xfs_bmap_add_extent_delay_real(
127STATIC int /* error */ 110STATIC int /* error */
128xfs_bmap_add_extent_hole_delay( 111xfs_bmap_add_extent_hole_delay(
129 xfs_inode_t *ip, /* incore inode pointer */ 112 xfs_inode_t *ip, /* incore inode pointer */
130 xfs_extnum_t idx, /* extent number to update/insert */ 113 xfs_extnum_t *idx, /* extent number to update/insert */
131 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 114 xfs_bmbt_irec_t *new, /* new data to add to file extents */
132 int *logflagsp,/* inode logging flags */ 115 int *logflagsp); /* inode logging flags */
133 int rsvd); /* OK to allocate reserved blocks */
134 116
135/* 117/*
136 * Called by xfs_bmap_add_extent to handle cases converting a hole 118 * Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -139,7 +121,7 @@ xfs_bmap_add_extent_hole_delay(
139STATIC int /* error */ 121STATIC int /* error */
140xfs_bmap_add_extent_hole_real( 122xfs_bmap_add_extent_hole_real(
141 xfs_inode_t *ip, /* incore inode pointer */ 123 xfs_inode_t *ip, /* incore inode pointer */
142 xfs_extnum_t idx, /* extent number to update/insert */ 124 xfs_extnum_t *idx, /* extent number to update/insert */
143 xfs_btree_cur_t *cur, /* if null, not a btree */ 125 xfs_btree_cur_t *cur, /* if null, not a btree */
144 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 126 xfs_bmbt_irec_t *new, /* new data to add to file extents */
145 int *logflagsp, /* inode logging flags */ 127 int *logflagsp, /* inode logging flags */
@@ -152,7 +134,7 @@ xfs_bmap_add_extent_hole_real(
152STATIC int /* error */ 134STATIC int /* error */
153xfs_bmap_add_extent_unwritten_real( 135xfs_bmap_add_extent_unwritten_real(
154 xfs_inode_t *ip, /* incore inode pointer */ 136 xfs_inode_t *ip, /* incore inode pointer */
155 xfs_extnum_t idx, /* extent number to update/insert */ 137 xfs_extnum_t *idx, /* extent number to update/insert */
156 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 138 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
157 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 139 xfs_bmbt_irec_t *new, /* new data to add to file extents */
158 int *logflagsp); /* inode logging flags */ 140 int *logflagsp); /* inode logging flags */
@@ -180,22 +162,6 @@ xfs_bmap_btree_to_extents(
180 int whichfork); /* data or attr fork */ 162 int whichfork); /* data or attr fork */
181 163
182/* 164/*
183 * Called by xfs_bmapi to update file extent records and the btree
184 * after removing space (or undoing a delayed allocation).
185 */
186STATIC int /* error */
187xfs_bmap_del_extent(
188 xfs_inode_t *ip, /* incore inode pointer */
189 xfs_trans_t *tp, /* current trans pointer */
190 xfs_extnum_t idx, /* extent number to update/insert */
191 xfs_bmap_free_t *flist, /* list of extents to be freed */
192 xfs_btree_cur_t *cur, /* if null, not a btree */
193 xfs_bmbt_irec_t *new, /* new data to add to file extents */
194 int *logflagsp,/* inode logging flags */
195 int whichfork, /* data or attr fork */
196 int rsvd); /* OK to allocate reserved blocks */
197
198/*
199 * Remove the entry "free" from the free item list. Prev points to the 165 * Remove the entry "free" from the free item list. Prev points to the
200 * previous entry, unless "free" is the head of the list. 166 * previous entry, unless "free" is the head of the list.
201 */ 167 */
@@ -474,14 +440,13 @@ xfs_bmap_add_attrfork_local(
474STATIC int /* error */ 440STATIC int /* error */
475xfs_bmap_add_extent( 441xfs_bmap_add_extent(
476 xfs_inode_t *ip, /* incore inode pointer */ 442 xfs_inode_t *ip, /* incore inode pointer */
477 xfs_extnum_t idx, /* extent number to update/insert */ 443 xfs_extnum_t *idx, /* extent number to update/insert */
478 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 444 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
479 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 445 xfs_bmbt_irec_t *new, /* new data to add to file extents */
480 xfs_fsblock_t *first, /* pointer to firstblock variable */ 446 xfs_fsblock_t *first, /* pointer to firstblock variable */
481 xfs_bmap_free_t *flist, /* list of extents to be freed */ 447 xfs_bmap_free_t *flist, /* list of extents to be freed */
482 int *logflagsp, /* inode logging flags */ 448 int *logflagsp, /* inode logging flags */
483 int whichfork, /* data or attr fork */ 449 int whichfork) /* data or attr fork */
484 int rsvd) /* OK to use reserved data blocks */
485{ 450{
486 xfs_btree_cur_t *cur; /* btree cursor or null */ 451 xfs_btree_cur_t *cur; /* btree cursor or null */
487 xfs_filblks_t da_new; /* new count del alloc blocks used */ 452 xfs_filblks_t da_new; /* new count del alloc blocks used */
@@ -492,23 +457,27 @@ xfs_bmap_add_extent(
492 xfs_extnum_t nextents; /* number of extents in file now */ 457 xfs_extnum_t nextents; /* number of extents in file now */
493 458
494 XFS_STATS_INC(xs_add_exlist); 459 XFS_STATS_INC(xs_add_exlist);
460
495 cur = *curp; 461 cur = *curp;
496 ifp = XFS_IFORK_PTR(ip, whichfork); 462 ifp = XFS_IFORK_PTR(ip, whichfork);
497 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 463 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
498 ASSERT(idx <= nextents);
499 da_old = da_new = 0; 464 da_old = da_new = 0;
500 error = 0; 465 error = 0;
466
467 ASSERT(*idx >= 0);
468 ASSERT(*idx <= nextents);
469
501 /* 470 /*
502 * This is the first extent added to a new/empty file. 471 * This is the first extent added to a new/empty file.
503 * Special case this one, so other routines get to assume there are 472 * Special case this one, so other routines get to assume there are
504 * already extents in the list. 473 * already extents in the list.
505 */ 474 */
506 if (nextents == 0) { 475 if (nextents == 0) {
507 xfs_iext_insert(ip, 0, 1, new, 476 xfs_iext_insert(ip, *idx, 1, new,
508 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); 477 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
509 478
510 ASSERT(cur == NULL); 479 ASSERT(cur == NULL);
511 ifp->if_lastex = 0; 480
512 if (!isnullstartblock(new->br_startblock)) { 481 if (!isnullstartblock(new->br_startblock)) {
513 XFS_IFORK_NEXT_SET(ip, whichfork, 1); 482 XFS_IFORK_NEXT_SET(ip, whichfork, 1);
514 logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); 483 logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
@@ -522,27 +491,25 @@ xfs_bmap_add_extent(
522 if (cur) 491 if (cur)
523 ASSERT((cur->bc_private.b.flags & 492 ASSERT((cur->bc_private.b.flags &
524 XFS_BTCUR_BPRV_WASDEL) == 0); 493 XFS_BTCUR_BPRV_WASDEL) == 0);
525 if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new, 494 error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
526 &logflags, rsvd))) 495 &logflags);
527 goto done;
528 } 496 }
529 /* 497 /*
530 * Real allocation off the end of the file. 498 * Real allocation off the end of the file.
531 */ 499 */
532 else if (idx == nextents) { 500 else if (*idx == nextents) {
533 if (cur) 501 if (cur)
534 ASSERT((cur->bc_private.b.flags & 502 ASSERT((cur->bc_private.b.flags &
535 XFS_BTCUR_BPRV_WASDEL) == 0); 503 XFS_BTCUR_BPRV_WASDEL) == 0);
536 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, 504 error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
537 &logflags, whichfork))) 505 &logflags, whichfork);
538 goto done;
539 } else { 506 } else {
540 xfs_bmbt_irec_t prev; /* old extent at offset idx */ 507 xfs_bmbt_irec_t prev; /* old extent at offset idx */
541 508
542 /* 509 /*
543 * Get the record referred to by idx. 510 * Get the record referred to by idx.
544 */ 511 */
545 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &prev); 512 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev);
546 /* 513 /*
547 * If it's a real allocation record, and the new allocation ends 514 * If it's a real allocation record, and the new allocation ends
548 * after the start of the referred to record, then we're filling 515 * after the start of the referred to record, then we're filling
@@ -557,22 +524,18 @@ xfs_bmap_add_extent(
557 if (cur) 524 if (cur)
558 ASSERT(cur->bc_private.b.flags & 525 ASSERT(cur->bc_private.b.flags &
559 XFS_BTCUR_BPRV_WASDEL); 526 XFS_BTCUR_BPRV_WASDEL);
560 if ((error = xfs_bmap_add_extent_delay_real(ip, 527 error = xfs_bmap_add_extent_delay_real(ip,
561 idx, &cur, new, &da_new, first, flist, 528 idx, &cur, new, &da_new,
562 &logflags, rsvd))) 529 first, flist, &logflags);
563 goto done;
564 } else if (new->br_state == XFS_EXT_NORM) {
565 ASSERT(new->br_state == XFS_EXT_NORM);
566 if ((error = xfs_bmap_add_extent_unwritten_real(
567 ip, idx, &cur, new, &logflags)))
568 goto done;
569 } else { 530 } else {
570 ASSERT(new->br_state == XFS_EXT_UNWRITTEN); 531 ASSERT(new->br_state == XFS_EXT_NORM ||
571 if ((error = xfs_bmap_add_extent_unwritten_real( 532 new->br_state == XFS_EXT_UNWRITTEN);
572 ip, idx, &cur, new, &logflags))) 533
534 error = xfs_bmap_add_extent_unwritten_real(ip,
535 idx, &cur, new, &logflags);
536 if (error)
573 goto done; 537 goto done;
574 } 538 }
575 ASSERT(*curp == cur || *curp == NULL);
576 } 539 }
577 /* 540 /*
578 * Otherwise we're filling in a hole with an allocation. 541 * Otherwise we're filling in a hole with an allocation.
@@ -581,13 +544,15 @@ xfs_bmap_add_extent(
581 if (cur) 544 if (cur)
582 ASSERT((cur->bc_private.b.flags & 545 ASSERT((cur->bc_private.b.flags &
583 XFS_BTCUR_BPRV_WASDEL) == 0); 546 XFS_BTCUR_BPRV_WASDEL) == 0);
584 if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, 547 error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
585 new, &logflags, whichfork))) 548 new, &logflags, whichfork);
586 goto done;
587 } 549 }
588 } 550 }
589 551
552 if (error)
553 goto done;
590 ASSERT(*curp == cur || *curp == NULL); 554 ASSERT(*curp == cur || *curp == NULL);
555
591 /* 556 /*
592 * Convert to a btree if necessary. 557 * Convert to a btree if necessary.
593 */ 558 */
@@ -615,7 +580,7 @@ xfs_bmap_add_extent(
615 ASSERT(nblks <= da_old); 580 ASSERT(nblks <= da_old);
616 if (nblks < da_old) 581 if (nblks < da_old)
617 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, 582 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
618 (int64_t)(da_old - nblks), rsvd); 583 (int64_t)(da_old - nblks), 0);
619 } 584 }
620 /* 585 /*
621 * Clear out the allocated field, done with it now in any case. 586 * Clear out the allocated field, done with it now in any case.
@@ -640,14 +605,13 @@ done:
640STATIC int /* error */ 605STATIC int /* error */
641xfs_bmap_add_extent_delay_real( 606xfs_bmap_add_extent_delay_real(
642 xfs_inode_t *ip, /* incore inode pointer */ 607 xfs_inode_t *ip, /* incore inode pointer */
643 xfs_extnum_t idx, /* extent number to update/insert */ 608 xfs_extnum_t *idx, /* extent number to update/insert */
644 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 609 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
645 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 610 xfs_bmbt_irec_t *new, /* new data to add to file extents */
646 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ 611 xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
647 xfs_fsblock_t *first, /* pointer to firstblock variable */ 612 xfs_fsblock_t *first, /* pointer to firstblock variable */
648 xfs_bmap_free_t *flist, /* list of extents to be freed */ 613 xfs_bmap_free_t *flist, /* list of extents to be freed */
649 int *logflagsp, /* inode logging flags */ 614 int *logflagsp) /* inode logging flags */
650 int rsvd) /* OK to use reserved data block allocation */
651{ 615{
652 xfs_btree_cur_t *cur; /* btree cursor */ 616 xfs_btree_cur_t *cur; /* btree cursor */
653 int diff; /* temp value */ 617 int diff; /* temp value */
@@ -673,7 +637,7 @@ xfs_bmap_add_extent_delay_real(
673 */ 637 */
674 cur = *curp; 638 cur = *curp;
675 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 639 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
676 ep = xfs_iext_get_ext(ifp, idx); 640 ep = xfs_iext_get_ext(ifp, *idx);
677 xfs_bmbt_get_all(ep, &PREV); 641 xfs_bmbt_get_all(ep, &PREV);
678 new_endoff = new->br_startoff + new->br_blockcount; 642 new_endoff = new->br_startoff + new->br_blockcount;
679 ASSERT(PREV.br_startoff <= new->br_startoff); 643 ASSERT(PREV.br_startoff <= new->br_startoff);
@@ -692,9 +656,9 @@ xfs_bmap_add_extent_delay_real(
692 * Check and set flags if this segment has a left neighbor. 656 * Check and set flags if this segment has a left neighbor.
693 * Don't set contiguous if the combined extent would be too large. 657 * Don't set contiguous if the combined extent would be too large.
694 */ 658 */
695 if (idx > 0) { 659 if (*idx > 0) {
696 state |= BMAP_LEFT_VALID; 660 state |= BMAP_LEFT_VALID;
697 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); 661 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
698 662
699 if (isnullstartblock(LEFT.br_startblock)) 663 if (isnullstartblock(LEFT.br_startblock))
700 state |= BMAP_LEFT_DELAY; 664 state |= BMAP_LEFT_DELAY;
@@ -712,9 +676,9 @@ xfs_bmap_add_extent_delay_real(
712 * Don't set contiguous if the combined extent would be too large. 676 * Don't set contiguous if the combined extent would be too large.
713 * Also check for all-three-contiguous being too large. 677 * Also check for all-three-contiguous being too large.
714 */ 678 */
715 if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { 679 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
716 state |= BMAP_RIGHT_VALID; 680 state |= BMAP_RIGHT_VALID;
717 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); 681 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
718 682
719 if (isnullstartblock(RIGHT.br_startblock)) 683 if (isnullstartblock(RIGHT.br_startblock))
720 state |= BMAP_RIGHT_DELAY; 684 state |= BMAP_RIGHT_DELAY;
@@ -745,14 +709,14 @@ xfs_bmap_add_extent_delay_real(
745 * Filling in all of a previously delayed allocation extent. 709 * Filling in all of a previously delayed allocation extent.
746 * The left and right neighbors are both contiguous with new. 710 * The left and right neighbors are both contiguous with new.
747 */ 711 */
748 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 712 --*idx;
749 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 713 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
714 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
750 LEFT.br_blockcount + PREV.br_blockcount + 715 LEFT.br_blockcount + PREV.br_blockcount +
751 RIGHT.br_blockcount); 716 RIGHT.br_blockcount);
752 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 717 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
753 718
754 xfs_iext_remove(ip, idx, 2, state); 719 xfs_iext_remove(ip, *idx + 1, 2, state);
755 ip->i_df.if_lastex = idx - 1;
756 ip->i_d.di_nextents--; 720 ip->i_d.di_nextents--;
757 if (cur == NULL) 721 if (cur == NULL)
758 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 722 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -784,13 +748,14 @@ xfs_bmap_add_extent_delay_real(
784 * Filling in all of a previously delayed allocation extent. 748 * Filling in all of a previously delayed allocation extent.
785 * The left neighbor is contiguous, the right is not. 749 * The left neighbor is contiguous, the right is not.
786 */ 750 */
787 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 751 --*idx;
788 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 752
753 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
754 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
789 LEFT.br_blockcount + PREV.br_blockcount); 755 LEFT.br_blockcount + PREV.br_blockcount);
790 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 756 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
791 757
792 ip->i_df.if_lastex = idx - 1; 758 xfs_iext_remove(ip, *idx + 1, 1, state);
793 xfs_iext_remove(ip, idx, 1, state);
794 if (cur == NULL) 759 if (cur == NULL)
795 rval = XFS_ILOG_DEXT; 760 rval = XFS_ILOG_DEXT;
796 else { 761 else {
@@ -814,14 +779,13 @@ xfs_bmap_add_extent_delay_real(
814 * Filling in all of a previously delayed allocation extent. 779 * Filling in all of a previously delayed allocation extent.
815 * The right neighbor is contiguous, the left is not. 780 * The right neighbor is contiguous, the left is not.
816 */ 781 */
817 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 782 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
818 xfs_bmbt_set_startblock(ep, new->br_startblock); 783 xfs_bmbt_set_startblock(ep, new->br_startblock);
819 xfs_bmbt_set_blockcount(ep, 784 xfs_bmbt_set_blockcount(ep,
820 PREV.br_blockcount + RIGHT.br_blockcount); 785 PREV.br_blockcount + RIGHT.br_blockcount);
821 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 786 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
822 787
823 ip->i_df.if_lastex = idx; 788 xfs_iext_remove(ip, *idx + 1, 1, state);
824 xfs_iext_remove(ip, idx + 1, 1, state);
825 if (cur == NULL) 789 if (cur == NULL)
826 rval = XFS_ILOG_DEXT; 790 rval = XFS_ILOG_DEXT;
827 else { 791 else {
@@ -837,6 +801,7 @@ xfs_bmap_add_extent_delay_real(
837 RIGHT.br_blockcount, PREV.br_state))) 801 RIGHT.br_blockcount, PREV.br_state)))
838 goto done; 802 goto done;
839 } 803 }
804
840 *dnew = 0; 805 *dnew = 0;
841 break; 806 break;
842 807
@@ -846,11 +811,10 @@ xfs_bmap_add_extent_delay_real(
846 * Neither the left nor right neighbors are contiguous with 811 * Neither the left nor right neighbors are contiguous with
847 * the new one. 812 * the new one.
848 */ 813 */
849 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 814 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
850 xfs_bmbt_set_startblock(ep, new->br_startblock); 815 xfs_bmbt_set_startblock(ep, new->br_startblock);
851 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 816 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
852 817
853 ip->i_df.if_lastex = idx;
854 ip->i_d.di_nextents++; 818 ip->i_d.di_nextents++;
855 if (cur == NULL) 819 if (cur == NULL)
856 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 820 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -866,6 +830,7 @@ xfs_bmap_add_extent_delay_real(
866 goto done; 830 goto done;
867 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 831 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
868 } 832 }
833
869 *dnew = 0; 834 *dnew = 0;
870 break; 835 break;
871 836
@@ -874,17 +839,16 @@ xfs_bmap_add_extent_delay_real(
874 * Filling in the first part of a previous delayed allocation. 839 * Filling in the first part of a previous delayed allocation.
875 * The left neighbor is contiguous. 840 * The left neighbor is contiguous.
876 */ 841 */
877 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 842 trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
878 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 843 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
879 LEFT.br_blockcount + new->br_blockcount); 844 LEFT.br_blockcount + new->br_blockcount);
880 xfs_bmbt_set_startoff(ep, 845 xfs_bmbt_set_startoff(ep,
881 PREV.br_startoff + new->br_blockcount); 846 PREV.br_startoff + new->br_blockcount);
882 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 847 trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
883 848
884 temp = PREV.br_blockcount - new->br_blockcount; 849 temp = PREV.br_blockcount - new->br_blockcount;
885 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 850 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
886 xfs_bmbt_set_blockcount(ep, temp); 851 xfs_bmbt_set_blockcount(ep, temp);
887 ip->i_df.if_lastex = idx - 1;
888 if (cur == NULL) 852 if (cur == NULL)
889 rval = XFS_ILOG_DEXT; 853 rval = XFS_ILOG_DEXT;
890 else { 854 else {
@@ -904,7 +868,9 @@ xfs_bmap_add_extent_delay_real(
904 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 868 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
905 startblockval(PREV.br_startblock)); 869 startblockval(PREV.br_startblock));
906 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 870 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
907 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 871 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
872
873 --*idx;
908 *dnew = temp; 874 *dnew = temp;
909 break; 875 break;
910 876
@@ -913,12 +879,11 @@ xfs_bmap_add_extent_delay_real(
913 * Filling in the first part of a previous delayed allocation. 879 * Filling in the first part of a previous delayed allocation.
914 * The left neighbor is not contiguous. 880 * The left neighbor is not contiguous.
915 */ 881 */
916 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 882 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
917 xfs_bmbt_set_startoff(ep, new_endoff); 883 xfs_bmbt_set_startoff(ep, new_endoff);
918 temp = PREV.br_blockcount - new->br_blockcount; 884 temp = PREV.br_blockcount - new->br_blockcount;
919 xfs_bmbt_set_blockcount(ep, temp); 885 xfs_bmbt_set_blockcount(ep, temp);
920 xfs_iext_insert(ip, idx, 1, new, state); 886 xfs_iext_insert(ip, *idx, 1, new, state);
921 ip->i_df.if_lastex = idx;
922 ip->i_d.di_nextents++; 887 ip->i_d.di_nextents++;
923 if (cur == NULL) 888 if (cur == NULL)
924 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 889 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -946,9 +911,10 @@ xfs_bmap_add_extent_delay_real(
946 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 911 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
947 startblockval(PREV.br_startblock) - 912 startblockval(PREV.br_startblock) -
948 (cur ? cur->bc_private.b.allocated : 0)); 913 (cur ? cur->bc_private.b.allocated : 0));
949 ep = xfs_iext_get_ext(ifp, idx + 1); 914 ep = xfs_iext_get_ext(ifp, *idx + 1);
950 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 915 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
951 trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); 916 trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
917
952 *dnew = temp; 918 *dnew = temp;
953 break; 919 break;
954 920
@@ -958,15 +924,13 @@ xfs_bmap_add_extent_delay_real(
958 * The right neighbor is contiguous with the new allocation. 924 * The right neighbor is contiguous with the new allocation.
959 */ 925 */
960 temp = PREV.br_blockcount - new->br_blockcount; 926 temp = PREV.br_blockcount - new->br_blockcount;
961 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 927 trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_);
962 trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
963 xfs_bmbt_set_blockcount(ep, temp); 928 xfs_bmbt_set_blockcount(ep, temp);
964 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), 929 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1),
965 new->br_startoff, new->br_startblock, 930 new->br_startoff, new->br_startblock,
966 new->br_blockcount + RIGHT.br_blockcount, 931 new->br_blockcount + RIGHT.br_blockcount,
967 RIGHT.br_state); 932 RIGHT.br_state);
968 trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); 933 trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
969 ip->i_df.if_lastex = idx + 1;
970 if (cur == NULL) 934 if (cur == NULL)
971 rval = XFS_ILOG_DEXT; 935 rval = XFS_ILOG_DEXT;
972 else { 936 else {
@@ -983,10 +947,14 @@ xfs_bmap_add_extent_delay_real(
983 RIGHT.br_state))) 947 RIGHT.br_state)))
984 goto done; 948 goto done;
985 } 949 }
950
986 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 951 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
987 startblockval(PREV.br_startblock)); 952 startblockval(PREV.br_startblock));
953 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
988 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 954 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
989 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 955 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
956
957 ++*idx;
990 *dnew = temp; 958 *dnew = temp;
991 break; 959 break;
992 960
@@ -996,10 +964,9 @@ xfs_bmap_add_extent_delay_real(
996 * The right neighbor is not contiguous. 964 * The right neighbor is not contiguous.
997 */ 965 */
998 temp = PREV.br_blockcount - new->br_blockcount; 966 temp = PREV.br_blockcount - new->br_blockcount;
999 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 967 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1000 xfs_bmbt_set_blockcount(ep, temp); 968 xfs_bmbt_set_blockcount(ep, temp);
1001 xfs_iext_insert(ip, idx + 1, 1, new, state); 969 xfs_iext_insert(ip, *idx + 1, 1, new, state);
1002 ip->i_df.if_lastex = idx + 1;
1003 ip->i_d.di_nextents++; 970 ip->i_d.di_nextents++;
1004 if (cur == NULL) 971 if (cur == NULL)
1005 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 972 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1027,9 +994,11 @@ xfs_bmap_add_extent_delay_real(
1027 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 994 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
1028 startblockval(PREV.br_startblock) - 995 startblockval(PREV.br_startblock) -
1029 (cur ? cur->bc_private.b.allocated : 0)); 996 (cur ? cur->bc_private.b.allocated : 0));
1030 ep = xfs_iext_get_ext(ifp, idx); 997 ep = xfs_iext_get_ext(ifp, *idx);
1031 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 998 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1032 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 999 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1000
1001 ++*idx;
1033 *dnew = temp; 1002 *dnew = temp;
1034 break; 1003 break;
1035 1004
@@ -1056,7 +1025,7 @@ xfs_bmap_add_extent_delay_real(
1056 */ 1025 */
1057 temp = new->br_startoff - PREV.br_startoff; 1026 temp = new->br_startoff - PREV.br_startoff;
1058 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; 1027 temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
1059 trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_); 1028 trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_);
1060 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ 1029 xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
1061 LEFT = *new; 1030 LEFT = *new;
1062 RIGHT.br_state = PREV.br_state; 1031 RIGHT.br_state = PREV.br_state;
@@ -1065,8 +1034,7 @@ xfs_bmap_add_extent_delay_real(
1065 RIGHT.br_startoff = new_endoff; 1034 RIGHT.br_startoff = new_endoff;
1066 RIGHT.br_blockcount = temp2; 1035 RIGHT.br_blockcount = temp2;
1067 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ 1036 /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
1068 xfs_iext_insert(ip, idx + 1, 2, &LEFT, state); 1037 xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state);
1069 ip->i_df.if_lastex = idx + 1;
1070 ip->i_d.di_nextents++; 1038 ip->i_d.di_nextents++;
1071 if (cur == NULL) 1039 if (cur == NULL)
1072 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1040 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1097,7 +1065,7 @@ xfs_bmap_add_extent_delay_real(
1097 (cur ? cur->bc_private.b.allocated : 0)); 1065 (cur ? cur->bc_private.b.allocated : 0));
1098 if (diff > 0 && 1066 if (diff > 0 &&
1099 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, 1067 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1100 -((int64_t)diff), rsvd)) { 1068 -((int64_t)diff), 0)) {
1101 /* 1069 /*
1102 * Ick gross gag me with a spoon. 1070 * Ick gross gag me with a spoon.
1103 */ 1071 */
@@ -1109,7 +1077,7 @@ xfs_bmap_add_extent_delay_real(
1109 if (!diff || 1077 if (!diff ||
1110 !xfs_icsb_modify_counters(ip->i_mount, 1078 !xfs_icsb_modify_counters(ip->i_mount,
1111 XFS_SBS_FDBLOCKS, 1079 XFS_SBS_FDBLOCKS,
1112 -((int64_t)diff), rsvd)) 1080 -((int64_t)diff), 0))
1113 break; 1081 break;
1114 } 1082 }
1115 if (temp2) { 1083 if (temp2) {
@@ -1118,18 +1086,20 @@ xfs_bmap_add_extent_delay_real(
1118 if (!diff || 1086 if (!diff ||
1119 !xfs_icsb_modify_counters(ip->i_mount, 1087 !xfs_icsb_modify_counters(ip->i_mount,
1120 XFS_SBS_FDBLOCKS, 1088 XFS_SBS_FDBLOCKS,
1121 -((int64_t)diff), rsvd)) 1089 -((int64_t)diff), 0))
1122 break; 1090 break;
1123 } 1091 }
1124 } 1092 }
1125 } 1093 }
1126 ep = xfs_iext_get_ext(ifp, idx); 1094 ep = xfs_iext_get_ext(ifp, *idx);
1127 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 1095 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
1128 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1096 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1129 trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_); 1097 trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_);
1130 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), 1098 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2),
1131 nullstartblock((int)temp2)); 1099 nullstartblock((int)temp2));
1132 trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_); 1100 trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_);
1101
1102 ++*idx;
1133 *dnew = temp + temp2; 1103 *dnew = temp + temp2;
1134 break; 1104 break;
1135 1105
@@ -1161,7 +1131,7 @@ done:
1161STATIC int /* error */ 1131STATIC int /* error */
1162xfs_bmap_add_extent_unwritten_real( 1132xfs_bmap_add_extent_unwritten_real(
1163 xfs_inode_t *ip, /* incore inode pointer */ 1133 xfs_inode_t *ip, /* incore inode pointer */
1164 xfs_extnum_t idx, /* extent number to update/insert */ 1134 xfs_extnum_t *idx, /* extent number to update/insert */
1165 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ 1135 xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
1166 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1136 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1167 int *logflagsp) /* inode logging flags */ 1137 int *logflagsp) /* inode logging flags */
@@ -1188,7 +1158,7 @@ xfs_bmap_add_extent_unwritten_real(
1188 error = 0; 1158 error = 0;
1189 cur = *curp; 1159 cur = *curp;
1190 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1160 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1191 ep = xfs_iext_get_ext(ifp, idx); 1161 ep = xfs_iext_get_ext(ifp, *idx);
1192 xfs_bmbt_get_all(ep, &PREV); 1162 xfs_bmbt_get_all(ep, &PREV);
1193 newext = new->br_state; 1163 newext = new->br_state;
1194 oldext = (newext == XFS_EXT_UNWRITTEN) ? 1164 oldext = (newext == XFS_EXT_UNWRITTEN) ?
@@ -1211,9 +1181,9 @@ xfs_bmap_add_extent_unwritten_real(
1211 * Check and set flags if this segment has a left neighbor. 1181 * Check and set flags if this segment has a left neighbor.
1212 * Don't set contiguous if the combined extent would be too large. 1182 * Don't set contiguous if the combined extent would be too large.
1213 */ 1183 */
1214 if (idx > 0) { 1184 if (*idx > 0) {
1215 state |= BMAP_LEFT_VALID; 1185 state |= BMAP_LEFT_VALID;
1216 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); 1186 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
1217 1187
1218 if (isnullstartblock(LEFT.br_startblock)) 1188 if (isnullstartblock(LEFT.br_startblock))
1219 state |= BMAP_LEFT_DELAY; 1189 state |= BMAP_LEFT_DELAY;
@@ -1231,9 +1201,9 @@ xfs_bmap_add_extent_unwritten_real(
1231 * Don't set contiguous if the combined extent would be too large. 1201 * Don't set contiguous if the combined extent would be too large.
1232 * Also check for all-three-contiguous being too large. 1202 * Also check for all-three-contiguous being too large.
1233 */ 1203 */
1234 if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { 1204 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
1235 state |= BMAP_RIGHT_VALID; 1205 state |= BMAP_RIGHT_VALID;
1236 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); 1206 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
1237 if (isnullstartblock(RIGHT.br_startblock)) 1207 if (isnullstartblock(RIGHT.br_startblock))
1238 state |= BMAP_RIGHT_DELAY; 1208 state |= BMAP_RIGHT_DELAY;
1239 } 1209 }
@@ -1262,14 +1232,15 @@ xfs_bmap_add_extent_unwritten_real(
1262 * Setting all of a previous oldext extent to newext. 1232 * Setting all of a previous oldext extent to newext.
1263 * The left and right neighbors are both contiguous with new. 1233 * The left and right neighbors are both contiguous with new.
1264 */ 1234 */
1265 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1235 --*idx;
1266 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1236
1237 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1238 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1267 LEFT.br_blockcount + PREV.br_blockcount + 1239 LEFT.br_blockcount + PREV.br_blockcount +
1268 RIGHT.br_blockcount); 1240 RIGHT.br_blockcount);
1269 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1241 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1270 1242
1271 xfs_iext_remove(ip, idx, 2, state); 1243 xfs_iext_remove(ip, *idx + 1, 2, state);
1272 ip->i_df.if_lastex = idx - 1;
1273 ip->i_d.di_nextents -= 2; 1244 ip->i_d.di_nextents -= 2;
1274 if (cur == NULL) 1245 if (cur == NULL)
1275 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1246 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1305,13 +1276,14 @@ xfs_bmap_add_extent_unwritten_real(
1305 * Setting all of a previous oldext extent to newext. 1276 * Setting all of a previous oldext extent to newext.
1306 * The left neighbor is contiguous, the right is not. 1277 * The left neighbor is contiguous, the right is not.
1307 */ 1278 */
1308 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1279 --*idx;
1309 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1280
1281 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1282 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1310 LEFT.br_blockcount + PREV.br_blockcount); 1283 LEFT.br_blockcount + PREV.br_blockcount);
1311 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1284 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1312 1285
1313 ip->i_df.if_lastex = idx - 1; 1286 xfs_iext_remove(ip, *idx + 1, 1, state);
1314 xfs_iext_remove(ip, idx, 1, state);
1315 ip->i_d.di_nextents--; 1287 ip->i_d.di_nextents--;
1316 if (cur == NULL) 1288 if (cur == NULL)
1317 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1289 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1341,13 +1313,12 @@ xfs_bmap_add_extent_unwritten_real(
1341 * Setting all of a previous oldext extent to newext. 1313 * Setting all of a previous oldext extent to newext.
1342 * The right neighbor is contiguous, the left is not. 1314 * The right neighbor is contiguous, the left is not.
1343 */ 1315 */
1344 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1316 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1345 xfs_bmbt_set_blockcount(ep, 1317 xfs_bmbt_set_blockcount(ep,
1346 PREV.br_blockcount + RIGHT.br_blockcount); 1318 PREV.br_blockcount + RIGHT.br_blockcount);
1347 xfs_bmbt_set_state(ep, newext); 1319 xfs_bmbt_set_state(ep, newext);
1348 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1320 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1349 ip->i_df.if_lastex = idx; 1321 xfs_iext_remove(ip, *idx + 1, 1, state);
1350 xfs_iext_remove(ip, idx + 1, 1, state);
1351 ip->i_d.di_nextents--; 1322 ip->i_d.di_nextents--;
1352 if (cur == NULL) 1323 if (cur == NULL)
1353 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1324 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1378,11 +1349,10 @@ xfs_bmap_add_extent_unwritten_real(
1378 * Neither the left nor right neighbors are contiguous with 1349 * Neither the left nor right neighbors are contiguous with
1379 * the new one. 1350 * the new one.
1380 */ 1351 */
1381 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1352 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1382 xfs_bmbt_set_state(ep, newext); 1353 xfs_bmbt_set_state(ep, newext);
1383 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1354 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1384 1355
1385 ip->i_df.if_lastex = idx;
1386 if (cur == NULL) 1356 if (cur == NULL)
1387 rval = XFS_ILOG_DEXT; 1357 rval = XFS_ILOG_DEXT;
1388 else { 1358 else {
@@ -1404,21 +1374,22 @@ xfs_bmap_add_extent_unwritten_real(
1404 * Setting the first part of a previous oldext extent to newext. 1374 * Setting the first part of a previous oldext extent to newext.
1405 * The left neighbor is contiguous. 1375 * The left neighbor is contiguous.
1406 */ 1376 */
1407 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1377 trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
1408 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1378 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
1409 LEFT.br_blockcount + new->br_blockcount); 1379 LEFT.br_blockcount + new->br_blockcount);
1410 xfs_bmbt_set_startoff(ep, 1380 xfs_bmbt_set_startoff(ep,
1411 PREV.br_startoff + new->br_blockcount); 1381 PREV.br_startoff + new->br_blockcount);
1412 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1382 trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
1413 1383
1414 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1384 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1415 xfs_bmbt_set_startblock(ep, 1385 xfs_bmbt_set_startblock(ep,
1416 new->br_startblock + new->br_blockcount); 1386 new->br_startblock + new->br_blockcount);
1417 xfs_bmbt_set_blockcount(ep, 1387 xfs_bmbt_set_blockcount(ep,
1418 PREV.br_blockcount - new->br_blockcount); 1388 PREV.br_blockcount - new->br_blockcount);
1419 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1389 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1390
1391 --*idx;
1420 1392
1421 ip->i_df.if_lastex = idx - 1;
1422 if (cur == NULL) 1393 if (cur == NULL)
1423 rval = XFS_ILOG_DEXT; 1394 rval = XFS_ILOG_DEXT;
1424 else { 1395 else {
@@ -1449,17 +1420,16 @@ xfs_bmap_add_extent_unwritten_real(
1449 * Setting the first part of a previous oldext extent to newext. 1420 * Setting the first part of a previous oldext extent to newext.
1450 * The left neighbor is not contiguous. 1421 * The left neighbor is not contiguous.
1451 */ 1422 */
1452 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1423 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1453 ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); 1424 ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
1454 xfs_bmbt_set_startoff(ep, new_endoff); 1425 xfs_bmbt_set_startoff(ep, new_endoff);
1455 xfs_bmbt_set_blockcount(ep, 1426 xfs_bmbt_set_blockcount(ep,
1456 PREV.br_blockcount - new->br_blockcount); 1427 PREV.br_blockcount - new->br_blockcount);
1457 xfs_bmbt_set_startblock(ep, 1428 xfs_bmbt_set_startblock(ep,
1458 new->br_startblock + new->br_blockcount); 1429 new->br_startblock + new->br_blockcount);
1459 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1430 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1460 1431
1461 xfs_iext_insert(ip, idx, 1, new, state); 1432 xfs_iext_insert(ip, *idx, 1, new, state);
1462 ip->i_df.if_lastex = idx;
1463 ip->i_d.di_nextents++; 1433 ip->i_d.di_nextents++;
1464 if (cur == NULL) 1434 if (cur == NULL)
1465 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1435 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1488,17 +1458,19 @@ xfs_bmap_add_extent_unwritten_real(
1488 * Setting the last part of a previous oldext extent to newext. 1458 * Setting the last part of a previous oldext extent to newext.
1489 * The right neighbor is contiguous with the new allocation. 1459 * The right neighbor is contiguous with the new allocation.
1490 */ 1460 */
1491 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1461 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1492 trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
1493 xfs_bmbt_set_blockcount(ep, 1462 xfs_bmbt_set_blockcount(ep,
1494 PREV.br_blockcount - new->br_blockcount); 1463 PREV.br_blockcount - new->br_blockcount);
1495 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1464 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1496 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), 1465
1466 ++*idx;
1467
1468 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1469 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
1497 new->br_startoff, new->br_startblock, 1470 new->br_startoff, new->br_startblock,
1498 new->br_blockcount + RIGHT.br_blockcount, newext); 1471 new->br_blockcount + RIGHT.br_blockcount, newext);
1499 trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_); 1472 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1500 1473
1501 ip->i_df.if_lastex = idx + 1;
1502 if (cur == NULL) 1474 if (cur == NULL)
1503 rval = XFS_ILOG_DEXT; 1475 rval = XFS_ILOG_DEXT;
1504 else { 1476 else {
@@ -1528,13 +1500,14 @@ xfs_bmap_add_extent_unwritten_real(
1528 * Setting the last part of a previous oldext extent to newext. 1500 * Setting the last part of a previous oldext extent to newext.
1529 * The right neighbor is not contiguous. 1501 * The right neighbor is not contiguous.
1530 */ 1502 */
1531 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1503 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1532 xfs_bmbt_set_blockcount(ep, 1504 xfs_bmbt_set_blockcount(ep,
1533 PREV.br_blockcount - new->br_blockcount); 1505 PREV.br_blockcount - new->br_blockcount);
1534 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1506 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1507
1508 ++*idx;
1509 xfs_iext_insert(ip, *idx, 1, new, state);
1535 1510
1536 xfs_iext_insert(ip, idx + 1, 1, new, state);
1537 ip->i_df.if_lastex = idx + 1;
1538 ip->i_d.di_nextents++; 1511 ip->i_d.di_nextents++;
1539 if (cur == NULL) 1512 if (cur == NULL)
1540 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1513 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1568,10 +1541,10 @@ xfs_bmap_add_extent_unwritten_real(
1568 * newext. Contiguity is impossible here. 1541 * newext. Contiguity is impossible here.
1569 * One extent becomes three extents. 1542 * One extent becomes three extents.
1570 */ 1543 */
1571 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1544 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1572 xfs_bmbt_set_blockcount(ep, 1545 xfs_bmbt_set_blockcount(ep,
1573 new->br_startoff - PREV.br_startoff); 1546 new->br_startoff - PREV.br_startoff);
1574 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1547 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1575 1548
1576 r[0] = *new; 1549 r[0] = *new;
1577 r[1].br_startoff = new_endoff; 1550 r[1].br_startoff = new_endoff;
@@ -1579,8 +1552,10 @@ xfs_bmap_add_extent_unwritten_real(
1579 PREV.br_startoff + PREV.br_blockcount - new_endoff; 1552 PREV.br_startoff + PREV.br_blockcount - new_endoff;
1580 r[1].br_startblock = new->br_startblock + new->br_blockcount; 1553 r[1].br_startblock = new->br_startblock + new->br_blockcount;
1581 r[1].br_state = oldext; 1554 r[1].br_state = oldext;
1582 xfs_iext_insert(ip, idx + 1, 2, &r[0], state); 1555
1583 ip->i_df.if_lastex = idx + 1; 1556 ++*idx;
1557 xfs_iext_insert(ip, *idx, 2, &r[0], state);
1558
1584 ip->i_d.di_nextents += 2; 1559 ip->i_d.di_nextents += 2;
1585 if (cur == NULL) 1560 if (cur == NULL)
1586 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; 1561 rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1650,12 +1625,10 @@ done:
1650STATIC int /* error */ 1625STATIC int /* error */
1651xfs_bmap_add_extent_hole_delay( 1626xfs_bmap_add_extent_hole_delay(
1652 xfs_inode_t *ip, /* incore inode pointer */ 1627 xfs_inode_t *ip, /* incore inode pointer */
1653 xfs_extnum_t idx, /* extent number to update/insert */ 1628 xfs_extnum_t *idx, /* extent number to update/insert */
1654 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1629 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1655 int *logflagsp, /* inode logging flags */ 1630 int *logflagsp) /* inode logging flags */
1656 int rsvd) /* OK to allocate reserved blocks */
1657{ 1631{
1658 xfs_bmbt_rec_host_t *ep; /* extent record for idx */
1659 xfs_ifork_t *ifp; /* inode fork pointer */ 1632 xfs_ifork_t *ifp; /* inode fork pointer */
1660 xfs_bmbt_irec_t left; /* left neighbor extent entry */ 1633 xfs_bmbt_irec_t left; /* left neighbor extent entry */
1661 xfs_filblks_t newlen=0; /* new indirect size */ 1634 xfs_filblks_t newlen=0; /* new indirect size */
@@ -1665,16 +1638,15 @@ xfs_bmap_add_extent_hole_delay(
1665 xfs_filblks_t temp=0; /* temp for indirect calculations */ 1638 xfs_filblks_t temp=0; /* temp for indirect calculations */
1666 1639
1667 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); 1640 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
1668 ep = xfs_iext_get_ext(ifp, idx);
1669 state = 0; 1641 state = 0;
1670 ASSERT(isnullstartblock(new->br_startblock)); 1642 ASSERT(isnullstartblock(new->br_startblock));
1671 1643
1672 /* 1644 /*
1673 * Check and set flags if this segment has a left neighbor 1645 * Check and set flags if this segment has a left neighbor
1674 */ 1646 */
1675 if (idx > 0) { 1647 if (*idx > 0) {
1676 state |= BMAP_LEFT_VALID; 1648 state |= BMAP_LEFT_VALID;
1677 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); 1649 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
1678 1650
1679 if (isnullstartblock(left.br_startblock)) 1651 if (isnullstartblock(left.br_startblock))
1680 state |= BMAP_LEFT_DELAY; 1652 state |= BMAP_LEFT_DELAY;
@@ -1684,9 +1656,9 @@ xfs_bmap_add_extent_hole_delay(
1684 * Check and set flags if the current (right) segment exists. 1656 * Check and set flags if the current (right) segment exists.
1685 * If it doesn't exist, we're converting the hole at end-of-file. 1657 * If it doesn't exist, we're converting the hole at end-of-file.
1686 */ 1658 */
1687 if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { 1659 if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
1688 state |= BMAP_RIGHT_VALID; 1660 state |= BMAP_RIGHT_VALID;
1689 xfs_bmbt_get_all(ep, &right); 1661 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
1690 1662
1691 if (isnullstartblock(right.br_startblock)) 1663 if (isnullstartblock(right.br_startblock))
1692 state |= BMAP_RIGHT_DELAY; 1664 state |= BMAP_RIGHT_DELAY;
@@ -1719,21 +1691,21 @@ xfs_bmap_add_extent_hole_delay(
1719 * on the left and on the right. 1691 * on the left and on the right.
1720 * Merge all three into a single extent record. 1692 * Merge all three into a single extent record.
1721 */ 1693 */
1694 --*idx;
1722 temp = left.br_blockcount + new->br_blockcount + 1695 temp = left.br_blockcount + new->br_blockcount +
1723 right.br_blockcount; 1696 right.br_blockcount;
1724 1697
1725 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1698 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1726 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); 1699 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
1727 oldlen = startblockval(left.br_startblock) + 1700 oldlen = startblockval(left.br_startblock) +
1728 startblockval(new->br_startblock) + 1701 startblockval(new->br_startblock) +
1729 startblockval(right.br_startblock); 1702 startblockval(right.br_startblock);
1730 newlen = xfs_bmap_worst_indlen(ip, temp); 1703 newlen = xfs_bmap_worst_indlen(ip, temp);
1731 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), 1704 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
1732 nullstartblock((int)newlen)); 1705 nullstartblock((int)newlen));
1733 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1706 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1734 1707
1735 xfs_iext_remove(ip, idx, 1, state); 1708 xfs_iext_remove(ip, *idx + 1, 1, state);
1736 ip->i_df.if_lastex = idx - 1;
1737 break; 1709 break;
1738 1710
1739 case BMAP_LEFT_CONTIG: 1711 case BMAP_LEFT_CONTIG:
@@ -1742,17 +1714,17 @@ xfs_bmap_add_extent_hole_delay(
1742 * on the left. 1714 * on the left.
1743 * Merge the new allocation with the left neighbor. 1715 * Merge the new allocation with the left neighbor.
1744 */ 1716 */
1717 --*idx;
1745 temp = left.br_blockcount + new->br_blockcount; 1718 temp = left.br_blockcount + new->br_blockcount;
1746 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1719
1747 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); 1720 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1721 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
1748 oldlen = startblockval(left.br_startblock) + 1722 oldlen = startblockval(left.br_startblock) +
1749 startblockval(new->br_startblock); 1723 startblockval(new->br_startblock);
1750 newlen = xfs_bmap_worst_indlen(ip, temp); 1724 newlen = xfs_bmap_worst_indlen(ip, temp);
1751 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), 1725 xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
1752 nullstartblock((int)newlen)); 1726 nullstartblock((int)newlen));
1753 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1727 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1754
1755 ip->i_df.if_lastex = idx - 1;
1756 break; 1728 break;
1757 1729
1758 case BMAP_RIGHT_CONTIG: 1730 case BMAP_RIGHT_CONTIG:
@@ -1761,16 +1733,15 @@ xfs_bmap_add_extent_hole_delay(
1761 * on the right. 1733 * on the right.
1762 * Merge the new allocation with the right neighbor. 1734 * Merge the new allocation with the right neighbor.
1763 */ 1735 */
1764 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1736 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1765 temp = new->br_blockcount + right.br_blockcount; 1737 temp = new->br_blockcount + right.br_blockcount;
1766 oldlen = startblockval(new->br_startblock) + 1738 oldlen = startblockval(new->br_startblock) +
1767 startblockval(right.br_startblock); 1739 startblockval(right.br_startblock);
1768 newlen = xfs_bmap_worst_indlen(ip, temp); 1740 newlen = xfs_bmap_worst_indlen(ip, temp);
1769 xfs_bmbt_set_allf(ep, new->br_startoff, 1741 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
1742 new->br_startoff,
1770 nullstartblock((int)newlen), temp, right.br_state); 1743 nullstartblock((int)newlen), temp, right.br_state);
1771 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1744 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1772
1773 ip->i_df.if_lastex = idx;
1774 break; 1745 break;
1775 1746
1776 case 0: 1747 case 0:
@@ -1780,14 +1751,13 @@ xfs_bmap_add_extent_hole_delay(
1780 * Insert a new entry. 1751 * Insert a new entry.
1781 */ 1752 */
1782 oldlen = newlen = 0; 1753 oldlen = newlen = 0;
1783 xfs_iext_insert(ip, idx, 1, new, state); 1754 xfs_iext_insert(ip, *idx, 1, new, state);
1784 ip->i_df.if_lastex = idx;
1785 break; 1755 break;
1786 } 1756 }
1787 if (oldlen != newlen) { 1757 if (oldlen != newlen) {
1788 ASSERT(oldlen > newlen); 1758 ASSERT(oldlen > newlen);
1789 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, 1759 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
1790 (int64_t)(oldlen - newlen), rsvd); 1760 (int64_t)(oldlen - newlen), 0);
1791 /* 1761 /*
1792 * Nothing to do for disk quota accounting here. 1762 * Nothing to do for disk quota accounting here.
1793 */ 1763 */
@@ -1803,13 +1773,12 @@ xfs_bmap_add_extent_hole_delay(
1803STATIC int /* error */ 1773STATIC int /* error */
1804xfs_bmap_add_extent_hole_real( 1774xfs_bmap_add_extent_hole_real(
1805 xfs_inode_t *ip, /* incore inode pointer */ 1775 xfs_inode_t *ip, /* incore inode pointer */
1806 xfs_extnum_t idx, /* extent number to update/insert */ 1776 xfs_extnum_t *idx, /* extent number to update/insert */
1807 xfs_btree_cur_t *cur, /* if null, not a btree */ 1777 xfs_btree_cur_t *cur, /* if null, not a btree */
1808 xfs_bmbt_irec_t *new, /* new data to add to file extents */ 1778 xfs_bmbt_irec_t *new, /* new data to add to file extents */
1809 int *logflagsp, /* inode logging flags */ 1779 int *logflagsp, /* inode logging flags */
1810 int whichfork) /* data or attr fork */ 1780 int whichfork) /* data or attr fork */
1811{ 1781{
1812 xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. point */
1813 int error; /* error return value */ 1782 int error; /* error return value */
1814 int i; /* temp state */ 1783 int i; /* temp state */
1815 xfs_ifork_t *ifp; /* inode fork pointer */ 1784 xfs_ifork_t *ifp; /* inode fork pointer */
@@ -1819,8 +1788,7 @@ xfs_bmap_add_extent_hole_real(
1819 int state; /* state bits, accessed thru macros */ 1788 int state; /* state bits, accessed thru macros */
1820 1789
1821 ifp = XFS_IFORK_PTR(ip, whichfork); 1790 ifp = XFS_IFORK_PTR(ip, whichfork);
1822 ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 1791 ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
1823 ep = xfs_iext_get_ext(ifp, idx);
1824 state = 0; 1792 state = 0;
1825 1793
1826 if (whichfork == XFS_ATTR_FORK) 1794 if (whichfork == XFS_ATTR_FORK)
@@ -1829,9 +1797,9 @@ xfs_bmap_add_extent_hole_real(
1829 /* 1797 /*
1830 * Check and set flags if this segment has a left neighbor. 1798 * Check and set flags if this segment has a left neighbor.
1831 */ 1799 */
1832 if (idx > 0) { 1800 if (*idx > 0) {
1833 state |= BMAP_LEFT_VALID; 1801 state |= BMAP_LEFT_VALID;
1834 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); 1802 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
1835 if (isnullstartblock(left.br_startblock)) 1803 if (isnullstartblock(left.br_startblock))
1836 state |= BMAP_LEFT_DELAY; 1804 state |= BMAP_LEFT_DELAY;
1837 } 1805 }
@@ -1840,9 +1808,9 @@ xfs_bmap_add_extent_hole_real(
1840 * Check and set flags if this segment has a current value. 1808 * Check and set flags if this segment has a current value.
1841 * Not true if we're inserting into the "hole" at eof. 1809 * Not true if we're inserting into the "hole" at eof.
1842 */ 1810 */
1843 if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { 1811 if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
1844 state |= BMAP_RIGHT_VALID; 1812 state |= BMAP_RIGHT_VALID;
1845 xfs_bmbt_get_all(ep, &right); 1813 xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
1846 if (isnullstartblock(right.br_startblock)) 1814 if (isnullstartblock(right.br_startblock))
1847 state |= BMAP_RIGHT_DELAY; 1815 state |= BMAP_RIGHT_DELAY;
1848 } 1816 }
@@ -1879,14 +1847,15 @@ xfs_bmap_add_extent_hole_real(
1879 * left and on the right. 1847 * left and on the right.
1880 * Merge all three into a single extent record. 1848 * Merge all three into a single extent record.
1881 */ 1849 */
1882 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1850 --*idx;
1883 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1851 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1852 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1884 left.br_blockcount + new->br_blockcount + 1853 left.br_blockcount + new->br_blockcount +
1885 right.br_blockcount); 1854 right.br_blockcount);
1886 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1855 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1856
1857 xfs_iext_remove(ip, *idx + 1, 1, state);
1887 1858
1888 xfs_iext_remove(ip, idx, 1, state);
1889 ifp->if_lastex = idx - 1;
1890 XFS_IFORK_NEXT_SET(ip, whichfork, 1859 XFS_IFORK_NEXT_SET(ip, whichfork,
1891 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 1860 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
1892 if (cur == NULL) { 1861 if (cur == NULL) {
@@ -1921,12 +1890,12 @@ xfs_bmap_add_extent_hole_real(
1921 * on the left. 1890 * on the left.
1922 * Merge the new allocation with the left neighbor. 1891 * Merge the new allocation with the left neighbor.
1923 */ 1892 */
1924 trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_); 1893 --*idx;
1925 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), 1894 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1895 xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
1926 left.br_blockcount + new->br_blockcount); 1896 left.br_blockcount + new->br_blockcount);
1927 trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_); 1897 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1928 1898
1929 ifp->if_lastex = idx - 1;
1930 if (cur == NULL) { 1899 if (cur == NULL) {
1931 rval = xfs_ilog_fext(whichfork); 1900 rval = xfs_ilog_fext(whichfork);
1932 } else { 1901 } else {
@@ -1952,13 +1921,13 @@ xfs_bmap_add_extent_hole_real(
1952 * on the right. 1921 * on the right.
1953 * Merge the new allocation with the right neighbor. 1922 * Merge the new allocation with the right neighbor.
1954 */ 1923 */
1955 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 1924 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
1956 xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, 1925 xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
1926 new->br_startoff, new->br_startblock,
1957 new->br_blockcount + right.br_blockcount, 1927 new->br_blockcount + right.br_blockcount,
1958 right.br_state); 1928 right.br_state);
1959 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 1929 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
1960 1930
1961 ifp->if_lastex = idx;
1962 if (cur == NULL) { 1931 if (cur == NULL) {
1963 rval = xfs_ilog_fext(whichfork); 1932 rval = xfs_ilog_fext(whichfork);
1964 } else { 1933 } else {
@@ -1984,8 +1953,7 @@ xfs_bmap_add_extent_hole_real(
1984 * real allocation. 1953 * real allocation.
1985 * Insert a new entry. 1954 * Insert a new entry.
1986 */ 1955 */
1987 xfs_iext_insert(ip, idx, 1, new, state); 1956 xfs_iext_insert(ip, *idx, 1, new, state);
1988 ifp->if_lastex = idx;
1989 XFS_IFORK_NEXT_SET(ip, whichfork, 1957 XFS_IFORK_NEXT_SET(ip, whichfork,
1990 XFS_IFORK_NEXTENTS(ip, whichfork) + 1); 1958 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
1991 if (cur == NULL) { 1959 if (cur == NULL) {
@@ -2833,13 +2801,12 @@ STATIC int /* error */
2833xfs_bmap_del_extent( 2801xfs_bmap_del_extent(
2834 xfs_inode_t *ip, /* incore inode pointer */ 2802 xfs_inode_t *ip, /* incore inode pointer */
2835 xfs_trans_t *tp, /* current transaction pointer */ 2803 xfs_trans_t *tp, /* current transaction pointer */
2836 xfs_extnum_t idx, /* extent number to update/delete */ 2804 xfs_extnum_t *idx, /* extent number to update/delete */
2837 xfs_bmap_free_t *flist, /* list of extents to be freed */ 2805 xfs_bmap_free_t *flist, /* list of extents to be freed */
2838 xfs_btree_cur_t *cur, /* if null, not a btree */ 2806 xfs_btree_cur_t *cur, /* if null, not a btree */
2839 xfs_bmbt_irec_t *del, /* data to remove from extents */ 2807 xfs_bmbt_irec_t *del, /* data to remove from extents */
2840 int *logflagsp, /* inode logging flags */ 2808 int *logflagsp, /* inode logging flags */
2841 int whichfork, /* data or attr fork */ 2809 int whichfork) /* data or attr fork */
2842 int rsvd) /* OK to allocate reserved blocks */
2843{ 2810{
2844 xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ 2811 xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
2845 xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ 2812 xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
@@ -2870,10 +2837,10 @@ xfs_bmap_del_extent(
2870 2837
2871 mp = ip->i_mount; 2838 mp = ip->i_mount;
2872 ifp = XFS_IFORK_PTR(ip, whichfork); 2839 ifp = XFS_IFORK_PTR(ip, whichfork);
2873 ASSERT((idx >= 0) && (idx < ifp->if_bytes / 2840 ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
2874 (uint)sizeof(xfs_bmbt_rec_t))); 2841 (uint)sizeof(xfs_bmbt_rec_t)));
2875 ASSERT(del->br_blockcount > 0); 2842 ASSERT(del->br_blockcount > 0);
2876 ep = xfs_iext_get_ext(ifp, idx); 2843 ep = xfs_iext_get_ext(ifp, *idx);
2877 xfs_bmbt_get_all(ep, &got); 2844 xfs_bmbt_get_all(ep, &got);
2878 ASSERT(got.br_startoff <= del->br_startoff); 2845 ASSERT(got.br_startoff <= del->br_startoff);
2879 del_endoff = del->br_startoff + del->br_blockcount; 2846 del_endoff = del->br_startoff + del->br_blockcount;
@@ -2947,11 +2914,12 @@ xfs_bmap_del_extent(
2947 /* 2914 /*
2948 * Matches the whole extent. Delete the entry. 2915 * Matches the whole extent. Delete the entry.
2949 */ 2916 */
2950 xfs_iext_remove(ip, idx, 1, 2917 xfs_iext_remove(ip, *idx, 1,
2951 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); 2918 whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
2952 ifp->if_lastex = idx; 2919 --*idx;
2953 if (delay) 2920 if (delay)
2954 break; 2921 break;
2922
2955 XFS_IFORK_NEXT_SET(ip, whichfork, 2923 XFS_IFORK_NEXT_SET(ip, whichfork,
2956 XFS_IFORK_NEXTENTS(ip, whichfork) - 1); 2924 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
2957 flags |= XFS_ILOG_CORE; 2925 flags |= XFS_ILOG_CORE;
@@ -2968,21 +2936,20 @@ xfs_bmap_del_extent(
2968 /* 2936 /*
2969 * Deleting the first part of the extent. 2937 * Deleting the first part of the extent.
2970 */ 2938 */
2971 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 2939 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
2972 xfs_bmbt_set_startoff(ep, del_endoff); 2940 xfs_bmbt_set_startoff(ep, del_endoff);
2973 temp = got.br_blockcount - del->br_blockcount; 2941 temp = got.br_blockcount - del->br_blockcount;
2974 xfs_bmbt_set_blockcount(ep, temp); 2942 xfs_bmbt_set_blockcount(ep, temp);
2975 ifp->if_lastex = idx;
2976 if (delay) { 2943 if (delay) {
2977 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 2944 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
2978 da_old); 2945 da_old);
2979 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 2946 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
2980 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2947 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2981 da_new = temp; 2948 da_new = temp;
2982 break; 2949 break;
2983 } 2950 }
2984 xfs_bmbt_set_startblock(ep, del_endblock); 2951 xfs_bmbt_set_startblock(ep, del_endblock);
2985 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2952 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
2986 if (!cur) { 2953 if (!cur) {
2987 flags |= xfs_ilog_fext(whichfork); 2954 flags |= xfs_ilog_fext(whichfork);
2988 break; 2955 break;
@@ -2998,18 +2965,17 @@ xfs_bmap_del_extent(
2998 * Deleting the last part of the extent. 2965 * Deleting the last part of the extent.
2999 */ 2966 */
3000 temp = got.br_blockcount - del->br_blockcount; 2967 temp = got.br_blockcount - del->br_blockcount;
3001 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 2968 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
3002 xfs_bmbt_set_blockcount(ep, temp); 2969 xfs_bmbt_set_blockcount(ep, temp);
3003 ifp->if_lastex = idx;
3004 if (delay) { 2970 if (delay) {
3005 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), 2971 temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
3006 da_old); 2972 da_old);
3007 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); 2973 xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
3008 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2974 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
3009 da_new = temp; 2975 da_new = temp;
3010 break; 2976 break;
3011 } 2977 }
3012 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 2978 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
3013 if (!cur) { 2979 if (!cur) {
3014 flags |= xfs_ilog_fext(whichfork); 2980 flags |= xfs_ilog_fext(whichfork);
3015 break; 2981 break;
@@ -3026,7 +2992,7 @@ xfs_bmap_del_extent(
3026 * Deleting the middle of the extent. 2992 * Deleting the middle of the extent.
3027 */ 2993 */
3028 temp = del->br_startoff - got.br_startoff; 2994 temp = del->br_startoff - got.br_startoff;
3029 trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_); 2995 trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
3030 xfs_bmbt_set_blockcount(ep, temp); 2996 xfs_bmbt_set_blockcount(ep, temp);
3031 new.br_startoff = del_endoff; 2997 new.br_startoff = del_endoff;
3032 temp2 = got_endoff - del_endoff; 2998 temp2 = got_endoff - del_endoff;
@@ -3113,9 +3079,9 @@ xfs_bmap_del_extent(
3113 } 3079 }
3114 } 3080 }
3115 } 3081 }
3116 trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_); 3082 trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
3117 xfs_iext_insert(ip, idx + 1, 1, &new, state); 3083 xfs_iext_insert(ip, *idx + 1, 1, &new, state);
3118 ifp->if_lastex = idx + 1; 3084 ++*idx;
3119 break; 3085 break;
3120 } 3086 }
3121 /* 3087 /*
@@ -3142,7 +3108,7 @@ xfs_bmap_del_extent(
3142 ASSERT(da_old >= da_new); 3108 ASSERT(da_old >= da_new);
3143 if (da_old > da_new) { 3109 if (da_old > da_new) {
3144 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 3110 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
3145 (int64_t)(da_old - da_new), rsvd); 3111 (int64_t)(da_old - da_new), 0);
3146 } 3112 }
3147done: 3113done:
3148 *logflagsp = flags; 3114 *logflagsp = flags;
@@ -4562,29 +4528,24 @@ xfs_bmapi(
4562 if (rt) { 4528 if (rt) {
4563 error = xfs_mod_incore_sb(mp, 4529 error = xfs_mod_incore_sb(mp,
4564 XFS_SBS_FREXTENTS, 4530 XFS_SBS_FREXTENTS,
4565 -((int64_t)extsz), (flags & 4531 -((int64_t)extsz), 0);
4566 XFS_BMAPI_RSVBLOCKS));
4567 } else { 4532 } else {
4568 error = xfs_icsb_modify_counters(mp, 4533 error = xfs_icsb_modify_counters(mp,
4569 XFS_SBS_FDBLOCKS, 4534 XFS_SBS_FDBLOCKS,
4570 -((int64_t)alen), (flags & 4535 -((int64_t)alen), 0);
4571 XFS_BMAPI_RSVBLOCKS));
4572 } 4536 }
4573 if (!error) { 4537 if (!error) {
4574 error = xfs_icsb_modify_counters(mp, 4538 error = xfs_icsb_modify_counters(mp,
4575 XFS_SBS_FDBLOCKS, 4539 XFS_SBS_FDBLOCKS,
4576 -((int64_t)indlen), (flags & 4540 -((int64_t)indlen), 0);
4577 XFS_BMAPI_RSVBLOCKS));
4578 if (error && rt) 4541 if (error && rt)
4579 xfs_mod_incore_sb(mp, 4542 xfs_mod_incore_sb(mp,
4580 XFS_SBS_FREXTENTS, 4543 XFS_SBS_FREXTENTS,
4581 (int64_t)extsz, (flags & 4544 (int64_t)extsz, 0);
4582 XFS_BMAPI_RSVBLOCKS));
4583 else if (error) 4545 else if (error)
4584 xfs_icsb_modify_counters(mp, 4546 xfs_icsb_modify_counters(mp,
4585 XFS_SBS_FDBLOCKS, 4547 XFS_SBS_FDBLOCKS,
4586 (int64_t)alen, (flags & 4548 (int64_t)alen, 0);
4587 XFS_BMAPI_RSVBLOCKS));
4588 } 4549 }
4589 4550
4590 if (error) { 4551 if (error) {
@@ -4701,13 +4662,12 @@ xfs_bmapi(
4701 if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) 4662 if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
4702 got.br_state = XFS_EXT_UNWRITTEN; 4663 got.br_state = XFS_EXT_UNWRITTEN;
4703 } 4664 }
4704 error = xfs_bmap_add_extent(ip, lastx, &cur, &got, 4665 error = xfs_bmap_add_extent(ip, &lastx, &cur, &got,
4705 firstblock, flist, &tmp_logflags, 4666 firstblock, flist, &tmp_logflags,
4706 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4667 whichfork);
4707 logflags |= tmp_logflags; 4668 logflags |= tmp_logflags;
4708 if (error) 4669 if (error)
4709 goto error0; 4670 goto error0;
4710 lastx = ifp->if_lastex;
4711 ep = xfs_iext_get_ext(ifp, lastx); 4671 ep = xfs_iext_get_ext(ifp, lastx);
4712 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4672 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4713 xfs_bmbt_get_all(ep, &got); 4673 xfs_bmbt_get_all(ep, &got);
@@ -4803,13 +4763,12 @@ xfs_bmapi(
4803 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) 4763 mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
4804 ? XFS_EXT_NORM 4764 ? XFS_EXT_NORM
4805 : XFS_EXT_UNWRITTEN; 4765 : XFS_EXT_UNWRITTEN;
4806 error = xfs_bmap_add_extent(ip, lastx, &cur, mval, 4766 error = xfs_bmap_add_extent(ip, &lastx, &cur, mval,
4807 firstblock, flist, &tmp_logflags, 4767 firstblock, flist, &tmp_logflags,
4808 whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); 4768 whichfork);
4809 logflags |= tmp_logflags; 4769 logflags |= tmp_logflags;
4810 if (error) 4770 if (error)
4811 goto error0; 4771 goto error0;
4812 lastx = ifp->if_lastex;
4813 ep = xfs_iext_get_ext(ifp, lastx); 4772 ep = xfs_iext_get_ext(ifp, lastx);
4814 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 4773 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
4815 xfs_bmbt_get_all(ep, &got); 4774 xfs_bmbt_get_all(ep, &got);
@@ -4868,14 +4827,14 @@ xfs_bmapi(
4868 /* 4827 /*
4869 * Else go on to the next record. 4828 * Else go on to the next record.
4870 */ 4829 */
4871 ep = xfs_iext_get_ext(ifp, ++lastx);
4872 prev = got; 4830 prev = got;
4873 if (lastx >= nextents) 4831 if (++lastx < nextents) {
4874 eof = 1; 4832 ep = xfs_iext_get_ext(ifp, lastx);
4875 else
4876 xfs_bmbt_get_all(ep, &got); 4833 xfs_bmbt_get_all(ep, &got);
4834 } else {
4835 eof = 1;
4836 }
4877 } 4837 }
4878 ifp->if_lastex = lastx;
4879 *nmap = n; 4838 *nmap = n;
4880 /* 4839 /*
4881 * Transform from btree to extents, give it cur. 4840 * Transform from btree to extents, give it cur.
@@ -4984,7 +4943,6 @@ xfs_bmapi_single(
4984 ASSERT(!isnullstartblock(got.br_startblock)); 4943 ASSERT(!isnullstartblock(got.br_startblock));
4985 ASSERT(bno < got.br_startoff + got.br_blockcount); 4944 ASSERT(bno < got.br_startoff + got.br_blockcount);
4986 *fsb = got.br_startblock + (bno - got.br_startoff); 4945 *fsb = got.br_startblock + (bno - got.br_startoff);
4987 ifp->if_lastex = lastx;
4988 return 0; 4946 return 0;
4989} 4947}
4990 4948
@@ -5026,7 +4984,6 @@ xfs_bunmapi(
5026 int tmp_logflags; /* partial logging flags */ 4984 int tmp_logflags; /* partial logging flags */
5027 int wasdel; /* was a delayed alloc extent */ 4985 int wasdel; /* was a delayed alloc extent */
5028 int whichfork; /* data or attribute fork */ 4986 int whichfork; /* data or attribute fork */
5029 int rsvd; /* OK to allocate reserved blocks */
5030 xfs_fsblock_t sum; 4987 xfs_fsblock_t sum;
5031 4988
5032 trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); 4989 trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
@@ -5044,7 +5001,7 @@ xfs_bunmapi(
5044 mp = ip->i_mount; 5001 mp = ip->i_mount;
5045 if (XFS_FORCED_SHUTDOWN(mp)) 5002 if (XFS_FORCED_SHUTDOWN(mp))
5046 return XFS_ERROR(EIO); 5003 return XFS_ERROR(EIO);
5047 rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; 5004
5048 ASSERT(len > 0); 5005 ASSERT(len > 0);
5049 ASSERT(nexts >= 0); 5006 ASSERT(nexts >= 0);
5050 ASSERT(ifp->if_ext_max == 5007 ASSERT(ifp->if_ext_max ==
@@ -5160,9 +5117,9 @@ xfs_bunmapi(
5160 del.br_blockcount = mod; 5117 del.br_blockcount = mod;
5161 } 5118 }
5162 del.br_state = XFS_EXT_UNWRITTEN; 5119 del.br_state = XFS_EXT_UNWRITTEN;
5163 error = xfs_bmap_add_extent(ip, lastx, &cur, &del, 5120 error = xfs_bmap_add_extent(ip, &lastx, &cur, &del,
5164 firstblock, flist, &logflags, 5121 firstblock, flist, &logflags,
5165 XFS_DATA_FORK, 0); 5122 XFS_DATA_FORK);
5166 if (error) 5123 if (error)
5167 goto error0; 5124 goto error0;
5168 goto nodelete; 5125 goto nodelete;
@@ -5188,9 +5145,12 @@ xfs_bunmapi(
5188 */ 5145 */
5189 ASSERT(bno >= del.br_blockcount); 5146 ASSERT(bno >= del.br_blockcount);
5190 bno -= del.br_blockcount; 5147 bno -= del.br_blockcount;
5191 if (bno < got.br_startoff) { 5148 if (got.br_startoff > bno) {
5192 if (--lastx >= 0) 5149 if (--lastx >= 0) {
5193 xfs_bmbt_get_all(--ep, &got); 5150 ep = xfs_iext_get_ext(ifp,
5151 lastx);
5152 xfs_bmbt_get_all(ep, &got);
5153 }
5194 } 5154 }
5195 continue; 5155 continue;
5196 } else if (del.br_state == XFS_EXT_UNWRITTEN) { 5156 } else if (del.br_state == XFS_EXT_UNWRITTEN) {
@@ -5214,18 +5174,19 @@ xfs_bunmapi(
5214 prev.br_startoff = start; 5174 prev.br_startoff = start;
5215 } 5175 }
5216 prev.br_state = XFS_EXT_UNWRITTEN; 5176 prev.br_state = XFS_EXT_UNWRITTEN;
5217 error = xfs_bmap_add_extent(ip, lastx - 1, &cur, 5177 lastx--;
5178 error = xfs_bmap_add_extent(ip, &lastx, &cur,
5218 &prev, firstblock, flist, &logflags, 5179 &prev, firstblock, flist, &logflags,
5219 XFS_DATA_FORK, 0); 5180 XFS_DATA_FORK);
5220 if (error) 5181 if (error)
5221 goto error0; 5182 goto error0;
5222 goto nodelete; 5183 goto nodelete;
5223 } else { 5184 } else {
5224 ASSERT(del.br_state == XFS_EXT_NORM); 5185 ASSERT(del.br_state == XFS_EXT_NORM);
5225 del.br_state = XFS_EXT_UNWRITTEN; 5186 del.br_state = XFS_EXT_UNWRITTEN;
5226 error = xfs_bmap_add_extent(ip, lastx, &cur, 5187 error = xfs_bmap_add_extent(ip, &lastx, &cur,
5227 &del, firstblock, flist, &logflags, 5188 &del, firstblock, flist, &logflags,
5228 XFS_DATA_FORK, 0); 5189 XFS_DATA_FORK);
5229 if (error) 5190 if (error)
5230 goto error0; 5191 goto error0;
5231 goto nodelete; 5192 goto nodelete;
@@ -5240,13 +5201,13 @@ xfs_bunmapi(
5240 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); 5201 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
5241 do_div(rtexts, mp->m_sb.sb_rextsize); 5202 do_div(rtexts, mp->m_sb.sb_rextsize);
5242 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, 5203 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
5243 (int64_t)rtexts, rsvd); 5204 (int64_t)rtexts, 0);
5244 (void)xfs_trans_reserve_quota_nblks(NULL, 5205 (void)xfs_trans_reserve_quota_nblks(NULL,
5245 ip, -((long)del.br_blockcount), 0, 5206 ip, -((long)del.br_blockcount), 0,
5246 XFS_QMOPT_RES_RTBLKS); 5207 XFS_QMOPT_RES_RTBLKS);
5247 } else { 5208 } else {
5248 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 5209 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
5249 (int64_t)del.br_blockcount, rsvd); 5210 (int64_t)del.br_blockcount, 0);
5250 (void)xfs_trans_reserve_quota_nblks(NULL, 5211 (void)xfs_trans_reserve_quota_nblks(NULL,
5251 ip, -((long)del.br_blockcount), 0, 5212 ip, -((long)del.br_blockcount), 0,
5252 XFS_QMOPT_RES_REGBLKS); 5213 XFS_QMOPT_RES_REGBLKS);
@@ -5277,31 +5238,29 @@ xfs_bunmapi(
5277 error = XFS_ERROR(ENOSPC); 5238 error = XFS_ERROR(ENOSPC);
5278 goto error0; 5239 goto error0;
5279 } 5240 }
5280 error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, 5241 error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
5281 &tmp_logflags, whichfork, rsvd); 5242 &tmp_logflags, whichfork);
5282 logflags |= tmp_logflags; 5243 logflags |= tmp_logflags;
5283 if (error) 5244 if (error)
5284 goto error0; 5245 goto error0;
5285 bno = del.br_startoff - 1; 5246 bno = del.br_startoff - 1;
5286nodelete: 5247nodelete:
5287 lastx = ifp->if_lastex;
5288 /* 5248 /*
5289 * If not done go on to the next (previous) record. 5249 * If not done go on to the next (previous) record.
5290 * Reset ep in case the extents array was re-alloced.
5291 */ 5250 */
5292 ep = xfs_iext_get_ext(ifp, lastx);
5293 if (bno != (xfs_fileoff_t)-1 && bno >= start) { 5251 if (bno != (xfs_fileoff_t)-1 && bno >= start) {
5294 if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) || 5252 if (lastx >= 0) {
5295 xfs_bmbt_get_startoff(ep) > bno) { 5253 ep = xfs_iext_get_ext(ifp, lastx);
5296 if (--lastx >= 0) 5254 if (xfs_bmbt_get_startoff(ep) > bno) {
5297 ep = xfs_iext_get_ext(ifp, lastx); 5255 if (--lastx >= 0)
5298 } 5256 ep = xfs_iext_get_ext(ifp,
5299 if (lastx >= 0) 5257 lastx);
5258 }
5300 xfs_bmbt_get_all(ep, &got); 5259 xfs_bmbt_get_all(ep, &got);
5260 }
5301 extno++; 5261 extno++;
5302 } 5262 }
5303 } 5263 }
5304 ifp->if_lastex = lastx;
5305 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; 5264 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
5306 ASSERT(ifp->if_ext_max == 5265 ASSERT(ifp->if_ext_max ==
5307 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); 5266 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
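The xfs_bmap.c hunks above make the extent-index argument a pointer (xfs_extnum_t *idx) in xfs_bmap_add_extent_hole_delay(), xfs_bmap_add_extent_hole_real() and xfs_bmap_del_extent(): the helpers now adjust the caller's index in place when they merge, insert or remove records, so xfs_bmapi() and xfs_bunmapi() no longer have to reload ifp->if_lastex after each call. They also drop the rsvd/XFS_BMAPI_RSVBLOCKS plumbing and pass 0 as the reserved-blocks argument of the superblock counter updates. Below is a minimal userspace sketch of the by-reference cursor pattern; the struct, names and helper are made up for illustration and are not the kernel code.

/*
 * Sketch of the by-reference index pattern: instead of caching the
 * "last used" slot in the container (if_lastex), the helper takes a
 * pointer to the caller's cursor and adjusts it when it merges records.
 * All names and types here are hypothetical.
 */
#include <assert.h>
#include <stdio.h>

struct rec { int start, len; };

struct fork {
	struct rec	ext[16];	/* extent records */
	int		nextents;	/* number of valid records */
};

/* Merge rec[*idx] into its left neighbour and leave *idx on the result. */
static void merge_left(struct fork *fp, int *idx)
{
	assert(*idx > 0 && *idx < fp->nextents);

	--*idx;					/* caller's cursor moves left */
	fp->ext[*idx].len += fp->ext[*idx + 1].len;

	for (int i = *idx + 1; i < fp->nextents - 1; i++)	/* close the gap */
		fp->ext[i] = fp->ext[i + 1];
	fp->nextents--;
}

int main(void)
{
	struct fork f = { .ext = { {0, 4}, {4, 2}, {10, 3} }, .nextents = 3 };
	int idx = 1;				/* cursor owned by the caller */

	merge_left(&f, &idx);
	printf("idx=%d start=%d len=%d nextents=%d\n",
	       idx, f.ext[idx].start, f.ext[idx].len, f.nextents);
	return 0;
}

Keeping the cursor with the caller removes the stale-index problem the old code worked around by re-reading if_lastex after every call.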
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 3651191daea1..c62234bde053 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -69,7 +69,6 @@ typedef struct xfs_bmap_free
69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ 69#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ 70#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
71#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */ 71#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */
72#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */
73#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */ 72#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */
74#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */ 73#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
75 /* combine contig. space */ 74 /* combine contig. space */
@@ -87,7 +86,6 @@ typedef struct xfs_bmap_free
87 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ 86 { XFS_BMAPI_ENTIRE, "ENTIRE" }, \
88 { XFS_BMAPI_METADATA, "METADATA" }, \ 87 { XFS_BMAPI_METADATA, "METADATA" }, \
89 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ 88 { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
90 { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \
91 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ 89 { XFS_BMAPI_PREALLOC, "PREALLOC" }, \
92 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ 90 { XFS_BMAPI_IGSTATE, "IGSTATE" }, \
93 { XFS_BMAPI_CONTIG, "CONTIG" }, \ 91 { XFS_BMAPI_CONTIG, "CONTIG" }, \
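With the rsvd argument gone from the callers above, the XFS_BMAPI_RSVBLOCKS definition and its entry in the flag-name table used for tracing are removed together in this hunk. As a standalone illustration of that { mask, "NAME" } table convention (deliberately simplified, not the kernel's tracing machinery):

/*
 * Each flag has a matching { mask, "NAME" } entry so trace output can
 * print symbolic names; dropping a flag therefore touches both the
 * #define list and the table.  Values below are copied from the header,
 * the printer is a made-up stand-in.
 */
#include <stdio.h>

#define BMAPI_ENTIRE	0x004
#define BMAPI_METADATA	0x008
#define BMAPI_PREALLOC	0x040

static const struct { unsigned mask; const char *name; } bmapi_names[] = {
	{ BMAPI_ENTIRE,   "ENTIRE"   },
	{ BMAPI_METADATA, "METADATA" },
	{ BMAPI_PREALLOC, "PREALLOC" },
};

static void print_bmapi_flags(unsigned flags)
{
	for (size_t i = 0; i < sizeof(bmapi_names) / sizeof(bmapi_names[0]); i++)
		if (flags & bmapi_names[i].mask)
			printf("%s ", bmapi_names[i].name);
	printf("\n");
}

int main(void)
{
	print_bmapi_flags(BMAPI_ENTIRE | BMAPI_PREALLOC);
	return 0;
}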
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c8e3349c287c..a098a20ca63e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -920,7 +920,6 @@ xfs_iread_extents(
920 /* 920 /*
921 * We know that the size is valid (it's checked in iformat_btree) 921 * We know that the size is valid (it's checked in iformat_btree)
922 */ 922 */
923 ifp->if_lastex = NULLEXTNUM;
924 ifp->if_bytes = ifp->if_real_bytes = 0; 923 ifp->if_bytes = ifp->if_real_bytes = 0;
925 ifp->if_flags |= XFS_IFEXTENTS; 924 ifp->if_flags |= XFS_IFEXTENTS;
926 xfs_iext_add(ifp, 0, nextents); 925 xfs_iext_add(ifp, 0, nextents);
@@ -2558,12 +2557,9 @@ xfs_iflush_fork(
2558 case XFS_DINODE_FMT_EXTENTS: 2557 case XFS_DINODE_FMT_EXTENTS:
2559 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2558 ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2560 !(iip->ili_format.ilf_fields & extflag[whichfork])); 2559 !(iip->ili_format.ilf_fields & extflag[whichfork]));
2561 ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) ||
2562 (ifp->if_bytes == 0));
2563 ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) ||
2564 (ifp->if_bytes > 0));
2565 if ((iip->ili_format.ilf_fields & extflag[whichfork]) && 2560 if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
2566 (ifp->if_bytes > 0)) { 2561 (ifp->if_bytes > 0)) {
2562 ASSERT(xfs_iext_get_ext(ifp, 0));
2567 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2563 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2568 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2564 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2569 whichfork); 2565 whichfork);
@@ -3112,6 +3108,8 @@ xfs_iext_get_ext(
3112 xfs_extnum_t idx) /* index of target extent */ 3108 xfs_extnum_t idx) /* index of target extent */
3113{ 3109{
3114 ASSERT(idx >= 0); 3110 ASSERT(idx >= 0);
3111 ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3112
3115 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { 3113 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
3116 return ifp->if_u1.if_ext_irec->er_extbuf; 3114 return ifp->if_u1.if_ext_irec->er_extbuf;
3117 } else if (ifp->if_flags & XFS_IFEXTIREC) { 3115 } else if (ifp->if_flags & XFS_IFEXTIREC) {
@@ -3191,7 +3189,6 @@ xfs_iext_add(
3191 } 3189 }
3192 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; 3190 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3193 ifp->if_real_bytes = 0; 3191 ifp->if_real_bytes = 0;
3194 ifp->if_lastex = nextents + ext_diff;
3195 } 3192 }
3196 /* 3193 /*
3197 * Otherwise use a linear (direct) extent list. 3194 * Otherwise use a linear (direct) extent list.
@@ -3886,8 +3883,10 @@ xfs_iext_idx_to_irec(
3886 xfs_extnum_t page_idx = *idxp; /* extent index in target list */ 3883 xfs_extnum_t page_idx = *idxp; /* extent index in target list */
3887 3884
3888 ASSERT(ifp->if_flags & XFS_IFEXTIREC); 3885 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3889 ASSERT(page_idx >= 0 && page_idx <= 3886 ASSERT(page_idx >= 0);
3890 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); 3887 ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3888 ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
3889
3891 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; 3890 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3892 erp_idx = 0; 3891 erp_idx = 0;
3893 low = 0; 3892 low = 0;
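The xfs_inode.c hunks tighten the extent-array assertions now that nothing caches if_lastex: xfs_iext_get_ext() insists the index lies strictly within if_bytes, while xfs_iext_idx_to_irec() accepts an index one past the end only when the caller is growing the array (the realloc case); the if_lastex field itself disappears from the fork in the xfs_inode.h hunk below. A userspace sketch of the same assertion split, with hypothetical types and names:

/*
 * A plain lookup must stay strictly inside the array; an index equal to
 * the current count is only tolerated when the caller is about to insert.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct ext { long start, count; };

struct extlist {
	struct ext	*recs;
	size_t		nrecs;
};

static struct ext *ext_get(struct extlist *el, size_t idx)
{
	assert(idx < el->nrecs);		/* strict: record must exist */
	return &el->recs[idx];
}

static size_t ext_locate(struct extlist *el, size_t idx, int grow)
{
	assert(idx <= el->nrecs);		/* may point one past the end... */
	assert(idx < el->nrecs || grow);	/* ...but only when inserting    */
	return idx;
}

int main(void)
{
	struct ext recs[2] = { {0, 8}, {8, 4} };
	struct extlist el = { recs, 2 };

	printf("start=%ld\n", ext_get(&el, 1)->start);
	printf("insert slot=%zu\n", ext_locate(&el, 2, 1));
	return 0;
}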
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ff4e2a30227d..3ae6d58e5473 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -67,7 +67,6 @@ typedef struct xfs_ifork {
67 short if_broot_bytes; /* bytes allocated for root */ 67 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */ 68 unsigned char if_flags; /* per-fork flags */
69 unsigned char if_ext_max; /* max # of extent records */ 69 unsigned char if_ext_max; /* max # of extent records */
70 xfs_extnum_t if_lastex; /* last if_extents used */
71 union { 70 union {
72 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ 71 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
73 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ 72 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 7d56e88a3f0e..c7755d5a5fbe 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -29,6 +29,7 @@
29#include "xfs_mount.h" 29#include "xfs_mount.h"
30#include "xfs_error.h" 30#include "xfs_error.h"
31#include "xfs_alloc.h" 31#include "xfs_alloc.h"
32#include "xfs_discard.h"
32 33
33/* 34/*
34 * Perform initial CIL structure initialisation. If the CIL is not 35 * Perform initial CIL structure initialisation. If the CIL is not
@@ -361,18 +362,28 @@ xlog_cil_committed(
361 int abort) 362 int abort)
362{ 363{
363 struct xfs_cil_ctx *ctx = args; 364 struct xfs_cil_ctx *ctx = args;
365 struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
364 366
365 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, 367 xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
366 ctx->start_lsn, abort); 368 ctx->start_lsn, abort);
367 369
368 xfs_alloc_busy_sort(&ctx->busy_extents); 370 xfs_alloc_busy_sort(&ctx->busy_extents);
369 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, &ctx->busy_extents); 371 xfs_alloc_busy_clear(mp, &ctx->busy_extents,
372 (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
370 373
371 spin_lock(&ctx->cil->xc_cil_lock); 374 spin_lock(&ctx->cil->xc_cil_lock);
372 list_del(&ctx->committing); 375 list_del(&ctx->committing);
373 spin_unlock(&ctx->cil->xc_cil_lock); 376 spin_unlock(&ctx->cil->xc_cil_lock);
374 377
375 xlog_cil_free_logvec(ctx->lv_chain); 378 xlog_cil_free_logvec(ctx->lv_chain);
379
380 if (!list_empty(&ctx->busy_extents)) {
381 ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
382
383 xfs_discard_extents(mp, &ctx->busy_extents);
384 xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
385 }
386
376 kmem_free(ctx); 387 kmem_free(ctx);
377} 388}
378 389
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 19af0ab0d0c6..3d68bb267c5f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -224,6 +224,7 @@ typedef struct xfs_mount {
224#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 224#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
225 operations, typically for 225 operations, typically for
226 disk errors in metadata */ 226 disk errors in metadata */
227#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
227#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to 228#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
228 user */ 229 user */
229#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment 230#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d1f24858ccc4..7c7bc2b786bd 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -609,7 +609,7 @@ xfs_trans_free(
609 struct xfs_trans *tp) 609 struct xfs_trans *tp)
610{ 610{
611 xfs_alloc_busy_sort(&tp->t_busy); 611 xfs_alloc_busy_sort(&tp->t_busy);
612 xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy); 612 xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false);
613 613
614 atomic_dec(&tp->t_mountp->m_active_trans); 614 atomic_dec(&tp->t_mountp->m_active_trans);
615 xfs_trans_free_dqinfo(tp); 615 xfs_trans_free_dqinfo(tp);