author    J. Bruce Fields <bfields@redhat.com>	2014-11-19 12:06:30 -0500
committer J. Bruce Fields <bfields@redhat.com>	2014-11-19 12:06:30 -0500
commit    56429e9b3be567a173bd05f5594faf8522c34d3a (patch)
tree      d218d430ed992cdfa42da084bf36e5aa3c2ecb26 /fs
parent    5b095e99928cc13332d364f7cca7a9ca684369b4 (diff)
parent    093a1468b6edb0e568be7311b8d2228d205702db (diff)
merge nfs bugfixes into nfsd for-3.19 branch
In addition to nfsd bugfixes, there are some fixes in -rc5 for client bugs that can interfere with my testing.
Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 1
-rw-r--r--  fs/Makefile | 1
-rw-r--r--  fs/block_dev.c | 3
-rw-r--r--  fs/btrfs/ctree.h | 2
-rw-r--r--  fs/btrfs/disk-io.c | 43
-rw-r--r--  fs/btrfs/extent-tree.c | 18
-rw-r--r--  fs/btrfs/file-item.c | 2
-rw-r--r--  fs/btrfs/ioctl.c | 20
-rw-r--r--  fs/btrfs/super.c | 1
-rw-r--r--  fs/btrfs/tree-log.c | 2
-rw-r--r--  fs/buffer.c | 86
-rw-r--r--  fs/ceph/caps.c | 2
-rw-r--r--  fs/dcache.c | 2
-rw-r--r--  fs/ecryptfs/main.c | 7
-rw-r--r--  fs/exofs/Kbuild | 2
-rw-r--r--  fs/exofs/common.h | 2
-rw-r--r--  fs/exofs/dir.c | 2
-rw-r--r--  fs/exofs/exofs.h | 2
-rw-r--r--  fs/exofs/file.c | 2
-rw-r--r--  fs/exofs/inode.c | 2
-rw-r--r--  fs/exofs/namei.c | 2
-rw-r--r--  fs/exofs/ore.c | 4
-rw-r--r--  fs/exofs/ore_raid.c | 2
-rw-r--r--  fs/exofs/ore_raid.h | 2
-rw-r--r--  fs/exofs/super.c | 2
-rw-r--r--  fs/exofs/symlink.c | 2
-rw-r--r--  fs/exofs/sys.c | 2
-rw-r--r--  fs/ext3/super.c | 7
-rw-r--r--  fs/ext4/balloc.c | 15
-rw-r--r--  fs/ext4/bitmap.c | 12
-rw-r--r--  fs/ext4/dir.c | 8
-rw-r--r--  fs/ext4/ext4.h | 50
-rw-r--r--  fs/ext4/ext4_extents.h | 1
-rw-r--r--  fs/ext4/ext4_jbd2.c | 4
-rw-r--r--  fs/ext4/ext4_jbd2.h | 6
-rw-r--r--  fs/ext4/extents.c | 626
-rw-r--r--  fs/ext4/extents_status.c | 200
-rw-r--r--  fs/ext4/extents_status.h | 13
-rw-r--r--  fs/ext4/file.c | 2
-rw-r--r--  fs/ext4/ialloc.c | 7
-rw-r--r--  fs/ext4/indirect.c | 86
-rw-r--r--  fs/ext4/inline.c | 7
-rw-r--r--  fs/ext4/inode.c | 140
-rw-r--r--  fs/ext4/ioctl.c | 13
-rw-r--r--  fs/ext4/mballoc.c | 15
-rw-r--r--  fs/ext4/migrate.c | 11
-rw-r--r--  fs/ext4/mmp.c | 6
-rw-r--r--  fs/ext4/move_extent.c | 1068
-rw-r--r--  fs/ext4/namei.c | 361
-rw-r--r--  fs/ext4/resize.c | 5
-rw-r--r--  fs/ext4/super.c | 262
-rw-r--r--  fs/ext4/xattr.c | 44
-rw-r--r--  fs/internal.h | 7
-rw-r--r--  fs/isofs/inode.c | 24
-rw-r--r--  fs/isofs/namei.c | 22
-rw-r--r--  fs/jbd/journal.c | 2
-rw-r--r--  fs/jbd/revoke.c | 7
-rw-r--r--  fs/jbd2/checkpoint.c | 334
-rw-r--r--  fs/jbd2/journal.c | 18
-rw-r--r--  fs/jbd2/recovery.c | 1
-rw-r--r--  fs/jbd2/revoke.c | 10
-rw-r--r--  fs/namei.c | 46
-rw-r--r--  fs/namespace.c | 27
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 2
-rw-r--r--  fs/nfs/blocklayout/rpc_pipefs.c | 14
-rw-r--r--  fs/nfs/delegation.c | 25
-rw-r--r--  fs/nfs/delegation.h | 1
-rw-r--r--  fs/nfs/dir.c | 1
-rw-r--r--  fs/nfs/direct.c | 1
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 3
-rw-r--r--  fs/nfs/inode.c | 2
-rw-r--r--  fs/nfs/netns.h | 1
-rw-r--r--  fs/nfs/nfs4proc.c | 95
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 2
-rw-r--r--  fs/nfs/objlayout/objlayout.c | 2
-rw-r--r--  fs/nfs/objlayout/objlayout.h | 2
-rw-r--r--  fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 2
-rw-r--r--  fs/nfs/write.c | 2
-rw-r--r--  fs/nfsd/nfsd.h | 9
-rw-r--r--  fs/notify/fsnotify.c | 36
-rw-r--r--  fs/notify/fsnotify.h | 4
-rw-r--r--  fs/notify/inode_mark.c | 25
-rw-r--r--  fs/notify/mark.c | 36
-rw-r--r--  fs/notify/vfsmount_mark.c | 8
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 2
-rw-r--r--  fs/ocfs2/namei.c | 2
-rw-r--r--  fs/open.c | 23
-rw-r--r--  fs/overlayfs/Kconfig | 10
-rw-r--r--  fs/overlayfs/Makefile | 7
-rw-r--r--  fs/overlayfs/copy_up.c | 414
-rw-r--r--  fs/overlayfs/dir.c | 921
-rw-r--r--  fs/overlayfs/inode.c | 425
-rw-r--r--  fs/overlayfs/overlayfs.h | 191
-rw-r--r--  fs/overlayfs/readdir.c | 593
-rw-r--r--  fs/overlayfs/super.c | 796
-rw-r--r--  fs/quota/dquot.c | 2
-rw-r--r--  fs/splice.c | 1
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 72
-rw-r--r--  fs/xfs/xfs_itable.c | 250
-rw-r--r--  fs/xfs/xfs_itable.h | 16
100 files changed, 5291 insertions(+), 2392 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index db5dc1598716..664991afe0c0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -67,6 +67,7 @@ source "fs/quota/Kconfig"
 
 source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
+source "fs/overlayfs/Kconfig"
 
 menu "Caches"
 
diff --git a/fs/Makefile b/fs/Makefile
index 90c88529892b..34a1b9dea6dd 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -104,6 +104,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/
 obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
 obj-$(CONFIG_ADFS_FS)		+= adfs/
 obj-$(CONFIG_FUSE_FS)		+= fuse/
+obj-$(CONFIG_OVERLAYFS_FS)	+= overlayfs/
 obj-$(CONFIG_UDF_FS)		+= udf/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
 obj-$(CONFIG_OMFS_FS)		+= omfs/
diff --git a/fs/block_dev.c b/fs/block_dev.c
index cc9d4114cda0..1d9c9f3754f8 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1585,7 +1585,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 }
 EXPORT_SYMBOL_GPL(blkdev_write_iter);
 
-static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
+ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *bd_inode = file->f_mapping->host;
@@ -1599,6 +1599,7 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	iov_iter_truncate(to, size);
 	return generic_file_read_iter(iocb, to);
 }
+EXPORT_SYMBOL_GPL(blkdev_read_iter);
 
 /*
  * Try to release a page associated with block device when the system
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d557264ee974..fe69edda11fb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3276,7 +3276,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_async_run_delayed_refs(struct btrfs_root *root,
 				 unsigned long count, int wait);
-int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 bytenr,
 			     u64 offset, int metadata, u64 *refs, u64 *flags);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1ad0f47ac850..1bf9f897065d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3817,19 +3817,19 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	struct btrfs_super_block *sb = fs_info->super_copy;
 	int ret = 0;
 
-	if (sb->root_level > BTRFS_MAX_LEVEL) {
-		printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n",
-				sb->root_level, BTRFS_MAX_LEVEL);
+	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",
+				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
 		ret = -EINVAL;
 	}
-	if (sb->chunk_root_level > BTRFS_MAX_LEVEL) {
-		printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n",
-				sb->chunk_root_level, BTRFS_MAX_LEVEL);
+	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: chunk_root level too big: %d >= %d\n",
+				btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
 		ret = -EINVAL;
 	}
-	if (sb->log_root_level > BTRFS_MAX_LEVEL) {
-		printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n",
-				sb->log_root_level, BTRFS_MAX_LEVEL);
+	if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: log_root level too big: %d >= %d\n",
+				btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
 		ret = -EINVAL;
 	}
 
@@ -3837,15 +3837,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	 * The common minimum, we don't know if we can trust the nodesize/sectorsize
 	 * items yet, they'll be verified later. Issue just a warning.
 	 */
-	if (!IS_ALIGNED(sb->root, 4096))
+	if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
 		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
 				sb->root);
-	if (!IS_ALIGNED(sb->chunk_root, 4096))
+	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
 		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
 				sb->chunk_root);
-	if (!IS_ALIGNED(sb->log_root, 4096))
+	if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
 		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
-				sb->log_root);
+				btrfs_super_log_root(sb));
 
 	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
 		printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
@@ -3857,13 +3857,13 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
 	 * done later
 	 */
-	if (sb->num_devices > (1UL << 31))
+	if (btrfs_super_num_devices(sb) > (1UL << 31))
 		printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
-				sb->num_devices);
+				btrfs_super_num_devices(sb));
 
-	if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) {
+	if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
 		printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
-				sb->bytenr, BTRFS_SUPER_INFO_OFFSET);
+				btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
 		ret = -EINVAL;
 	}
 
@@ -3871,14 +3871,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	 * The generation is a global counter, we'll trust it more than the others
 	 * but it's still possible that it's the one that's wrong.
 	 */
-	if (sb->generation < sb->chunk_root_generation)
+	if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
 		printk(KERN_WARNING
 			"BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
-			sb->generation, sb->chunk_root_generation);
-	if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1)
+			btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb));
+	if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
+	    && btrfs_super_cache_generation(sb) != (u64)-1)
 		printk(KERN_WARNING
 			"BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
-			sb->generation, sb->cache_generation);
+			btrfs_super_generation(sb), btrfs_super_cache_generation(sb));
 
 	return ret;
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d56589571012..47c1ba141082 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -710,8 +710,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 	rcu_read_unlock();
 }
 
-/* simple helper to search for an existing extent at a given offset */
-int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
+/* simple helper to search for an existing data extent at a given offset */
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
 {
 	int ret;
 	struct btrfs_key key;
@@ -726,12 +726,6 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 	key.type = BTRFS_EXTENT_ITEM_KEY;
 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 				0, 0);
-	if (ret > 0) {
-		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-		if (key.objectid == start &&
-		    key.type == BTRFS_METADATA_ITEM_KEY)
-			ret = 0;
-	}
 	btrfs_free_path(path);
 	return ret;
 }
@@ -786,7 +780,6 @@ search_again:
 	else
 		key.type = BTRFS_EXTENT_ITEM_KEY;
 
-again:
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
 				&key, path, 0, 0);
 	if (ret < 0)
@@ -802,13 +795,6 @@ again:
 			    key.offset == root->nodesize)
 				ret = 0;
 		}
-		if (ret) {
-			key.objectid = bytenr;
-			key.type = BTRFS_EXTENT_ITEM_KEY;
-			key.offset = root->nodesize;
-			btrfs_release_path(path);
-			goto again;
-		}
 	}
 
 	if (ret == 0) {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 783a94355efd..84a2d1868271 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -413,7 +413,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	ret = 0;
 fail:
 	while (ret < 0 && !list_empty(&tmplist)) {
-		sums = list_entry(&tmplist, struct btrfs_ordered_sum, list);
+		sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
 		list_del(&sums->list);
 		kfree(sums);
 	}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8d2b76e29d3b..4399f0c3a4ce 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -765,23 +765,6 @@ out:
 	return ret;
 }
 
-/* copy of check_sticky in fs/namei.c()
-* It's inline, so penalty for filesystems that don't use sticky bit is
-* minimal.
-*/
-static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
-{
-	kuid_t fsuid = current_fsuid();
-
-	if (!(dir->i_mode & S_ISVTX))
-		return 0;
-	if (uid_eq(inode->i_uid, fsuid))
-		return 0;
-	if (uid_eq(dir->i_uid, fsuid))
-		return 0;
-	return !capable(CAP_FOWNER);
-}
-
 /* copy of may_delete in fs/namei.c()
  *	Check whether we can remove a link victim from directory dir, check
  *	whether the type of victim is right.
@@ -817,8 +800,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
 		return error;
 	if (IS_APPEND(dir))
 		return -EPERM;
-	if (btrfs_check_sticky(dir, victim->d_inode)||
-		IS_APPEND(victim->d_inode)||
+	if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) ||
 	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
 		return -EPERM;
 	if (isdir) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a2b97ef10317..54bd91ece35b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2151,6 +2151,7 @@ static void __exit exit_btrfs_fs(void)
 	extent_map_exit();
 	extent_io_exit();
 	btrfs_interface_exit();
+	btrfs_end_io_wq_exit();
 	unregister_filesystem(&btrfs_fs_type);
 	btrfs_exit_sysfs();
 	btrfs_cleanup_fs_uuids();
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1475979e5718..286213cec861 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -672,7 +672,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 		 * is this extent already allocated in the extent
 		 * allocation tree?  If so, just add a reference
 		 */
-		ret = btrfs_lookup_extent(root, ins.objectid,
+		ret = btrfs_lookup_data_extent(root, ins.objectid,
 						ins.offset);
 		if (ret == 0) {
 			ret = btrfs_inc_extent_ref(trans, root,
diff --git a/fs/buffer.c b/fs/buffer.c
index 9614adc7e754..20805db2c987 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -128,21 +128,15 @@ __clear_page_buffers(struct page *page)
 	page_cache_release(page);
 }
 
-
-static int quiet_error(struct buffer_head *bh)
-{
-	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
-		return 0;
-	return 1;
-}
-
-
-static void buffer_io_error(struct buffer_head *bh)
+static void buffer_io_error(struct buffer_head *bh, char *msg)
 {
 	char b[BDEVNAME_SIZE];
-	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
+
+	if (!test_bit(BH_Quiet, &bh->b_state))
+		printk_ratelimited(KERN_ERR
+			"Buffer I/O error on dev %s, logical block %llu%s\n",
 			bdevname(bh->b_bdev, b),
-			(unsigned long long)bh->b_blocknr);
+			(unsigned long long)bh->b_blocknr, msg);
 }
 
 /*
@@ -177,17 +171,10 @@ EXPORT_SYMBOL(end_buffer_read_sync);
 
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
-	char b[BDEVNAME_SIZE];
-
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!quiet_error(bh)) {
-			buffer_io_error(bh);
-			printk(KERN_WARNING "lost page write due to "
-					"I/O error on %s\n",
-				       bdevname(bh->b_bdev, b));
-		}
+		buffer_io_error(bh, ", lost sync page write");
 		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
 	}
@@ -304,8 +291,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		set_buffer_uptodate(bh);
 	} else {
 		clear_buffer_uptodate(bh);
-		if (!quiet_error(bh))
-			buffer_io_error(bh);
+		buffer_io_error(bh, ", async page read");
 		SetPageError(page);
 	}
 
@@ -353,7 +339,6 @@ still_busy:
  */
 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
-	char b[BDEVNAME_SIZE];
 	unsigned long flags;
 	struct buffer_head *first;
 	struct buffer_head *tmp;
@@ -365,12 +350,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!quiet_error(bh)) {
-			buffer_io_error(bh);
-			printk(KERN_WARNING "lost page write due to "
-			       "I/O error on %s\n",
-			       bdevname(bh->b_bdev, b));
-		}
+		buffer_io_error(bh, ", lost async page write");
 		set_bit(AS_EIO, &page->mapping->flags);
 		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
@@ -993,7 +973,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
  */
 static int
 grow_dev_page(struct block_device *bdev, sector_t block,
-		pgoff_t index, int size, int sizebits)
+	      pgoff_t index, int size, int sizebits, gfp_t gfp)
 {
 	struct inode *inode = bdev->bd_inode;
 	struct page *page;
@@ -1002,8 +982,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	int ret = 0;		/* Will call free_more_memory() */
 	gfp_t gfp_mask;
 
-	gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
-	gfp_mask |= __GFP_MOVABLE;
+	gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
+
 	/*
 	 * XXX: __getblk_slow() can not really deal with failure and
 	 * will endlessly loop on improvised global reclaim.  Prefer
@@ -1060,7 +1040,7 @@ failed:
  * that page was dirty, the buffers are set dirty also.
  */
 static int
-grow_buffers(struct block_device *bdev, sector_t block, int size)
+grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
 {
 	pgoff_t index;
 	int sizebits;
@@ -1087,11 +1067,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
 	}
 
 	/* Create a page with the proper size buffers.. */
-	return grow_dev_page(bdev, block, index, size, sizebits);
+	return grow_dev_page(bdev, block, index, size, sizebits, gfp);
 }
 
-static struct buffer_head *
-__getblk_slow(struct block_device *bdev, sector_t block, int size)
+struct buffer_head *
+__getblk_slow(struct block_device *bdev, sector_t block,
+	     unsigned size, gfp_t gfp)
 {
 	/* Size must be multiple of hard sectorsize */
 	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
@@ -1113,13 +1094,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
 		if (bh)
 			return bh;
 
-		ret = grow_buffers(bdev, block, size);
+		ret = grow_buffers(bdev, block, size, gfp);
 		if (ret < 0)
 			return NULL;
 		if (ret == 0)
 			free_more_memory();
 	}
 }
+EXPORT_SYMBOL(__getblk_slow);
 
 /*
  * The relationship between dirty buffers and dirty pages:
@@ -1373,24 +1355,25 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
 EXPORT_SYMBOL(__find_get_block);
 
 /*
- * __getblk will locate (and, if necessary, create) the buffer_head
+ * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
  * which corresponds to the passed block_device, block and size. The
  * returned buffer has its reference count incremented.
  *
- * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
- * attempt is failing.  FIXME, perhaps?
+ * __getblk_gfp() will lock up the machine if grow_dev_page's
+ * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
  */
 struct buffer_head *
-__getblk(struct block_device *bdev, sector_t block, unsigned size)
+__getblk_gfp(struct block_device *bdev, sector_t block,
+	     unsigned size, gfp_t gfp)
 {
 	struct buffer_head *bh = __find_get_block(bdev, block, size);
 
 	might_sleep();
 	if (bh == NULL)
-		bh = __getblk_slow(bdev, block, size);
+		bh = __getblk_slow(bdev, block, size, gfp);
 	return bh;
 }
-EXPORT_SYMBOL(__getblk);
+EXPORT_SYMBOL(__getblk_gfp);
@@ -1406,24 +1389,28 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
 EXPORT_SYMBOL(__breadahead);
 
 /**
- *  __bread() - reads a specified block and returns the bh
+ *  __bread_gfp() - reads a specified block and returns the bh
  *  @bdev: the block_device to read from
  *  @block: number of block
  *  @size: size (in bytes) to read
+ *  @gfp: page allocation flag
  *
  *  Reads a specified block, and returns buffer head that contains it.
+ *  The page cache can be allocated from non-movable area
+ *  not to prevent page migration if you set gfp to zero.
  *  It returns NULL if the block was unreadable.
  */
 struct buffer_head *
-__bread(struct block_device *bdev, sector_t block, unsigned size)
+__bread_gfp(struct block_device *bdev, sector_t block,
+		   unsigned size, gfp_t gfp)
 {
-	struct buffer_head *bh = __getblk(bdev, block, size);
+	struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
 
 	if (likely(bh) && !buffer_uptodate(bh))
 		bh = __bread_slow(bh);
 	return bh;
 }
-EXPORT_SYMBOL(__bread);
+EXPORT_SYMBOL(__bread_gfp);
@@ -2082,6 +2069,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 			struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;
+	loff_t old_size = inode->i_size;
 	int i_size_changed = 0;
 
 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2101,6 +2089,8 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 	unlock_page(page);
 	page_cache_release(page);
 
+	if (old_size < pos)
+		pagecache_isize_extended(inode, old_size, pos);
 	/*
 	 * Don't mark the inode dirty under page lock. First, it unnecessarily
 	 * makes the holding time of page lock longer. Second, it forces lock
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 659f2ea9e6f7..cefca661464b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2638,7 +2638,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 
 	for (i = 0; i < CEPH_CAP_BITS; i++)
 		if ((dirty & (1 << i)) &&
-		    flush_tid == ci->i_cap_flush_tid[i])
+		    (u16)flush_tid == ci->i_cap_flush_tid[i])
 			cleaned |= 1 << i;
 
 	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
diff --git a/fs/dcache.c b/fs/dcache.c
index d5a23fd0da90..3ffef7f4e5cd 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2673,11 +2673,13 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 		if (!IS_ROOT(new)) {
 			spin_unlock(&inode->i_lock);
 			dput(new);
+			iput(inode);
 			return ERR_PTR(-EIO);
 		}
 		if (d_ancestor(new, dentry)) {
 			spin_unlock(&inode->i_lock);
 			dput(new);
+			iput(inode);
 			return ERR_PTR(-EIO);
 		}
 		write_seqlock(&rename_lock);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 1b119d3bf924..c4cd1fd86cc2 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -566,6 +566,13 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 	s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
 	s->s_blocksize = path.dentry->d_sb->s_blocksize;
 	s->s_magic = ECRYPTFS_SUPER_MAGIC;
+	s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1;
+
+	rc = -EINVAL;
+	if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
+		pr_err("eCryptfs: maximum fs stacking depth exceeded\n");
+		goto out_free;
+	}
 
 	inode = ecryptfs_get_inode(path.dentry->d_inode, s);
 	rc = PTR_ERR(inode);
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 389ba8312d5d..b47c7b8dc275 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -4,7 +4,7 @@
 # Copyright (C) 2008 Panasas Inc.  All rights reserved.
 #
 # Authors:
-#   Boaz Harrosh <bharrosh@panasas.com>
+#   Boaz Harrosh <ooo@electrozaur.com>
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index 3bbd46956d77..7d88ef566213 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -4,7 +4,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *	Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 49f51ab4caac..d7defd557601 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *	Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index fffe86fd7a42..ad9cac670a47 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *	Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 71bf8e4fb5d4..1a376b42d305 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *	Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 3f9cafd73931..f1d3d4eb8c4f 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *	Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 4731fd991efe..28907460e8fa 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *	Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index cfc0205d62c4..7bd8ac8dfb28 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * This file is part of exofs.
  *
@@ -29,7 +29,7 @@
 
 #include "ore_raid.h"
 
-MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
+MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>");
 MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
 MODULE_LICENSE("GPL");
 
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 84529b8a331b..27cbdb697649 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2011
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * This file is part of the objects raid engine (ore).
  *
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
index cf6375d82129..a6e746775570 100644
--- a/fs/exofs/ore_raid.h
+++ b/fs/exofs/ore_raid.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) from 2011
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * This file is part of the objects raid engine (ore).
  *
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index ed73ed8ebbee..95965503afcb 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *	Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
index 4dd687c3e747..832e2624b80b 100644
--- a/fs/exofs/symlink.c
+++ b/fs/exofs/symlink.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2005, 2006
  * Avishay Traeger (avishay@gmail.com)
  * Copyright (C) 2008, 2009
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * Copyrights for code taken from ext2:
  *	Copyright (C) 1992, 1993, 1994, 1995
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
index 1b4f2f95fc37..5e6a2c0a1f0b 100644
--- a/fs/exofs/sys.c
+++ b/fs/exofs/sys.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2012
  * Sachin Bhamare <sbhamare@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * This file is part of exofs.
  *
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 7015db0bafd1..eb742d0e67ff 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1354,13 +1354,6 @@ set_qf_format:
1354 "not specified."); 1354 "not specified.");
1355 return 0; 1355 return 0;
1356 } 1356 }
1357 } else {
1358 if (sbi->s_jquota_fmt) {
1359 ext3_msg(sb, KERN_ERR, "error: journaled quota format "
1360 "specified with no journaling "
1361 "enabled.");
1362 return 0;
1363 }
1364 } 1357 }
1365#endif 1358#endif
1366 return 1; 1359 return 1;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 581ef40fbe90..83a6f497c4e0 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -176,7 +176,7 @@ static unsigned int num_clusters_in_group(struct super_block *sb,
 }
 
 /* Initializes an uninitialized block bitmap */
-static void ext4_init_block_bitmap(struct super_block *sb,
+static int ext4_init_block_bitmap(struct super_block *sb,
 				   struct buffer_head *bh,
 				   ext4_group_t block_group,
 				   struct ext4_group_desc *gdp)
@@ -192,7 +192,6 @@ static void ext4_init_block_bitmap(struct super_block *sb,
 	/* If checksum is bad mark all blocks used to prevent allocation
 	 * essentially implementing a per-group read-only flag. */
 	if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
-		ext4_error(sb, "Checksum bad for group %u", block_group);
 		grp = ext4_get_group_info(sb, block_group);
 		if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
 			percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -205,7 +204,7 @@ static void ext4_init_block_bitmap(struct super_block *sb,
 					   count);
 		}
 		set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
-		return;
+		return -EIO;
 	}
 	memset(bh->b_data, 0, sb->s_blocksize);
 
@@ -243,6 +242,7 @@ static void ext4_init_block_bitmap(struct super_block *sb,
 			     sb->s_blocksize * 8, bh->b_data);
 	ext4_block_bitmap_csum_set(sb, block_group, gdp, bh);
 	ext4_group_desc_csum_set(sb, block_group, gdp);
+	return 0;
 }
 
 /* Return the number of free blocks in a block group.  It is used when
@@ -438,11 +438,15 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 	}
 	ext4_lock_group(sb, block_group);
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-		ext4_init_block_bitmap(sb, bh, block_group, desc);
+		int err;
+
+		err = ext4_init_block_bitmap(sb, bh, block_group, desc);
 		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
 		ext4_unlock_group(sb, block_group);
 		unlock_buffer(bh);
+		if (err)
+			ext4_error(sb, "Checksum bad for grp %u", block_group);
 		return bh;
 	}
 	ext4_unlock_group(sb, block_group);
@@ -636,8 +640,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	 * Account for the allocated meta blocks.  We will never
 	 * fail EDQUOT for metdata, but we do account for it.
 	 */
-	if (!(*errp) &&
-	    ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
+	if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 		dquot_alloc_block_nofail(inode,
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 3285aa5a706a..b610779a958c 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -24,8 +24,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
 	__u32 provided, calculated;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(sb))
 		return 1;
 
 	provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
@@ -46,8 +45,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
 	__u32 csum;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(sb))
 		return;
 
 	csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
@@ -65,8 +63,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(sb))
 		return 1;
 
 	provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
@@ -91,8 +88,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
 	__u32 csum;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
-			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+	if (!ext4_has_metadata_csum(sb))
 		return;
 
 	csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 0bb3f9ea0832..c24143ea9c08 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -151,13 +151,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 					   &file->f_ra, file,
 					   index, 1);
 			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
-			bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
+			bh = ext4_bread(NULL, inode, map.m_lblk, 0);
+			if (IS_ERR(bh))
+				return PTR_ERR(bh);
 		}
 
-		/*
-		 * We ignore I/O errors on directories so users have a chance
-		 * of recovering data when there's a bad sector
-		 */
 		if (!bh) {
 			if (!dir_has_error) {
 				EXT4_ERROR_FILE(file, 0,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b0c225cdb52c..c55a1faaed58 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -572,15 +572,15 @@ enum {
 
 /*
  * The bit position of these flags must not overlap with any of the
- * EXT4_GET_BLOCKS_*.  They are used by ext4_ext_find_extent(),
+ * EXT4_GET_BLOCKS_*.  They are used by ext4_find_extent(),
  * read_extent_tree_block(), ext4_split_extent_at(),
  * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
  * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
  * caching the extents when reading from the extent tree while a
  * truncate or punch hole operation is in progress.
  */
-#define EXT4_EX_NOCACHE				0x0400
-#define EXT4_EX_FORCE_CACHE			0x0800
+#define EXT4_EX_NOCACHE				0x40000000
+#define EXT4_EX_FORCE_CACHE			0x20000000
 
 /*
  * Flags used by ext4_free_blocks
@@ -890,6 +890,7 @@ struct ext4_inode_info {
 	struct ext4_es_tree i_es_tree;
 	rwlock_t i_es_lock;
 	struct list_head i_es_lru;
+	unsigned int i_es_all_nr;	/* protected by i_es_lock */
 	unsigned int i_es_lru_nr;	/* protected by i_es_lock */
 	unsigned long i_touch_when;	/* jiffies of last accessing */
 
@@ -1174,6 +1175,9 @@ struct ext4_super_block {
 #define EXT4_MF_MNTDIR_SAMPLED	0x0001
 #define EXT4_MF_FS_ABORTED	0x0002	/* Fatal error detected */
 
+/* Number of quota types we support */
+#define EXT4_MAXQUOTAS 2
+
 /*
  * fourth extended-fs super-block data in memory
  */
@@ -1237,7 +1241,7 @@ struct ext4_sb_info {
 	u32 s_min_batch_time;
 	struct block_device *journal_bdev;
 #ifdef CONFIG_QUOTA
-	char *s_qf_names[MAXQUOTAS];		/* Names of quota files with journalled quota */
+	char *s_qf_names[EXT4_MAXQUOTAS];	/* Names of quota files with journalled quota */
 	int s_jquota_fmt;			/* Format of quota to use */
 #endif
 	unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
@@ -1330,8 +1334,7 @@ struct ext4_sb_info {
 	/* Reclaim extents from extent status tree */
 	struct shrinker s_es_shrinker;
 	struct list_head s_es_lru;
-	unsigned long s_es_last_sorted;
-	struct percpu_counter s_extent_cache_cnt;
+	struct ext4_es_stats s_es_stats;
 	struct mb_cache *s_mb_cache;
 	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
 
@@ -1399,7 +1402,6 @@ enum {
 	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
 	EXT4_STATE_NEWENTRY,		/* File just added to dir */
-	EXT4_STATE_DELALLOC_RESERVED,	/* blks already reserved for delalloc */
 	EXT4_STATE_DIOREAD_LOCK,	/* Disable support for dio read
 					   nolocking */
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
@@ -2086,10 +2088,8 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 
 /* inode.c */
-struct buffer_head *ext4_getblk(handle_t *, struct inode *,
-						ext4_lblk_t, int, int *);
-struct buffer_head *ext4_bread(handle_t *, struct inode *,
-						ext4_lblk_t, int, int *);
+struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
+struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
 int ext4_get_block_write(struct inode *inode, sector_t iblock,
 			 struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
@@ -2109,6 +2109,7 @@ int do_journal_get_write_access(handle_t *handle,
 #define CONVERT_INLINE_DATA	 2
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
+extern struct inode *ext4_iget_normal(struct super_block *, unsigned long);
 extern int ext4_write_inode(struct inode *, struct writeback_control *);
 extern int ext4_setattr(struct dentry *, struct iattr *);
 extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
@@ -2332,10 +2333,18 @@ extern int ext4_register_li_request(struct super_block *sb,
 static inline int ext4_has_group_desc_csum(struct super_block *sb)
 {
 	return EXT4_HAS_RO_COMPAT_FEATURE(sb,
-					  EXT4_FEATURE_RO_COMPAT_GDT_CSUM |
-					  EXT4_FEATURE_RO_COMPAT_METADATA_CSUM);
+					  EXT4_FEATURE_RO_COMPAT_GDT_CSUM) ||
+	       (EXT4_SB(sb)->s_chksum_driver != NULL);
 }
 
+static inline int ext4_has_metadata_csum(struct super_block *sb)
+{
+	WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb,
+			EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+		     !EXT4_SB(sb)->s_chksum_driver);
+
+	return (EXT4_SB(sb)->s_chksum_driver != NULL);
+}
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
 	return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) |
@@ -2731,21 +2740,26 @@ extern int ext4_can_extents_be_merged(struct inode *inode,
 				       struct ext4_extent *ex1,
 				       struct ext4_extent *ex2);
 extern int ext4_ext_insert_extent(handle_t *, struct inode *,
-				  struct ext4_ext_path *,
+				  struct ext4_ext_path **,
 				  struct ext4_extent *, int);
-extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
-						  struct ext4_ext_path *,
-						  int flags);
+extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
+					      struct ext4_ext_path **,
+					      int flags);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
 extern int ext4_ext_check_inode(struct inode *inode);
 extern int ext4_find_delalloc_range(struct inode *inode,
 				    ext4_lblk_t lblk_start,
 				    ext4_lblk_t lblk_end);
 extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
+extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 extern int ext4_ext_precache(struct inode *inode);
 extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
+				struct inode *inode2, ext4_lblk_t lblk1,
+				ext4_lblk_t lblk2, ext4_lblk_t count,
+				int mark_unwritten,int *err);
 
 /* move_extent.c */
 extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2755,8 +2769,6 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
 			     __u64 len, __u64 *moved_len);
-extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
-			    struct ext4_extent **extent);
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index a867f5ca9991..3c9381547094 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -123,6 +123,7 @@ find_ext4_extent_tail(struct ext4_extent_header *eh)
123struct ext4_ext_path { 123struct ext4_ext_path {
124 ext4_fsblk_t p_block; 124 ext4_fsblk_t p_block;
125 __u16 p_depth; 125 __u16 p_depth;
126 __u16 p_maxdepth;
126 struct ext4_extent *p_ext; 127 struct ext4_extent *p_ext;
127 struct ext4_extent_idx *p_idx; 128 struct ext4_extent_idx *p_idx;
128 struct ext4_extent_header *p_hdr; 129 struct ext4_extent_header *p_hdr;
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 0074e0d23d6e..3445035c7e01 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -256,8 +256,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
256 set_buffer_prio(bh); 256 set_buffer_prio(bh);
257 if (ext4_handle_valid(handle)) { 257 if (ext4_handle_valid(handle)) {
258 err = jbd2_journal_dirty_metadata(handle, bh); 258 err = jbd2_journal_dirty_metadata(handle, bh);
259 /* Errors can only happen if there is a bug */ 259 /* Errors can only happen due to aborted journal or a nasty bug */
260 if (WARN_ON_ONCE(err)) { 260 if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) {
261 ext4_journal_abort_handle(where, line, __func__, bh, 261 ext4_journal_abort_handle(where, line, __func__, bh,
262 handle, err); 262 handle, err);
263 if (inode == NULL) { 263 if (inode == NULL) {
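
The reworked test only warns when the failure is not already explained by an aborted journal: && short-circuits, so WARN_ON_ONCE() is never evaluated for a handle that is known to be dead. A small user-space model of that guard (warn_once() is a stand-in for the kernel's WARN_ON_ONCE()):

    #include <stdio.h>

    static int warn_once(int cond)          /* stand-in for WARN_ON_ONCE() */
    {
            static int warned;

            if (cond && !warned) {
                    warned = 1;
                    fprintf(stderr, "WARNING: dirty_metadata failed unexpectedly\n");
            }
            return cond;
    }

    static void dirty_metadata(int handle_aborted, int err)
    {
            /* Warn only when the failure is NOT explained by an aborted
             * journal; when aborted, warn_once() is never reached. */
            if (!handle_aborted && warn_once(err))
                    fprintf(stderr, "aborting handle (err=%d)\n", err);
    }

    int main(void)
    {
            dirty_metadata(1, -30);         /* aborted journal: stays silent */
            dirty_metadata(0, -30);         /* genuine bug: warns once */
            return 0;
    }
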
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 17c00ff202f2..9c5b49fb281e 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -102,9 +102,9 @@
102#define EXT4_QUOTA_INIT_BLOCKS(sb) 0 102#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
103#define EXT4_QUOTA_DEL_BLOCKS(sb) 0 103#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
104#endif 104#endif
105#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) 105#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
106#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) 106#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
107#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) 107#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
108 108
109static inline int ext4_jbd2_credits_xattr(struct inode *inode) 109static inline int ext4_jbd2_credits_xattr(struct inode *inode)
110{ 110{
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74292a71b384..0b16fb4c06d3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -73,8 +73,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode,
73{ 73{
74 struct ext4_extent_tail *et; 74 struct ext4_extent_tail *et;
75 75
76 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 76 if (!ext4_has_metadata_csum(inode->i_sb))
77 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
78 return 1; 77 return 1;
79 78
80 et = find_ext4_extent_tail(eh); 79 et = find_ext4_extent_tail(eh);
@@ -88,8 +87,7 @@ static void ext4_extent_block_csum_set(struct inode *inode,
88{ 87{
89 struct ext4_extent_tail *et; 88 struct ext4_extent_tail *et;
90 89
91 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 90 if (!ext4_has_metadata_csum(inode->i_sb))
92 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
93 return; 91 return;
94 92
95 et = find_ext4_extent_tail(eh); 93 et = find_ext4_extent_tail(eh);
@@ -98,14 +96,14 @@ static void ext4_extent_block_csum_set(struct inode *inode,
98 96
99static int ext4_split_extent(handle_t *handle, 97static int ext4_split_extent(handle_t *handle,
100 struct inode *inode, 98 struct inode *inode,
101 struct ext4_ext_path *path, 99 struct ext4_ext_path **ppath,
102 struct ext4_map_blocks *map, 100 struct ext4_map_blocks *map,
103 int split_flag, 101 int split_flag,
104 int flags); 102 int flags);
105 103
106static int ext4_split_extent_at(handle_t *handle, 104static int ext4_split_extent_at(handle_t *handle,
107 struct inode *inode, 105 struct inode *inode,
108 struct ext4_ext_path *path, 106 struct ext4_ext_path **ppath,
109 ext4_lblk_t split, 107 ext4_lblk_t split,
110 int split_flag, 108 int split_flag,
111 int flags); 109 int flags);
@@ -291,6 +289,20 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
291 return size; 289 return size;
292} 290}
293 291
292static inline int
293ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
294 struct ext4_ext_path **ppath, ext4_lblk_t lblk,
295 int nofail)
296{
297 struct ext4_ext_path *path = *ppath;
298 int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
299
300 return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ?
301 EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
302 EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
303 (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0));
304}
305
294/* 306/*
295 * Calculate the number of metadata blocks needed 307 * Calculate the number of metadata blocks needed
296 * to allocate @blocks 308 * to allocate @blocks
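
ext4_force_split_extent_at() packages the split boilerplate that several call sites previously open-coded: it preserves the unwritten bit on both halves, bypasses the extent cache, and, when @nofail is set, lets the allocation dip into reserved metadata blocks so the split cannot fail with ENOSPC. Its typical use, taken from the truncate path rewritten later in this patch:

    /* Split so that 'end' is the last block of the first half; nofail
     * makes an ENOSPC fall back to the reserved metadata pool. */
    err = ext4_force_split_extent_at(handle, inode, &path, end + 1, 1);
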
@@ -695,9 +707,11 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
695 707
696void ext4_ext_drop_refs(struct ext4_ext_path *path) 708void ext4_ext_drop_refs(struct ext4_ext_path *path)
697{ 709{
698 int depth = path->p_depth; 710 int depth, i;
699 int i;
700 711
712 if (!path)
713 return;
714 depth = path->p_depth;
701 for (i = 0; i <= depth; i++, path++) 715 for (i = 0; i <= depth; i++, path++)
702 if (path->p_bh) { 716 if (path->p_bh) {
703 brelse(path->p_bh); 717 brelse(path->p_bh);
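
Making ext4_ext_drop_refs() tolerate a NULL path mirrors kfree() semantics, so every cleanup site can collapse the "if (path) { drop; free; }" dance into two unconditional calls, as the later hunks in this file do:

    cleanup:
            ext4_ext_drop_refs(path);       /* no-op on NULL, like kfree() */
            kfree(path);
            return err;
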
@@ -841,24 +855,32 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
841} 855}
842 856
843struct ext4_ext_path * 857struct ext4_ext_path *
844ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, 858ext4_find_extent(struct inode *inode, ext4_lblk_t block,
845 struct ext4_ext_path *path, int flags) 859 struct ext4_ext_path **orig_path, int flags)
846{ 860{
847 struct ext4_extent_header *eh; 861 struct ext4_extent_header *eh;
848 struct buffer_head *bh; 862 struct buffer_head *bh;
849 short int depth, i, ppos = 0, alloc = 0; 863 struct ext4_ext_path *path = orig_path ? *orig_path : NULL;
864 short int depth, i, ppos = 0;
850 int ret; 865 int ret;
851 866
852 eh = ext_inode_hdr(inode); 867 eh = ext_inode_hdr(inode);
853 depth = ext_depth(inode); 868 depth = ext_depth(inode);
854 869
855 /* account possible depth increase */ 870 if (path) {
871 ext4_ext_drop_refs(path);
872 if (depth > path[0].p_maxdepth) {
873 kfree(path);
874 *orig_path = path = NULL;
875 }
876 }
856 if (!path) { 877 if (!path) {
878 /* account possible depth increase */
857 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), 879 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2),
858 GFP_NOFS); 880 GFP_NOFS);
859 if (!path) 881 if (unlikely(!path))
860 return ERR_PTR(-ENOMEM); 882 return ERR_PTR(-ENOMEM);
861 alloc = 1; 883 path[0].p_maxdepth = depth + 1;
862 } 884 }
863 path[0].p_hdr = eh; 885 path[0].p_hdr = eh;
864 path[0].p_bh = NULL; 886 path[0].p_bh = NULL;
@@ -876,7 +898,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
876 898
877 bh = read_extent_tree_block(inode, path[ppos].p_block, --i, 899 bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
878 flags); 900 flags);
879 if (IS_ERR(bh)) { 901 if (unlikely(IS_ERR(bh))) {
880 ret = PTR_ERR(bh); 902 ret = PTR_ERR(bh);
881 goto err; 903 goto err;
882 } 904 }
@@ -910,8 +932,9 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
910 932
911err: 933err:
912 ext4_ext_drop_refs(path); 934 ext4_ext_drop_refs(path);
913 if (alloc) 935 kfree(path);
914 kfree(path); 936 if (orig_path)
937 *orig_path = NULL;
915 return ERR_PTR(ret); 938 return ERR_PTR(ret);
916} 939}
917 940
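
Note the error convention: on failure ext4_find_extent() drops the buffer refs, frees the old array and NULLs *orig_path before returning an ERR_PTR, so the caller's pointer never dangles. That is what makes reuse loops like the one in ext4_ext_shift_extents() below safe; a condensed sketch:

    struct ext4_ext_path *path = NULL;

    while (start < stop_block) {
            /* Reuses (or transparently reallocates) the same array. */
            path = ext4_find_extent(inode, start, &path, 0);
            if (IS_ERR(path))
                    return PTR_ERR(path);   /* old array already freed */
            /* ... shift the extents in this leaf ... */
    }
    ext4_ext_drop_refs(path);
    kfree(path);
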
@@ -1238,16 +1261,24 @@ cleanup:
1238 * just created block 1261 * just created block
1239 */ 1262 */
1240static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, 1263static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1241 unsigned int flags, 1264 unsigned int flags)
1242 struct ext4_extent *newext)
1243{ 1265{
1244 struct ext4_extent_header *neh; 1266 struct ext4_extent_header *neh;
1245 struct buffer_head *bh; 1267 struct buffer_head *bh;
1246 ext4_fsblk_t newblock; 1268 ext4_fsblk_t newblock, goal = 0;
1269 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
1247 int err = 0; 1270 int err = 0;
1248 1271
1249 newblock = ext4_ext_new_meta_block(handle, inode, NULL, 1272 /* Try to prepend new index to old one */
1250 newext, &err, flags); 1273 if (ext_depth(inode))
1274 goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
1275 if (goal > le32_to_cpu(es->s_first_data_block)) {
1276 flags |= EXT4_MB_HINT_TRY_GOAL;
1277 goal--;
1278 } else
1279 goal = ext4_inode_to_goal_block(inode);
1280 newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
1281 NULL, &err);
1251 if (newblock == 0) 1282 if (newblock == 0)
1252 return err; 1283 return err;
1253 1284
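
With the newext hint gone, ext4_ext_grow_indepth() now computes its own allocation goal: the block immediately before the tree's first index block, so the new top-level node stays physically adjacent to the tree it absorbs. Restated with comments (condensed from the hunk above):

    ext4_fsblk_t goal = 0;

    /* Aim just in front of the current first index block. */
    if (ext_depth(inode))
            goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode)));
    if (goal > le32_to_cpu(es->s_first_data_block)) {
            flags |= EXT4_MB_HINT_TRY_GOAL;
            goal--;                         /* "prepend" to the old index */
    } else
            goal = ext4_inode_to_goal_block(inode); /* depth-0 fallback */
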
@@ -1314,9 +1345,10 @@ out:
1314static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, 1345static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
1315 unsigned int mb_flags, 1346 unsigned int mb_flags,
1316 unsigned int gb_flags, 1347 unsigned int gb_flags,
1317 struct ext4_ext_path *path, 1348 struct ext4_ext_path **ppath,
1318 struct ext4_extent *newext) 1349 struct ext4_extent *newext)
1319{ 1350{
1351 struct ext4_ext_path *path = *ppath;
1320 struct ext4_ext_path *curp; 1352 struct ext4_ext_path *curp;
1321 int depth, i, err = 0; 1353 int depth, i, err = 0;
1322 1354
@@ -1340,23 +1372,21 @@ repeat:
1340 goto out; 1372 goto out;
1341 1373
1342 /* refill path */ 1374 /* refill path */
1343 ext4_ext_drop_refs(path); 1375 path = ext4_find_extent(inode,
1344 path = ext4_ext_find_extent(inode,
1345 (ext4_lblk_t)le32_to_cpu(newext->ee_block), 1376 (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1346 path, gb_flags); 1377 ppath, gb_flags);
1347 if (IS_ERR(path)) 1378 if (IS_ERR(path))
1348 err = PTR_ERR(path); 1379 err = PTR_ERR(path);
1349 } else { 1380 } else {
1350 /* tree is full, time to grow in depth */ 1381 /* tree is full, time to grow in depth */
1351 err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext); 1382 err = ext4_ext_grow_indepth(handle, inode, mb_flags);
1352 if (err) 1383 if (err)
1353 goto out; 1384 goto out;
1354 1385
1355 /* refill path */ 1386 /* refill path */
1356 ext4_ext_drop_refs(path); 1387 path = ext4_find_extent(inode,
1357 path = ext4_ext_find_extent(inode,
1358 (ext4_lblk_t)le32_to_cpu(newext->ee_block), 1388 (ext4_lblk_t)le32_to_cpu(newext->ee_block),
1359 path, gb_flags); 1389 ppath, gb_flags);
1360 if (IS_ERR(path)) { 1390 if (IS_ERR(path)) {
1361 err = PTR_ERR(path); 1391 err = PTR_ERR(path);
1362 goto out; 1392 goto out;
@@ -1559,7 +1589,7 @@ found_extent:
1559 * allocated block. Thus, index entries have to be consistent 1589 * allocated block. Thus, index entries have to be consistent
1560 * with leaves. 1590 * with leaves.
1561 */ 1591 */
1562static ext4_lblk_t 1592ext4_lblk_t
1563ext4_ext_next_allocated_block(struct ext4_ext_path *path) 1593ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1564{ 1594{
1565 int depth; 1595 int depth;
@@ -1802,6 +1832,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
1802 sizeof(struct ext4_extent_idx); 1832 sizeof(struct ext4_extent_idx);
1803 s += sizeof(struct ext4_extent_header); 1833 s += sizeof(struct ext4_extent_header);
1804 1834
1835 path[1].p_maxdepth = path[0].p_maxdepth;
1805 memcpy(path[0].p_hdr, path[1].p_hdr, s); 1836 memcpy(path[0].p_hdr, path[1].p_hdr, s);
1806 path[0].p_depth = 0; 1837 path[0].p_depth = 0;
1807 path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + 1838 path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) +
@@ -1896,9 +1927,10 @@ out:
1896 * creating new leaf in the no-space case. 1927 * creating new leaf in the no-space case.
1897 */ 1928 */
1898int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, 1929int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1899 struct ext4_ext_path *path, 1930 struct ext4_ext_path **ppath,
1900 struct ext4_extent *newext, int gb_flags) 1931 struct ext4_extent *newext, int gb_flags)
1901{ 1932{
1933 struct ext4_ext_path *path = *ppath;
1902 struct ext4_extent_header *eh; 1934 struct ext4_extent_header *eh;
1903 struct ext4_extent *ex, *fex; 1935 struct ext4_extent *ex, *fex;
1904 struct ext4_extent *nearex; /* nearest extent */ 1936 struct ext4_extent *nearex; /* nearest extent */
@@ -1907,6 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1907 ext4_lblk_t next; 1939 ext4_lblk_t next;
1908 int mb_flags = 0, unwritten; 1940 int mb_flags = 0, unwritten;
1909 1941
1942 if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
1943 mb_flags |= EXT4_MB_DELALLOC_RESERVED;
1910 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { 1944 if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
1911 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); 1945 EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
1912 return -EIO; 1946 return -EIO;
@@ -1925,7 +1959,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1925 /* 1959 /*
1926 * Try to see whether we should rather test the extent on 1960 * Try to see whether we should rather test the extent on
1927 * right from ex, or from the left of ex. This is because 1961 * right from ex, or from the left of ex. This is because
1928 * ext4_ext_find_extent() can return either extent on the 1962 * ext4_find_extent() can return either extent on the
1929 * left, or on the right from the searched position. This 1963 * left, or on the right from the searched position. This
1930 * will make merging more effective. 1964 * will make merging more effective.
1931 */ 1965 */
@@ -2008,7 +2042,7 @@ prepend:
2008 if (next != EXT_MAX_BLOCKS) { 2042 if (next != EXT_MAX_BLOCKS) {
2009 ext_debug("next leaf block - %u\n", next); 2043 ext_debug("next leaf block - %u\n", next);
2010 BUG_ON(npath != NULL); 2044 BUG_ON(npath != NULL);
2011 npath = ext4_ext_find_extent(inode, next, NULL, 0); 2045 npath = ext4_find_extent(inode, next, NULL, 0);
2012 if (IS_ERR(npath)) 2046 if (IS_ERR(npath))
2013 return PTR_ERR(npath); 2047 return PTR_ERR(npath);
2014 BUG_ON(npath->p_depth != path->p_depth); 2048 BUG_ON(npath->p_depth != path->p_depth);
@@ -2028,9 +2062,9 @@ prepend:
2028 * We're gonna add a new leaf in the tree. 2062 * We're gonna add a new leaf in the tree.
2029 */ 2063 */
2030 if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) 2064 if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
2031 mb_flags = EXT4_MB_USE_RESERVED; 2065 mb_flags |= EXT4_MB_USE_RESERVED;
2032 err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, 2066 err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags,
2033 path, newext); 2067 ppath, newext);
2034 if (err) 2068 if (err)
2035 goto cleanup; 2069 goto cleanup;
2036 depth = ext_depth(inode); 2070 depth = ext_depth(inode);
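
Two small flag fixes ride along here: EXT4_GET_BLOCKS_DELALLOC_RESERVE is now translated into EXT4_MB_DELALLOC_RESERVED for the allocator, and the METADATA_NOFAIL case ORs into mb_flags instead of overwriting it, so the two hints can coexist. A runnable toy model of that mapping (the constants are illustrative stand-ins, not the kernel's actual values):

    #include <stdio.h>

    #define GB_METADATA_NOFAIL      0x1     /* illustrative values */
    #define GB_DELALLOC_RESERVE     0x2
    #define MB_USE_RESERVED         0x1
    #define MB_DELALLOC_RESERVED    0x2

    static int mb_flags_from_gb(int gb_flags)
    {
            int mb_flags = 0;

            if (gb_flags & GB_DELALLOC_RESERVE)
                    mb_flags |= MB_DELALLOC_RESERVED;
            if (gb_flags & GB_METADATA_NOFAIL)
                    mb_flags |= MB_USE_RESERVED;    /* |=, not =, so both survive */
            return mb_flags;
    }

    int main(void)
    {
            /* prints 0x3: both hints propagated */
            printf("%#x\n", mb_flags_from_gb(GB_DELALLOC_RESERVE |
                                             GB_METADATA_NOFAIL));
            return 0;
    }
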
@@ -2108,10 +2142,8 @@ merge:
2108 err = ext4_ext_dirty(handle, inode, path + path->p_depth); 2142 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
2109 2143
2110cleanup: 2144cleanup:
2111 if (npath) { 2145 ext4_ext_drop_refs(npath);
2112 ext4_ext_drop_refs(npath); 2146 kfree(npath);
2113 kfree(npath);
2114 }
2115 return err; 2147 return err;
2116} 2148}
2117 2149
@@ -2133,13 +2165,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
2133 /* find extent for this block */ 2165 /* find extent for this block */
2134 down_read(&EXT4_I(inode)->i_data_sem); 2166 down_read(&EXT4_I(inode)->i_data_sem);
2135 2167
2136 if (path && ext_depth(inode) != depth) { 2168 path = ext4_find_extent(inode, block, &path, 0);
2137 /* depth was changed. we have to realloc path */
2138 kfree(path);
2139 path = NULL;
2140 }
2141
2142 path = ext4_ext_find_extent(inode, block, path, 0);
2143 if (IS_ERR(path)) { 2169 if (IS_ERR(path)) {
2144 up_read(&EXT4_I(inode)->i_data_sem); 2170 up_read(&EXT4_I(inode)->i_data_sem);
2145 err = PTR_ERR(path); 2171 err = PTR_ERR(path);
@@ -2156,7 +2182,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
2156 } 2182 }
2157 ex = path[depth].p_ext; 2183 ex = path[depth].p_ext;
2158 next = ext4_ext_next_allocated_block(path); 2184 next = ext4_ext_next_allocated_block(path);
2159 ext4_ext_drop_refs(path);
2160 2185
2161 flags = 0; 2186 flags = 0;
2162 exists = 0; 2187 exists = 0;
@@ -2266,11 +2291,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
2266 block = es.es_lblk + es.es_len; 2291 block = es.es_lblk + es.es_len;
2267 } 2292 }
2268 2293
2269 if (path) { 2294 ext4_ext_drop_refs(path);
2270 ext4_ext_drop_refs(path); 2295 kfree(path);
2271 kfree(path);
2272 }
2273
2274 return err; 2296 return err;
2275} 2297}
2276 2298
@@ -2826,7 +2848,7 @@ again:
2826 ext4_lblk_t ee_block; 2848 ext4_lblk_t ee_block;
2827 2849
2828 /* find extent for this block */ 2850 /* find extent for this block */
2829 path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); 2851 path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE);
2830 if (IS_ERR(path)) { 2852 if (IS_ERR(path)) {
2831 ext4_journal_stop(handle); 2853 ext4_journal_stop(handle);
2832 return PTR_ERR(path); 2854 return PTR_ERR(path);
@@ -2854,24 +2876,14 @@ again:
2854 */ 2876 */
2855 if (end >= ee_block && 2877 if (end >= ee_block &&
2856 end < ee_block + ext4_ext_get_actual_len(ex) - 1) { 2878 end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
2857 int split_flag = 0;
2858
2859 if (ext4_ext_is_unwritten(ex))
2860 split_flag = EXT4_EXT_MARK_UNWRIT1 |
2861 EXT4_EXT_MARK_UNWRIT2;
2862
2863 /* 2879 /*
2864 * Split the extent in two so that 'end' is the last 2880 * Split the extent in two so that 'end' is the last
2865 * block in the first new extent. Also we should not 2881 * block in the first new extent. Also we should not
2866 * fail removing space due to ENOSPC so try to use 2882 * fail removing space due to ENOSPC so try to use
2867 * reserved block if that happens. 2883 * reserved block if that happens.
2868 */ 2884 */
2869 err = ext4_split_extent_at(handle, inode, path, 2885 err = ext4_force_split_extent_at(handle, inode, &path,
2870 end + 1, split_flag, 2886 end + 1, 1);
2871 EXT4_EX_NOCACHE |
2872 EXT4_GET_BLOCKS_PRE_IO |
2873 EXT4_GET_BLOCKS_METADATA_NOFAIL);
2874
2875 if (err < 0) 2887 if (err < 0)
2876 goto out; 2888 goto out;
2877 } 2889 }
@@ -2893,7 +2905,7 @@ again:
2893 ext4_journal_stop(handle); 2905 ext4_journal_stop(handle);
2894 return -ENOMEM; 2906 return -ENOMEM;
2895 } 2907 }
2896 path[0].p_depth = depth; 2908 path[0].p_maxdepth = path[0].p_depth = depth;
2897 path[0].p_hdr = ext_inode_hdr(inode); 2909 path[0].p_hdr = ext_inode_hdr(inode);
2898 i = 0; 2910 i = 0;
2899 2911
@@ -3013,10 +3025,9 @@ again:
3013out: 3025out:
3014 ext4_ext_drop_refs(path); 3026 ext4_ext_drop_refs(path);
3015 kfree(path); 3027 kfree(path);
3016 if (err == -EAGAIN) { 3028 path = NULL;
3017 path = NULL; 3029 if (err == -EAGAIN)
3018 goto again; 3030 goto again;
3019 }
3020 ext4_journal_stop(handle); 3031 ext4_journal_stop(handle);
3021 3032
3022 return err; 3033 return err;
@@ -3130,11 +3141,12 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3130 */ 3141 */
3131static int ext4_split_extent_at(handle_t *handle, 3142static int ext4_split_extent_at(handle_t *handle,
3132 struct inode *inode, 3143 struct inode *inode,
3133 struct ext4_ext_path *path, 3144 struct ext4_ext_path **ppath,
3134 ext4_lblk_t split, 3145 ext4_lblk_t split,
3135 int split_flag, 3146 int split_flag,
3136 int flags) 3147 int flags)
3137{ 3148{
3149 struct ext4_ext_path *path = *ppath;
3138 ext4_fsblk_t newblock; 3150 ext4_fsblk_t newblock;
3139 ext4_lblk_t ee_block; 3151 ext4_lblk_t ee_block;
3140 struct ext4_extent *ex, newex, orig_ex, zero_ex; 3152 struct ext4_extent *ex, newex, orig_ex, zero_ex;
@@ -3205,7 +3217,7 @@ static int ext4_split_extent_at(handle_t *handle,
3205 if (split_flag & EXT4_EXT_MARK_UNWRIT2) 3217 if (split_flag & EXT4_EXT_MARK_UNWRIT2)
3206 ext4_ext_mark_unwritten(ex2); 3218 ext4_ext_mark_unwritten(ex2);
3207 3219
3208 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3220 err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags);
3209 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { 3221 if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
3210 if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { 3222 if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
3211 if (split_flag & EXT4_EXT_DATA_VALID1) { 3223 if (split_flag & EXT4_EXT_DATA_VALID1) {
@@ -3271,11 +3283,12 @@ fix_extent_len:
3271 */ 3283 */
3272static int ext4_split_extent(handle_t *handle, 3284static int ext4_split_extent(handle_t *handle,
3273 struct inode *inode, 3285 struct inode *inode,
3274 struct ext4_ext_path *path, 3286 struct ext4_ext_path **ppath,
3275 struct ext4_map_blocks *map, 3287 struct ext4_map_blocks *map,
3276 int split_flag, 3288 int split_flag,
3277 int flags) 3289 int flags)
3278{ 3290{
3291 struct ext4_ext_path *path = *ppath;
3279 ext4_lblk_t ee_block; 3292 ext4_lblk_t ee_block;
3280 struct ext4_extent *ex; 3293 struct ext4_extent *ex;
3281 unsigned int ee_len, depth; 3294 unsigned int ee_len, depth;
@@ -3298,7 +3311,7 @@ static int ext4_split_extent(handle_t *handle,
3298 EXT4_EXT_MARK_UNWRIT2; 3311 EXT4_EXT_MARK_UNWRIT2;
3299 if (split_flag & EXT4_EXT_DATA_VALID2) 3312 if (split_flag & EXT4_EXT_DATA_VALID2)
3300 split_flag1 |= EXT4_EXT_DATA_VALID1; 3313 split_flag1 |= EXT4_EXT_DATA_VALID1;
3301 err = ext4_split_extent_at(handle, inode, path, 3314 err = ext4_split_extent_at(handle, inode, ppath,
3302 map->m_lblk + map->m_len, split_flag1, flags1); 3315 map->m_lblk + map->m_len, split_flag1, flags1);
3303 if (err) 3316 if (err)
3304 goto out; 3317 goto out;
@@ -3309,8 +3322,7 @@ static int ext4_split_extent(handle_t *handle,
3309 * Update path is required because previous ext4_split_extent_at() may 3322 * Update path is required because previous ext4_split_extent_at() may
3310 * result in split of original leaf or extent zeroout. 3323 * result in split of original leaf or extent zeroout.
3311 */ 3324 */
3312 ext4_ext_drop_refs(path); 3325 path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3313 path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
3314 if (IS_ERR(path)) 3326 if (IS_ERR(path))
3315 return PTR_ERR(path); 3327 return PTR_ERR(path);
3316 depth = ext_depth(inode); 3328 depth = ext_depth(inode);
@@ -3330,7 +3342,7 @@ static int ext4_split_extent(handle_t *handle,
3330 split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | 3342 split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
3331 EXT4_EXT_MARK_UNWRIT2); 3343 EXT4_EXT_MARK_UNWRIT2);
3332 } 3344 }
3333 err = ext4_split_extent_at(handle, inode, path, 3345 err = ext4_split_extent_at(handle, inode, ppath,
3334 map->m_lblk, split_flag1, flags); 3346 map->m_lblk, split_flag1, flags);
3335 if (err) 3347 if (err)
3336 goto out; 3348 goto out;
@@ -3364,9 +3376,10 @@ out:
3364static int ext4_ext_convert_to_initialized(handle_t *handle, 3376static int ext4_ext_convert_to_initialized(handle_t *handle,
3365 struct inode *inode, 3377 struct inode *inode,
3366 struct ext4_map_blocks *map, 3378 struct ext4_map_blocks *map,
3367 struct ext4_ext_path *path, 3379 struct ext4_ext_path **ppath,
3368 int flags) 3380 int flags)
3369{ 3381{
3382 struct ext4_ext_path *path = *ppath;
3370 struct ext4_sb_info *sbi; 3383 struct ext4_sb_info *sbi;
3371 struct ext4_extent_header *eh; 3384 struct ext4_extent_header *eh;
3372 struct ext4_map_blocks split_map; 3385 struct ext4_map_blocks split_map;
@@ -3590,11 +3603,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
3590 } 3603 }
3591 } 3604 }
3592 3605
3593 allocated = ext4_split_extent(handle, inode, path, 3606 err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
3594 &split_map, split_flag, flags); 3607 flags);
3595 if (allocated < 0) 3608 if (err > 0)
3596 err = allocated; 3609 err = 0;
3597
3598out: 3610out:
3599 /* If we have gotten a failure, don't zero out status tree */ 3611 /* If we have gotten a failure, don't zero out status tree */
3600 if (!err) 3612 if (!err)
@@ -3629,9 +3641,10 @@ out:
3629static int ext4_split_convert_extents(handle_t *handle, 3641static int ext4_split_convert_extents(handle_t *handle,
3630 struct inode *inode, 3642 struct inode *inode,
3631 struct ext4_map_blocks *map, 3643 struct ext4_map_blocks *map,
3632 struct ext4_ext_path *path, 3644 struct ext4_ext_path **ppath,
3633 int flags) 3645 int flags)
3634{ 3646{
3647 struct ext4_ext_path *path = *ppath;
3635 ext4_lblk_t eof_block; 3648 ext4_lblk_t eof_block;
3636 ext4_lblk_t ee_block; 3649 ext4_lblk_t ee_block;
3637 struct ext4_extent *ex; 3650 struct ext4_extent *ex;
@@ -3665,74 +3678,15 @@ static int ext4_split_convert_extents(handle_t *handle,
3665 split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); 3678 split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
3666 } 3679 }
3667 flags |= EXT4_GET_BLOCKS_PRE_IO; 3680 flags |= EXT4_GET_BLOCKS_PRE_IO;
3668 return ext4_split_extent(handle, inode, path, map, split_flag, flags); 3681 return ext4_split_extent(handle, inode, ppath, map, split_flag, flags);
3669} 3682}
3670 3683
3671static int ext4_convert_initialized_extents(handle_t *handle,
3672 struct inode *inode,
3673 struct ext4_map_blocks *map,
3674 struct ext4_ext_path *path)
3675{
3676 struct ext4_extent *ex;
3677 ext4_lblk_t ee_block;
3678 unsigned int ee_len;
3679 int depth;
3680 int err = 0;
3681
3682 depth = ext_depth(inode);
3683 ex = path[depth].p_ext;
3684 ee_block = le32_to_cpu(ex->ee_block);
3685 ee_len = ext4_ext_get_actual_len(ex);
3686
3687 ext_debug("%s: inode %lu, logical"
3688 "block %llu, max_blocks %u\n", __func__, inode->i_ino,
3689 (unsigned long long)ee_block, ee_len);
3690
3691 if (ee_block != map->m_lblk || ee_len > map->m_len) {
3692 err = ext4_split_convert_extents(handle, inode, map, path,
3693 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3694 if (err < 0)
3695 goto out;
3696 ext4_ext_drop_refs(path);
3697 path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
3698 if (IS_ERR(path)) {
3699 err = PTR_ERR(path);
3700 goto out;
3701 }
3702 depth = ext_depth(inode);
3703 ex = path[depth].p_ext;
3704 if (!ex) {
3705 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3706 (unsigned long) map->m_lblk);
3707 err = -EIO;
3708 goto out;
3709 }
3710 }
3711
3712 err = ext4_ext_get_access(handle, inode, path + depth);
3713 if (err)
3714 goto out;
3715 /* first mark the extent as unwritten */
3716 ext4_ext_mark_unwritten(ex);
3717
3718 /* note: ext4_ext_correct_indexes() isn't needed here because
3719 * borders are not changed
3720 */
3721 ext4_ext_try_to_merge(handle, inode, path, ex);
3722
3723 /* Mark modified extent as dirty */
3724 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3725out:
3726 ext4_ext_show_leaf(inode, path);
3727 return err;
3728}
3729
3730
3731static int ext4_convert_unwritten_extents_endio(handle_t *handle, 3684static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3732 struct inode *inode, 3685 struct inode *inode,
3733 struct ext4_map_blocks *map, 3686 struct ext4_map_blocks *map,
3734 struct ext4_ext_path *path) 3687 struct ext4_ext_path **ppath)
3735{ 3688{
3689 struct ext4_ext_path *path = *ppath;
3736 struct ext4_extent *ex; 3690 struct ext4_extent *ex;
3737 ext4_lblk_t ee_block; 3691 ext4_lblk_t ee_block;
3738 unsigned int ee_len; 3692 unsigned int ee_len;
@@ -3761,16 +3715,13 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3761 inode->i_ino, (unsigned long long)ee_block, ee_len, 3715 inode->i_ino, (unsigned long long)ee_block, ee_len,
3762 (unsigned long long)map->m_lblk, map->m_len); 3716 (unsigned long long)map->m_lblk, map->m_len);
3763#endif 3717#endif
3764 err = ext4_split_convert_extents(handle, inode, map, path, 3718 err = ext4_split_convert_extents(handle, inode, map, ppath,
3765 EXT4_GET_BLOCKS_CONVERT); 3719 EXT4_GET_BLOCKS_CONVERT);
3766 if (err < 0) 3720 if (err < 0)
3767 goto out; 3721 return err;
3768 ext4_ext_drop_refs(path); 3722 path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3769 path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); 3723 if (IS_ERR(path))
3770 if (IS_ERR(path)) { 3724 return PTR_ERR(path);
3771 err = PTR_ERR(path);
3772 goto out;
3773 }
3774 depth = ext_depth(inode); 3725 depth = ext_depth(inode);
3775 ex = path[depth].p_ext; 3726 ex = path[depth].p_ext;
3776 } 3727 }
@@ -3963,12 +3914,16 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3963} 3914}
3964 3915
3965static int 3916static int
3966ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, 3917convert_initialized_extent(handle_t *handle, struct inode *inode,
3967 struct ext4_map_blocks *map, 3918 struct ext4_map_blocks *map,
3968 struct ext4_ext_path *path, int flags, 3919 struct ext4_ext_path **ppath, int flags,
3969 unsigned int allocated, ext4_fsblk_t newblock) 3920 unsigned int allocated, ext4_fsblk_t newblock)
3970{ 3921{
3971 int ret = 0; 3922 struct ext4_ext_path *path = *ppath;
3923 struct ext4_extent *ex;
3924 ext4_lblk_t ee_block;
3925 unsigned int ee_len;
3926 int depth;
3972 int err = 0; 3927 int err = 0;
3973 3928
3974 /* 3929 /*
@@ -3978,28 +3933,67 @@ ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
3978 if (map->m_len > EXT_UNWRITTEN_MAX_LEN) 3933 if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
3979 map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; 3934 map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
3980 3935
3981 ret = ext4_convert_initialized_extents(handle, inode, map, 3936 depth = ext_depth(inode);
3982 path); 3937 ex = path[depth].p_ext;
3983 if (ret >= 0) { 3938 ee_block = le32_to_cpu(ex->ee_block);
3984 ext4_update_inode_fsync_trans(handle, inode, 1); 3939 ee_len = ext4_ext_get_actual_len(ex);
3985 err = check_eofblocks_fl(handle, inode, map->m_lblk, 3940
3986 path, map->m_len); 3941 ext_debug("%s: inode %lu, logical"
3987 } else 3942 "block %llu, max_blocks %u\n", __func__, inode->i_ino,
3988 err = ret; 3943 (unsigned long long)ee_block, ee_len);
3944
3945 if (ee_block != map->m_lblk || ee_len > map->m_len) {
3946 err = ext4_split_convert_extents(handle, inode, map, ppath,
3947 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
3948 if (err < 0)
3949 return err;
3950 path = ext4_find_extent(inode, map->m_lblk, ppath, 0);
3951 if (IS_ERR(path))
3952 return PTR_ERR(path);
3953 depth = ext_depth(inode);
3954 ex = path[depth].p_ext;
3955 if (!ex) {
3956 EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
3957 (unsigned long) map->m_lblk);
3958 return -EIO;
3959 }
3960 }
3961
3962 err = ext4_ext_get_access(handle, inode, path + depth);
3963 if (err)
3964 return err;
3965 /* first mark the extent as unwritten */
3966 ext4_ext_mark_unwritten(ex);
3967
3968 /* note: ext4_ext_correct_indexes() isn't needed here because
3969 * borders are not changed
3970 */
3971 ext4_ext_try_to_merge(handle, inode, path, ex);
3972
3973 /* Mark modified extent as dirty */
3974 err = ext4_ext_dirty(handle, inode, path + path->p_depth);
3975 if (err)
3976 return err;
3977 ext4_ext_show_leaf(inode, path);
3978
3979 ext4_update_inode_fsync_trans(handle, inode, 1);
3980 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len);
3981 if (err)
3982 return err;
3989 map->m_flags |= EXT4_MAP_UNWRITTEN; 3983 map->m_flags |= EXT4_MAP_UNWRITTEN;
3990 if (allocated > map->m_len) 3984 if (allocated > map->m_len)
3991 allocated = map->m_len; 3985 allocated = map->m_len;
3992 map->m_len = allocated; 3986 map->m_len = allocated;
3993 3987 return allocated;
3994 return err ? err : allocated;
3995} 3988}
3996 3989
3997static int 3990static int
3998ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, 3991ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
3999 struct ext4_map_blocks *map, 3992 struct ext4_map_blocks *map,
4000 struct ext4_ext_path *path, int flags, 3993 struct ext4_ext_path **ppath, int flags,
4001 unsigned int allocated, ext4_fsblk_t newblock) 3994 unsigned int allocated, ext4_fsblk_t newblock)
4002{ 3995{
3996 struct ext4_ext_path *path = *ppath;
4003 int ret = 0; 3997 int ret = 0;
4004 int err = 0; 3998 int err = 0;
4005 ext4_io_end_t *io = ext4_inode_aio(inode); 3999 ext4_io_end_t *io = ext4_inode_aio(inode);
@@ -4021,8 +4015,8 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4021 4015
4022 /* get_block() before submit the IO, split the extent */ 4016 /* get_block() before submit the IO, split the extent */
4023 if (flags & EXT4_GET_BLOCKS_PRE_IO) { 4017 if (flags & EXT4_GET_BLOCKS_PRE_IO) {
4024 ret = ext4_split_convert_extents(handle, inode, map, 4018 ret = ext4_split_convert_extents(handle, inode, map, ppath,
4025 path, flags | EXT4_GET_BLOCKS_CONVERT); 4019 flags | EXT4_GET_BLOCKS_CONVERT);
4026 if (ret <= 0) 4020 if (ret <= 0)
4027 goto out; 4021 goto out;
4028 /* 4022 /*
@@ -4040,7 +4034,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4040 /* IO end_io complete, convert the filled extent to written */ 4034 /* IO end_io complete, convert the filled extent to written */
4041 if (flags & EXT4_GET_BLOCKS_CONVERT) { 4035 if (flags & EXT4_GET_BLOCKS_CONVERT) {
4042 ret = ext4_convert_unwritten_extents_endio(handle, inode, map, 4036 ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
4043 path); 4037 ppath);
4044 if (ret >= 0) { 4038 if (ret >= 0) {
4045 ext4_update_inode_fsync_trans(handle, inode, 1); 4039 ext4_update_inode_fsync_trans(handle, inode, 1);
4046 err = check_eofblocks_fl(handle, inode, map->m_lblk, 4040 err = check_eofblocks_fl(handle, inode, map->m_lblk,
@@ -4078,7 +4072,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4078 } 4072 }
4079 4073
4080 /* buffered write, writepage time, convert*/ 4074 /* buffered write, writepage time, convert*/
4081 ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); 4075 ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags);
4082 if (ret >= 0) 4076 if (ret >= 0)
4083 ext4_update_inode_fsync_trans(handle, inode, 1); 4077 ext4_update_inode_fsync_trans(handle, inode, 1);
4084out: 4078out:
@@ -4279,7 +4273,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4279 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 4273 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
4280 4274
4281 /* find extent for this block */ 4275 /* find extent for this block */
4282 path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0); 4276 path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
4283 if (IS_ERR(path)) { 4277 if (IS_ERR(path)) {
4284 err = PTR_ERR(path); 4278 err = PTR_ERR(path);
4285 path = NULL; 4279 path = NULL;
@@ -4291,7 +4285,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4291 /* 4285 /*
4292 * consistent leaf must not be empty; 4286 * consistent leaf must not be empty;
4293 * this situation is possible, though, _during_ tree modification; 4287 * this situation is possible, though, _during_ tree modification;
4294 * this is why assert can't be put in ext4_ext_find_extent() 4288 * this is why assert can't be put in ext4_find_extent()
4295 */ 4289 */
4296 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 4290 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
4297 EXT4_ERROR_INODE(inode, "bad extent address " 4291 EXT4_ERROR_INODE(inode, "bad extent address "
@@ -4331,15 +4325,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4331 */ 4325 */
4332 if ((!ext4_ext_is_unwritten(ex)) && 4326 if ((!ext4_ext_is_unwritten(ex)) &&
4333 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { 4327 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4334 allocated = ext4_ext_convert_initialized_extent( 4328 allocated = convert_initialized_extent(
4335 handle, inode, map, path, flags, 4329 handle, inode, map, &path,
4336 allocated, newblock); 4330 flags, allocated, newblock);
4337 goto out2; 4331 goto out2;
4338 } else if (!ext4_ext_is_unwritten(ex)) 4332 } else if (!ext4_ext_is_unwritten(ex))
4339 goto out; 4333 goto out;
4340 4334
4341 ret = ext4_ext_handle_unwritten_extents( 4335 ret = ext4_ext_handle_unwritten_extents(
4342 handle, inode, map, path, flags, 4336 handle, inode, map, &path, flags,
4343 allocated, newblock); 4337 allocated, newblock);
4344 if (ret < 0) 4338 if (ret < 0)
4345 err = ret; 4339 err = ret;
@@ -4376,7 +4370,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4376 4370
4377 /* 4371 /*
4378 * If we are doing bigalloc, check to see if the extent returned 4372 * If we are doing bigalloc, check to see if the extent returned
4379 * by ext4_ext_find_extent() implies a cluster we can use. 4373 * by ext4_find_extent() implies a cluster we can use.
4380 */ 4374 */
4381 if (cluster_offset && ex && 4375 if (cluster_offset && ex &&
4382 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { 4376 get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
@@ -4451,6 +4445,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4451 ar.flags = 0; 4445 ar.flags = 0;
4452 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) 4446 if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
4453 ar.flags |= EXT4_MB_HINT_NOPREALLOC; 4447 ar.flags |= EXT4_MB_HINT_NOPREALLOC;
4448 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
4449 ar.flags |= EXT4_MB_DELALLOC_RESERVED;
4454 newblock = ext4_mb_new_blocks(handle, &ar, &err); 4450 newblock = ext4_mb_new_blocks(handle, &ar, &err);
4455 if (!newblock) 4451 if (!newblock)
4456 goto out2; 4452 goto out2;
@@ -4486,7 +4482,7 @@ got_allocated_blocks:
4486 err = check_eofblocks_fl(handle, inode, map->m_lblk, 4482 err = check_eofblocks_fl(handle, inode, map->m_lblk,
4487 path, ar.len); 4483 path, ar.len);
4488 if (!err) 4484 if (!err)
4489 err = ext4_ext_insert_extent(handle, inode, path, 4485 err = ext4_ext_insert_extent(handle, inode, &path,
4490 &newex, flags); 4486 &newex, flags);
4491 4487
4492 if (!err && set_unwritten) { 4488 if (!err && set_unwritten) {
@@ -4619,10 +4615,8 @@ out:
4619 map->m_pblk = newblock; 4615 map->m_pblk = newblock;
4620 map->m_len = allocated; 4616 map->m_len = allocated;
4621out2: 4617out2:
4622 if (path) { 4618 ext4_ext_drop_refs(path);
4623 ext4_ext_drop_refs(path); 4619 kfree(path);
4624 kfree(path);
4625 }
4626 4620
4627 trace_ext4_ext_map_blocks_exit(inode, flags, map, 4621 trace_ext4_ext_map_blocks_exit(inode, flags, map,
4628 err ? err : allocated); 4622 err ? err : allocated);
@@ -4799,7 +4793,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4799 max_blocks -= lblk; 4793 max_blocks -= lblk;
4800 4794
4801 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | 4795 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
4802 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; 4796 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4797 EXT4_EX_NOCACHE;
4803 if (mode & FALLOC_FL_KEEP_SIZE) 4798 if (mode & FALLOC_FL_KEEP_SIZE)
4804 flags |= EXT4_GET_BLOCKS_KEEP_SIZE; 4799 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4805 4800
@@ -4837,15 +4832,21 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4837 ext4_inode_block_unlocked_dio(inode); 4832 ext4_inode_block_unlocked_dio(inode);
4838 inode_dio_wait(inode); 4833 inode_dio_wait(inode);
4839 4834
4835 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4836 flags, mode);
4837 if (ret)
4838 goto out_dio;
4840 /* 4839 /*
4841 * Remove entire range from the extent status tree. 4840 * Remove entire range from the extent status tree.
4841 *
4842 * ext4_es_remove_extent(inode, lblk, max_blocks) is
4843 * NOT sufficient. I'm not sure why this is the case,
4844 * but let's be conservative and remove the extent
4845 * status tree for the entire inode. There should be
4846 * no outstanding delalloc extents thanks to the
4847 * filemap_write_and_wait_range() call above.
4842 */ 4848 */
4843 ret = ext4_es_remove_extent(inode, lblk, max_blocks); 4849 ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
4844 if (ret)
4845 goto out_dio;
4846
4847 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4848 flags, mode);
4849 if (ret) 4850 if (ret)
4850 goto out_dio; 4851 goto out_dio;
4851 } 4852 }
@@ -5304,36 +5305,31 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5304 struct ext4_ext_path *path; 5305 struct ext4_ext_path *path;
5305 int ret = 0, depth; 5306 int ret = 0, depth;
5306 struct ext4_extent *extent; 5307 struct ext4_extent *extent;
5307 ext4_lblk_t stop_block, current_block; 5308 ext4_lblk_t stop_block;
5308 ext4_lblk_t ex_start, ex_end; 5309 ext4_lblk_t ex_start, ex_end;
5309 5310
5310 /* Let path point to the last extent */ 5311 /* Let path point to the last extent */
5311 path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); 5312 path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
5312 if (IS_ERR(path)) 5313 if (IS_ERR(path))
5313 return PTR_ERR(path); 5314 return PTR_ERR(path);
5314 5315
5315 depth = path->p_depth; 5316 depth = path->p_depth;
5316 extent = path[depth].p_ext; 5317 extent = path[depth].p_ext;
5317 if (!extent) { 5318 if (!extent)
5318 ext4_ext_drop_refs(path); 5319 goto out;
5319 kfree(path);
5320 return ret;
5321 }
5322 5320
5323 stop_block = le32_to_cpu(extent->ee_block) + 5321 stop_block = le32_to_cpu(extent->ee_block) +
5324 ext4_ext_get_actual_len(extent); 5322 ext4_ext_get_actual_len(extent);
5325 ext4_ext_drop_refs(path);
5326 kfree(path);
5327 5323
5328 /* Nothing to shift, if hole is at the end of file */ 5324 /* Nothing to shift, if hole is at the end of file */
5329 if (start >= stop_block) 5325 if (start >= stop_block)
5330 return ret; 5326 goto out;
5331 5327
5332 /* 5328 /*
5333 * Don't start shifting extents until we make sure the hole is big 5329 * Don't start shifting extents until we make sure the hole is big
5334 * enough to accomodate the shift. 5330 * enough to accomodate the shift.
5335 */ 5331 */
5336 path = ext4_ext_find_extent(inode, start - 1, NULL, 0); 5332 path = ext4_find_extent(inode, start - 1, &path, 0);
5337 if (IS_ERR(path)) 5333 if (IS_ERR(path))
5338 return PTR_ERR(path); 5334 return PTR_ERR(path);
5339 depth = path->p_depth; 5335 depth = path->p_depth;
@@ -5346,8 +5342,6 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5346 ex_start = 0; 5342 ex_start = 0;
5347 ex_end = 0; 5343 ex_end = 0;
5348 } 5344 }
5349 ext4_ext_drop_refs(path);
5350 kfree(path);
5351 5345
5352 if ((start == ex_start && shift > ex_start) || 5346 if ((start == ex_start && shift > ex_start) ||
5353 (shift > start - ex_end)) 5347 (shift > start - ex_end))
@@ -5355,7 +5349,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5355 5349
5356 /* It's safe to start updating extents */ 5350 /* It's safe to start updating extents */
5357 while (start < stop_block) { 5351 while (start < stop_block) {
5358 path = ext4_ext_find_extent(inode, start, NULL, 0); 5352 path = ext4_find_extent(inode, start, &path, 0);
5359 if (IS_ERR(path)) 5353 if (IS_ERR(path))
5360 return PTR_ERR(path); 5354 return PTR_ERR(path);
5361 depth = path->p_depth; 5355 depth = path->p_depth;
@@ -5365,27 +5359,23 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
5365 (unsigned long) start); 5359 (unsigned long) start);
5366 return -EIO; 5360 return -EIO;
5367 } 5361 }
5368 5362 if (start > le32_to_cpu(extent->ee_block)) {
5369 current_block = le32_to_cpu(extent->ee_block);
5370 if (start > current_block) {
5371 /* Hole, move to the next extent */ 5363 /* Hole, move to the next extent */
5372 ret = mext_next_extent(inode, path, &extent); 5364 if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
5373 if (ret != 0) { 5365 path[depth].p_ext++;
5374 ext4_ext_drop_refs(path); 5366 } else {
5375 kfree(path); 5367 start = ext4_ext_next_allocated_block(path);
5376 if (ret == 1) 5368 continue;
5377 ret = 0;
5378 break;
5379 } 5369 }
5380 } 5370 }
5381 ret = ext4_ext_shift_path_extents(path, shift, inode, 5371 ret = ext4_ext_shift_path_extents(path, shift, inode,
5382 handle, &start); 5372 handle, &start);
5383 ext4_ext_drop_refs(path);
5384 kfree(path);
5385 if (ret) 5373 if (ret)
5386 break; 5374 break;
5387 } 5375 }
5388 5376out:
5377 ext4_ext_drop_refs(path);
5378 kfree(path);
5389 return ret; 5379 return ret;
5390} 5380}
5391 5381
@@ -5508,3 +5498,199 @@ out_mutex:
5508 mutex_unlock(&inode->i_mutex); 5498 mutex_unlock(&inode->i_mutex);
5509 return ret; 5499 return ret;
5510} 5500}
5501
5502/**
5503 * ext4_swap_extents - Swap extents between two inodes
5504 *
5505 * @inode1: First inode
5506 * @inode2: Second inode
5507 * @lblk1: Start block for first inode
5508 * @lblk2: Start block for second inode
5509 * @count: Number of blocks to swap
5510 * @unwritten: Mark second inode's extents as unwritten after swap
5511 * @erp: Pointer to save error value
5512 *
5513 * This helper routine does exactly what its name promises: it swaps extents. All other
5514 * work, such as page-cache locking consistency, bh mapping consistency or
5515 * copying of the extents' data, must be performed by the caller.
5516 * Locking:
5517 * i_mutex is held for both inodes
5518 * i_data_sem is locked for write for both inodes
5519 * Assumptions:
5520 * All pages from requested range are locked for both inodes
5521 */
5522int
5523ext4_swap_extents(handle_t *handle, struct inode *inode1,
5524 struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
5525 ext4_lblk_t count, int unwritten, int *erp)
5526{
5527 struct ext4_ext_path *path1 = NULL;
5528 struct ext4_ext_path *path2 = NULL;
5529 int replaced_count = 0;
5530
5531 BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
5532 BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
5533 BUG_ON(!mutex_is_locked(&inode1->i_mutex));
5534 BUG_ON(!mutex_is_locked(&inode2->i_mutex));
5535
5536 *erp = ext4_es_remove_extent(inode1, lblk1, count);
5537 if (unlikely(*erp))
5538 return 0;
5539 *erp = ext4_es_remove_extent(inode2, lblk2, count);
5540 if (unlikely(*erp))
5541 return 0;
5542
5543 while (count) {
5544 struct ext4_extent *ex1, *ex2, tmp_ex;
5545 ext4_lblk_t e1_blk, e2_blk;
5546 int e1_len, e2_len, len;
5547 int split = 0;
5548
5549 path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
5550 if (unlikely(IS_ERR(path1))) {
5551 *erp = PTR_ERR(path1);
5552 path1 = NULL;
5553 finish:
5554 count = 0;
5555 goto repeat;
5556 }
5557 path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
5558 if (unlikely(IS_ERR(path2))) {
5559 *erp = PTR_ERR(path2);
5560 path2 = NULL;
5561 goto finish;
5562 }
5563 ex1 = path1[path1->p_depth].p_ext;
5564 ex2 = path2[path2->p_depth].p_ext;
5565 /* Do we have something to swap? */
5566 if (unlikely(!ex2 || !ex1))
5567 goto finish;
5568
5569 e1_blk = le32_to_cpu(ex1->ee_block);
5570 e2_blk = le32_to_cpu(ex2->ee_block);
5571 e1_len = ext4_ext_get_actual_len(ex1);
5572 e2_len = ext4_ext_get_actual_len(ex2);
5573
5574 /* Hole handling */
5575 if (!in_range(lblk1, e1_blk, e1_len) ||
5576 !in_range(lblk2, e2_blk, e2_len)) {
5577 ext4_lblk_t next1, next2;
5578
5579 /* if hole after extent, then go to next extent */
5580 next1 = ext4_ext_next_allocated_block(path1);
5581 next2 = ext4_ext_next_allocated_block(path2);
5582 /* If hole before extent, then shift to that extent */
5583 if (e1_blk > lblk1)
5584 next1 = e1_blk;
5585 if (e2_blk > lblk2)
5586 next2 = e2_blk;
5587 /* Do we have something to swap? */
5588 if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
5589 goto finish;
5590 /* Move to the rightmost boundary */
5591 len = next1 - lblk1;
5592 if (len < next2 - lblk2)
5593 len = next2 - lblk2;
5594 if (len > count)
5595 len = count;
5596 lblk1 += len;
5597 lblk2 += len;
5598 count -= len;
5599 goto repeat;
5600 }
5601
5602 /* Prepare left boundary */
5603 if (e1_blk < lblk1) {
5604 split = 1;
5605 *erp = ext4_force_split_extent_at(handle, inode1,
5606 &path1, lblk1, 0);
5607 if (unlikely(*erp))
5608 goto finish;
5609 }
5610 if (e2_blk < lblk2) {
5611 split = 1;
5612 *erp = ext4_force_split_extent_at(handle, inode2,
5613 &path2, lblk2, 0);
5614 if (unlikely(*erp))
5615 goto finish;
5616 }
5617 /* ext4_split_extent_at() may result in a leaf extent split,
5618 * so the path must be revalidated. */
5619 if (split)
5620 goto repeat;
5621
5622 /* Prepare right boundary */
5623 len = count;
5624 if (len > e1_blk + e1_len - lblk1)
5625 len = e1_blk + e1_len - lblk1;
5626 if (len > e2_blk + e2_len - lblk2)
5627 len = e2_blk + e2_len - lblk2;
5628
5629 if (len != e1_len) {
5630 split = 1;
5631 *erp = ext4_force_split_extent_at(handle, inode1,
5632 &path1, lblk1 + len, 0);
5633 if (unlikely(*erp))
5634 goto finish;
5635 }
5636 if (len != e2_len) {
5637 split = 1;
5638 *erp = ext4_force_split_extent_at(handle, inode2,
5639 &path2, lblk2 + len, 0);
5640 if (*erp)
5641 goto finish;
5642 }
5643 /* ext4_split_extent_at() may result in a leaf extent split,
5644 * so the path must be revalidated. */
5645 if (split)
5646 goto repeat;
5647
5648 BUG_ON(e2_len != e1_len);
5649 *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
5650 if (unlikely(*erp))
5651 goto finish;
5652 *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
5653 if (unlikely(*erp))
5654 goto finish;
5655
5656 /* Both extents are fully inside boundaries. Swap them now */
5657 tmp_ex = *ex1;
5658 ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
5659 ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
5660 ex1->ee_len = cpu_to_le16(e2_len);
5661 ex2->ee_len = cpu_to_le16(e1_len);
5662 if (unwritten)
5663 ext4_ext_mark_unwritten(ex2);
5664 if (ext4_ext_is_unwritten(&tmp_ex))
5665 ext4_ext_mark_unwritten(ex1);
5666
5667 ext4_ext_try_to_merge(handle, inode2, path2, ex2);
5668 ext4_ext_try_to_merge(handle, inode1, path1, ex1);
5669 *erp = ext4_ext_dirty(handle, inode2, path2 +
5670 path2->p_depth);
5671 if (unlikely(*erp))
5672 goto finish;
5673 *erp = ext4_ext_dirty(handle, inode1, path1 +
5674 path1->p_depth);
5675 /*
5676 * Looks scary, eh? The second inode already points to the new blocks,
5677 * and it was successfully dirtied. But luckily an error here can only
5678 * be caused by a journal failure, so the full transaction will be
5679 * aborted anyway.
5680 */
5681 if (unlikely(*erp))
5682 goto finish;
5683 lblk1 += len;
5684 lblk2 += len;
5685 replaced_count += len;
5686 count -= len;
5687
5688 repeat:
5689 ext4_ext_drop_refs(path1);
5690 kfree(path1);
5691 ext4_ext_drop_refs(path2);
5692 kfree(path2);
5693 path1 = path2 = NULL;
5694 }
5695 return replaced_count;
5696}
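
A hypothetical call site, following the locking assumptions spelled out in the comment block above (the double_down/double_up helpers are the move_extent.c ones declared in the ext4.h hunk earlier); this is a sketch, not the actual ext4_move_extents() code:

    int err = 0;
    ext4_lblk_t swapped;

    /* i_mutex of both inodes held, pages in the range locked, and the
     * data already copied by the caller, per the assumptions above. */
    ext4_double_down_write_data_sem(inode1, inode2);
    swapped = ext4_swap_extents(handle, inode1, inode2, lblk1, lblk2,
                                count, 1 /* mark unwritten */, &err);
    ext4_double_up_write_data_sem(inode1, inode2);
    if (err)
            return err;     /* 'swapped' says how much of a prefix made it */
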
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 0b7e28e7eaa4..94e7855ae71b 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -11,6 +11,8 @@
11 */ 11 */
12#include <linux/rbtree.h> 12#include <linux/rbtree.h>
13#include <linux/list_sort.h> 13#include <linux/list_sort.h>
14#include <linux/proc_fs.h>
15#include <linux/seq_file.h>
14#include "ext4.h" 16#include "ext4.h"
15#include "extents_status.h" 17#include "extents_status.h"
16 18
@@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
313 */ 315 */
314 if (!ext4_es_is_delayed(es)) { 316 if (!ext4_es_is_delayed(es)) {
315 EXT4_I(inode)->i_es_lru_nr++; 317 EXT4_I(inode)->i_es_lru_nr++;
316 percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); 318 percpu_counter_inc(&EXT4_SB(inode->i_sb)->
319 s_es_stats.es_stats_lru_cnt);
317 } 320 }
318 321
322 EXT4_I(inode)->i_es_all_nr++;
323 percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
324
319 return es; 325 return es;
320} 326}
321 327
322static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) 328static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
323{ 329{
330 EXT4_I(inode)->i_es_all_nr--;
331 percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
332
324 /* Decrease the lru counter when this es is not delayed */ 333 /* Decrease the lru counter when this es is not delayed */
325 if (!ext4_es_is_delayed(es)) { 334 if (!ext4_es_is_delayed(es)) {
326 BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); 335 BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
327 EXT4_I(inode)->i_es_lru_nr--; 336 EXT4_I(inode)->i_es_lru_nr--;
328 percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); 337 percpu_counter_dec(&EXT4_SB(inode->i_sb)->
338 s_es_stats.es_stats_lru_cnt);
329 } 339 }
330 340
331 kmem_cache_free(ext4_es_cachep, es); 341 kmem_cache_free(ext4_es_cachep, es);
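
The old single s_extent_cache_cnt percpu counter is split in two: es_stats_all_cnt counts every cached extent, while es_stats_lru_cnt counts only the non-delayed (reclaimable) ones the shrinker can act on. A runnable toy model of the pairing (plain longs here instead of percpu counters):

    #include <stdio.h>

    struct es_stats { long all_cnt, lru_cnt; };

    static void es_alloc(struct es_stats *s, int delayed)
    {
            s->all_cnt++;           /* every cached extent is counted */
            if (!delayed)
                    s->lru_cnt++;   /* only non-delayed ones are reclaimable */
    }

    static void es_free(struct es_stats *s, int delayed)
    {
            s->all_cnt--;
            if (!delayed)
                    s->lru_cnt--;
    }

    int main(void)
    {
            struct es_stats s = { 0, 0 };

            es_alloc(&s, 0);        /* written extent: both counters */
            es_alloc(&s, 1);        /* delayed extent: total only */
            es_free(&s, 1);
            printf("all=%ld lru=%ld\n", s.all_cnt, s.lru_cnt); /* all=1 lru=1 */
            return 0;
    }
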
@@ -426,7 +436,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
426 unsigned short ee_len; 436 unsigned short ee_len;
427 int depth, ee_status, es_status; 437 int depth, ee_status, es_status;
428 438
429 path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); 439 path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
430 if (IS_ERR(path)) 440 if (IS_ERR(path))
431 return; 441 return;
432 442
@@ -499,10 +509,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
499 } 509 }
500 } 510 }
501out: 511out:
502 if (path) { 512 ext4_ext_drop_refs(path);
503 ext4_ext_drop_refs(path); 513 kfree(path);
504 kfree(path);
505 }
506} 514}
507 515
508static void ext4_es_insert_extent_ind_check(struct inode *inode, 516static void ext4_es_insert_extent_ind_check(struct inode *inode,
@@ -731,6 +739,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
731 struct extent_status *es) 739 struct extent_status *es)
732{ 740{
733 struct ext4_es_tree *tree; 741 struct ext4_es_tree *tree;
742 struct ext4_es_stats *stats;
734 struct extent_status *es1 = NULL; 743 struct extent_status *es1 = NULL;
735 struct rb_node *node; 744 struct rb_node *node;
736 int found = 0; 745 int found = 0;
@@ -767,11 +776,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
767 } 776 }
768 777
769out: 778out:
779 stats = &EXT4_SB(inode->i_sb)->s_es_stats;
770 if (found) { 780 if (found) {
771 BUG_ON(!es1); 781 BUG_ON(!es1);
772 es->es_lblk = es1->es_lblk; 782 es->es_lblk = es1->es_lblk;
773 es->es_len = es1->es_len; 783 es->es_len = es1->es_len;
774 es->es_pblk = es1->es_pblk; 784 es->es_pblk = es1->es_pblk;
785 stats->es_stats_cache_hits++;
786 } else {
787 stats->es_stats_cache_misses++;
775 } 788 }
776 789
777 read_unlock(&EXT4_I(inode)->i_es_lock); 790 read_unlock(&EXT4_I(inode)->i_es_lock);
@@ -933,11 +946,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
933 struct ext4_inode_info *locked_ei) 946 struct ext4_inode_info *locked_ei)
934{ 947{
935 struct ext4_inode_info *ei; 948 struct ext4_inode_info *ei;
949 struct ext4_es_stats *es_stats;
936 struct list_head *cur, *tmp; 950 struct list_head *cur, *tmp;
937 LIST_HEAD(skipped); 951 LIST_HEAD(skipped);
952 ktime_t start_time;
953 u64 scan_time;
938 int nr_shrunk = 0; 954 int nr_shrunk = 0;
939 int retried = 0, skip_precached = 1, nr_skipped = 0; 955 int retried = 0, skip_precached = 1, nr_skipped = 0;
940 956
957 es_stats = &sbi->s_es_stats;
958 start_time = ktime_get();
941 spin_lock(&sbi->s_es_lru_lock); 959 spin_lock(&sbi->s_es_lru_lock);
942 960
943retry: 961retry:
@@ -948,7 +966,8 @@ retry:
948 * If we have already reclaimed all extents from extent 966 * If we have already reclaimed all extents from extent
949 * status tree, just stop the loop immediately. 967 * status tree, just stop the loop immediately.
950 */ 968 */
951 if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) 969 if (percpu_counter_read_positive(
970 &es_stats->es_stats_lru_cnt) == 0)
952 break; 971 break;
953 972
954 ei = list_entry(cur, struct ext4_inode_info, i_es_lru); 973 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
@@ -958,7 +977,7 @@ retry:
958 * time. Normally we try hard to avoid shrinking 977 * time. Normally we try hard to avoid shrinking
959 * precached inodes, but we will as a last resort. 978 * precached inodes, but we will as a last resort.
960 */ 979 */
961 if ((sbi->s_es_last_sorted < ei->i_touch_when) || 980 if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
962 (skip_precached && ext4_test_inode_state(&ei->vfs_inode, 981 (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
963 EXT4_STATE_EXT_PRECACHED))) { 982 EXT4_STATE_EXT_PRECACHED))) {
964 nr_skipped++; 983 nr_skipped++;
@@ -992,7 +1011,7 @@ retry:
992 if ((nr_shrunk == 0) && nr_skipped && !retried) { 1011 if ((nr_shrunk == 0) && nr_skipped && !retried) {
993 retried++; 1012 retried++;
994 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); 1013 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
995 sbi->s_es_last_sorted = jiffies; 1014 es_stats->es_stats_last_sorted = jiffies;
996 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, 1015 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
997 i_es_lru); 1016 i_es_lru);
998 /* 1017 /*
@@ -1010,6 +1029,22 @@ retry:
1010 if (locked_ei && nr_shrunk == 0) 1029 if (locked_ei && nr_shrunk == 0)
1011 nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); 1030 nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
1012 1031
1032 scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1033 if (likely(es_stats->es_stats_scan_time))
1034 es_stats->es_stats_scan_time = (scan_time +
1035 es_stats->es_stats_scan_time*3) / 4;
1036 else
1037 es_stats->es_stats_scan_time = scan_time;
1038 if (scan_time > es_stats->es_stats_max_scan_time)
1039 es_stats->es_stats_max_scan_time = scan_time;
1040 if (likely(es_stats->es_stats_shrunk))
1041 es_stats->es_stats_shrunk = (nr_shrunk +
1042 es_stats->es_stats_shrunk*3) / 4;
1043 else
1044 es_stats->es_stats_shrunk = nr_shrunk;
1045
1046 trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached,
1047 nr_skipped, retried);
1013 return nr_shrunk; 1048 return nr_shrunk;
1014} 1049}
1015 1050
@@ -1020,8 +1055,8 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
1020 struct ext4_sb_info *sbi; 1055 struct ext4_sb_info *sbi;
1021 1056
1022 sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); 1057 sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
1023 nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); 1058 nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
1024 trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr); 1059 trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
1025 return nr; 1060 return nr;
1026} 1061}
1027 1062
@@ -1033,31 +1068,160 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
1033 int nr_to_scan = sc->nr_to_scan; 1068 int nr_to_scan = sc->nr_to_scan;
1034 int ret, nr_shrunk; 1069 int ret, nr_shrunk;
1035 1070
1036 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); 1071 ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
1037 trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); 1072 trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
1038 1073
1039 if (!nr_to_scan) 1074 if (!nr_to_scan)
1040 return ret; 1075 return ret;
1041 1076
1042 nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); 1077 nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
1043 1078
1044 trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); 1079 trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
1045 return nr_shrunk; 1080 return nr_shrunk;
1046} 1081}
1047 1082
1048void ext4_es_register_shrinker(struct ext4_sb_info *sbi) 1083static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
1049{ 1084{
1085 return *pos ? NULL : SEQ_START_TOKEN;
1086}
1087
1088static void *
1089ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
1090{
1091 return NULL;
1092}
1093
1094static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
1095{
1096 struct ext4_sb_info *sbi = seq->private;
1097 struct ext4_es_stats *es_stats = &sbi->s_es_stats;
1098 struct ext4_inode_info *ei, *max = NULL;
1099 unsigned int inode_cnt = 0;
1100
1101 if (v != SEQ_START_TOKEN)
1102 return 0;
1103
1104	/* find the inode with the maximum number of objects */
1105 spin_lock(&sbi->s_es_lru_lock);
1106 list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
1107 inode_cnt++;
1108 if (max && max->i_es_all_nr < ei->i_es_all_nr)
1109 max = ei;
1110 else if (!max)
1111 max = ei;
1112 }
1113 spin_unlock(&sbi->s_es_lru_lock);
1114
1115 seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
1116 percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
1117 percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
1118 seq_printf(seq, " %lu/%lu cache hits/misses\n",
1119 es_stats->es_stats_cache_hits,
1120 es_stats->es_stats_cache_misses);
1121 if (es_stats->es_stats_last_sorted != 0)
1122 seq_printf(seq, " %u ms last sorted interval\n",
1123 jiffies_to_msecs(jiffies -
1124 es_stats->es_stats_last_sorted));
1125 if (inode_cnt)
1126 seq_printf(seq, " %d inodes on lru list\n", inode_cnt);
1127
1128 seq_printf(seq, "average:\n %llu us scan time\n",
1129 div_u64(es_stats->es_stats_scan_time, 1000));
1130 seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk);
1131 if (inode_cnt)
1132 seq_printf(seq,
1133 "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
1134 " %llu us max scan time\n",
1135 max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
1136 div_u64(es_stats->es_stats_max_scan_time, 1000));
1137
1138 return 0;
1139}
1140
1141static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
1142{
1143}
1144
1145static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
1146 .start = ext4_es_seq_shrinker_info_start,
1147 .next = ext4_es_seq_shrinker_info_next,
1148 .stop = ext4_es_seq_shrinker_info_stop,
1149 .show = ext4_es_seq_shrinker_info_show,
1150};
1151
1152static int
1153ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
1154{
1155 int ret;
1156
1157 ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
1158 if (!ret) {
1159 struct seq_file *m = file->private_data;
1160 m->private = PDE_DATA(inode);
1161 }
1162
1163 return ret;
1164}
1165
1166static int
1167ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
1168{
1169 return seq_release(inode, file);
1170}
1171
1172static const struct file_operations ext4_es_seq_shrinker_info_fops = {
1173 .owner = THIS_MODULE,
1174 .open = ext4_es_seq_shrinker_info_open,
1175 .read = seq_read,
1176 .llseek = seq_lseek,
1177 .release = ext4_es_seq_shrinker_info_release,
1178};
1179
1180int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
1181{
1182 int err;
1183
1050 INIT_LIST_HEAD(&sbi->s_es_lru); 1184 INIT_LIST_HEAD(&sbi->s_es_lru);
1051 spin_lock_init(&sbi->s_es_lru_lock); 1185 spin_lock_init(&sbi->s_es_lru_lock);
1052 sbi->s_es_last_sorted = 0; 1186 sbi->s_es_stats.es_stats_last_sorted = 0;
1187 sbi->s_es_stats.es_stats_shrunk = 0;
1188 sbi->s_es_stats.es_stats_cache_hits = 0;
1189 sbi->s_es_stats.es_stats_cache_misses = 0;
1190 sbi->s_es_stats.es_stats_scan_time = 0;
1191 sbi->s_es_stats.es_stats_max_scan_time = 0;
1192 err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
1193 if (err)
1194 return err;
1195 err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL);
1196 if (err)
1197 goto err1;
1198
1053 sbi->s_es_shrinker.scan_objects = ext4_es_scan; 1199 sbi->s_es_shrinker.scan_objects = ext4_es_scan;
1054 sbi->s_es_shrinker.count_objects = ext4_es_count; 1200 sbi->s_es_shrinker.count_objects = ext4_es_count;
1055 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; 1201 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
1056 register_shrinker(&sbi->s_es_shrinker); 1202 err = register_shrinker(&sbi->s_es_shrinker);
1203 if (err)
1204 goto err2;
1205
1206 if (sbi->s_proc)
1207 proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
1208 &ext4_es_seq_shrinker_info_fops, sbi);
1209
1210 return 0;
1211
1212err2:
1213 percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
1214err1:
1215 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1216 return err;
1057} 1217}
1058 1218
1059void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) 1219void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
1060{ 1220{
1221 if (sbi->s_proc)
1222 remove_proc_entry("es_shrinker_info", sbi->s_proc);
1223 percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
1224 percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
1061 unregister_shrinker(&sbi->s_es_shrinker); 1225 unregister_shrinker(&sbi->s_es_shrinker);
1062} 1226}
1063 1227
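
The shrinker above smooths both es_stats_scan_time and es_stats_shrunk with the same weighted update, new = (sample + 3 * old) / 4, i.e. an exponential moving average with alpha = 1/4, seeded by the first sample. A minimal userspace sketch of that arithmetic (the ema4() helper and the sample values are illustrative, not part of the patch):

#include <stdio.h>
#include <stdint.h>

/* Exponential moving average with alpha = 1/4, matching the
 * (sample + 3*avg) / 4 update used for es_stats_scan_time and
 * es_stats_shrunk.  The first sample seeds the average so it
 * does not climb up from zero. */
static uint64_t ema4(uint64_t avg, uint64_t sample)
{
	if (avg == 0)
		return sample;
	return (sample + 3 * avg) / 4;
}

int main(void)
{
	uint64_t scan_ns[] = { 8000, 12000, 4000, 20000, 6000 };
	uint64_t avg = 0;
	int i;

	for (i = 0; i < 5; i++) {
		avg = ema4(avg, scan_ns[i]);
		printf("sample %llu ns -> avg %llu ns\n",
		       (unsigned long long)scan_ns[i],
		       (unsigned long long)avg);
	}
	return 0;
}

With alpha = 1/4 a single slow scan moves the average by only a quarter of the difference, so one outlier cannot dominate the value reported through es_shrinker_info.
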
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f1b62a419920..efd5f970b501 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -64,6 +64,17 @@ struct ext4_es_tree {
64 struct extent_status *cache_es; /* recently accessed extent */ 64 struct extent_status *cache_es; /* recently accessed extent */
65}; 65};
66 66
67struct ext4_es_stats {
68 unsigned long es_stats_last_sorted;
69 unsigned long es_stats_shrunk;
70 unsigned long es_stats_cache_hits;
71 unsigned long es_stats_cache_misses;
72 u64 es_stats_scan_time;
73 u64 es_stats_max_scan_time;
74 struct percpu_counter es_stats_all_cnt;
75 struct percpu_counter es_stats_lru_cnt;
76};
77
67extern int __init ext4_init_es(void); 78extern int __init ext4_init_es(void);
68extern void ext4_exit_es(void); 79extern void ext4_exit_es(void);
69extern void ext4_es_init_tree(struct ext4_es_tree *tree); 80extern void ext4_es_init_tree(struct ext4_es_tree *tree);
@@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
138 (pb & ~ES_MASK)); 149 (pb & ~ES_MASK));
139} 150}
140 151
141extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); 152extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
142extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); 153extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
143extern void ext4_es_lru_add(struct inode *inode); 154extern void ext4_es_lru_add(struct inode *inode);
144extern void ext4_es_lru_del(struct inode *inode); 155extern void ext4_es_lru_del(struct inode *inode);
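
With the header change above, ext4_es_register_shrinker() can now fail: it initializes two percpu counters and registers the shrinker, unwinding in reverse order through the err1/err2 labels when a later step fails. A self-contained sketch of that goto-unwind idiom, with hypothetical stand-ins for percpu_counter_init()/percpu_counter_destroy():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct es_stats { long *all_cnt, *lru_cnt; };

/* Hypothetical stand-ins for percpu_counter_init()/_destroy(). */
static int counter_init(long **c)
{
	*c = calloc(1, sizeof(**c));
	return *c ? 0 : -ENOMEM;
}

static void counter_destroy(long **c)
{
	free(*c);
	*c = NULL;
}

static int shrinker_register(void) { return 0; }	/* pretend success */

static int register_stats(struct es_stats *s)
{
	int err;

	err = counter_init(&s->all_cnt);
	if (err)
		return err;		/* nothing to undo yet */
	err = counter_init(&s->lru_cnt);
	if (err)
		goto err1;
	err = shrinker_register();
	if (err)
		goto err2;
	return 0;

err2:					/* undo in reverse order */
	counter_destroy(&s->lru_cnt);
err1:
	counter_destroy(&s->all_cnt);
	return err;
}

int main(void)
{
	struct es_stats s = { 0 };

	printf("register_stats: %d\n", register_stats(&s));
	counter_destroy(&s.lru_cnt);
	counter_destroy(&s.all_cnt);
	return 0;
}
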
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index aca7b24a4432..8131be8c0af3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -137,10 +137,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
137 iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos); 137 iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
138 } 138 }
139 139
140 iocb->private = &overwrite;
140 if (o_direct) { 141 if (o_direct) {
141 blk_start_plug(&plug); 142 blk_start_plug(&plug);
142 143
143 iocb->private = &overwrite;
144 144
145 /* check whether we do a DIO overwrite or not */ 145 /* check whether we do a DIO overwrite or not */
146 if (ext4_should_dioread_nolock(inode) && !aio_mutex && 146 if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
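
The one-hunk change to ext4_file_write_iter() hoists the iocb->private assignment out of the O_DIRECT branch so the field is initialized on the buffered path as well. The shape of the fix as a tiny standalone sketch (struct req and setup() are hypothetical):

#include <stdio.h>

struct req { void *private; int direct; };

/* Before the fix, 'private' was only set inside the direct-I/O
 * branch, leaving it stale on the buffered path.  Setting it before
 * the branch covers every path. */
static void setup(struct req *r, int *overwrite)
{
	r->private = overwrite;	/* unconditional, as in the patch */
	if (r->direct) {
		/* direct-I/O-only setup stays in the branch */
	}
}

int main(void)
{
	int overwrite = 0;
	struct req r = { NULL, 0 };	/* a buffered write */

	setup(&r, &overwrite);
	printf("private initialized: %s\n", r.private ? "yes" : "no");
	return 0;
}
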
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 5b87fc36aab8..ac644c31ca67 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -887,6 +887,10 @@ got:
887 struct buffer_head *block_bitmap_bh; 887 struct buffer_head *block_bitmap_bh;
888 888
889 block_bitmap_bh = ext4_read_block_bitmap(sb, group); 889 block_bitmap_bh = ext4_read_block_bitmap(sb, group);
890 if (!block_bitmap_bh) {
891 err = -EIO;
892 goto out;
893 }
890 BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); 894 BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
891 err = ext4_journal_get_write_access(handle, block_bitmap_bh); 895 err = ext4_journal_get_write_access(handle, block_bitmap_bh);
892 if (err) { 896 if (err) {
@@ -1011,8 +1015,7 @@ got:
1011 spin_unlock(&sbi->s_next_gen_lock); 1015 spin_unlock(&sbi->s_next_gen_lock);
1012 1016
1013 /* Precompute checksum seed for inode metadata */ 1017 /* Precompute checksum seed for inode metadata */
1014 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 1018 if (ext4_has_metadata_csum(sb)) {
1015 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
1016 __u32 csum; 1019 __u32 csum;
1017 __le32 inum = cpu_to_le32(inode->i_ino); 1020 __le32 inum = cpu_to_le32(inode->i_ino);
1018 __le32 gen = cpu_to_le32(inode->i_generation); 1021 __le32 gen = cpu_to_le32(inode->i_generation);
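
Besides the new NULL check on ext4_read_block_bitmap(), the hunk above replaces the two-line EXT4_HAS_RO_COMPAT_FEATURE(..., METADATA_CSUM) test with the ext4_has_metadata_csum() predicate used throughout the rest of this series. A miniature of what such a wrapper plausibly looks like (the struct and constant below are simplified assumptions, not the real ext4.h definitions):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical miniature: a superblock carrying a ro-compat feature
 * bitmap, and a one-line predicate wrapping the bit test in the
 * spirit of ext4_has_metadata_csum(). */
#define RO_COMPAT_METADATA_CSUM	(1u << 10)

struct super { unsigned int ro_compat; };

static bool has_metadata_csum(const struct super *sb)
{
	return sb->ro_compat & RO_COMPAT_METADATA_CSUM;
}

int main(void)
{
	struct super sb = { .ro_compat = RO_COMPAT_METADATA_CSUM };

	printf("metadata_csum: %d\n", has_metadata_csum(&sb));
	return 0;
}
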
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index e75f840000a0..36b369697a13 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -318,34 +318,24 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
318 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain 318 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
319 * as described above and return 0. 319 * as described above and return 0.
320 */ 320 */
321static int ext4_alloc_branch(handle_t *handle, struct inode *inode, 321static int ext4_alloc_branch(handle_t *handle,
322 ext4_lblk_t iblock, int indirect_blks, 322 struct ext4_allocation_request *ar,
323 int *blks, ext4_fsblk_t goal, 323 int indirect_blks, ext4_lblk_t *offsets,
324 ext4_lblk_t *offsets, Indirect *branch) 324 Indirect *branch)
325{ 325{
326 struct ext4_allocation_request ar;
327 struct buffer_head * bh; 326 struct buffer_head * bh;
328 ext4_fsblk_t b, new_blocks[4]; 327 ext4_fsblk_t b, new_blocks[4];
329 __le32 *p; 328 __le32 *p;
330 int i, j, err, len = 1; 329 int i, j, err, len = 1;
331 330
332 /*
333 * Set up for the direct block allocation
334 */
335 memset(&ar, 0, sizeof(ar));
336 ar.inode = inode;
337 ar.len = *blks;
338 ar.logical = iblock;
339 if (S_ISREG(inode->i_mode))
340 ar.flags = EXT4_MB_HINT_DATA;
341
342 for (i = 0; i <= indirect_blks; i++) { 331 for (i = 0; i <= indirect_blks; i++) {
343 if (i == indirect_blks) { 332 if (i == indirect_blks) {
344 ar.goal = goal; 333 new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err);
345 new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err);
346 } else 334 } else
347 goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, 335 ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle,
348 goal, 0, NULL, &err); 336 ar->inode, ar->goal,
337 ar->flags & EXT4_MB_DELALLOC_RESERVED,
338 NULL, &err);
349 if (err) { 339 if (err) {
350 i--; 340 i--;
351 goto failed; 341 goto failed;
@@ -354,7 +344,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
354 if (i == 0) 344 if (i == 0)
355 continue; 345 continue;
356 346
357 bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); 347 bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]);
358 if (unlikely(!bh)) { 348 if (unlikely(!bh)) {
359 err = -ENOMEM; 349 err = -ENOMEM;
360 goto failed; 350 goto failed;
@@ -372,7 +362,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
372 b = new_blocks[i]; 362 b = new_blocks[i];
373 363
374 if (i == indirect_blks) 364 if (i == indirect_blks)
375 len = ar.len; 365 len = ar->len;
376 for (j = 0; j < len; j++) 366 for (j = 0; j < len; j++)
377 *p++ = cpu_to_le32(b++); 367 *p++ = cpu_to_le32(b++);
378 368
@@ -381,11 +371,10 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
381 unlock_buffer(bh); 371 unlock_buffer(bh);
382 372
383 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 373 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
384 err = ext4_handle_dirty_metadata(handle, inode, bh); 374 err = ext4_handle_dirty_metadata(handle, ar->inode, bh);
385 if (err) 375 if (err)
386 goto failed; 376 goto failed;
387 } 377 }
388 *blks = ar.len;
389 return 0; 378 return 0;
390failed: 379failed:
391 for (; i >= 0; i--) { 380 for (; i >= 0; i--) {
@@ -396,10 +385,10 @@ failed:
396 * existing before ext4_alloc_branch() was called. 385 * existing before ext4_alloc_branch() was called.
397 */ 386 */
398 if (i > 0 && i != indirect_blks && branch[i].bh) 387 if (i > 0 && i != indirect_blks && branch[i].bh)
399 ext4_forget(handle, 1, inode, branch[i].bh, 388 ext4_forget(handle, 1, ar->inode, branch[i].bh,
400 branch[i].bh->b_blocknr); 389 branch[i].bh->b_blocknr);
401 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 390 ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i],
402 (i == indirect_blks) ? ar.len : 1, 0); 391 (i == indirect_blks) ? ar->len : 1, 0);
403 } 392 }
404 return err; 393 return err;
405} 394}
@@ -419,9 +408,9 @@ failed:
419 * inode (->i_blocks, etc.). In case of success we end up with the full 408 * inode (->i_blocks, etc.). In case of success we end up with the full
420 * chain to the new block and return 0. 409 * chain to the new block and return 0.
421 */ 410 */
422static int ext4_splice_branch(handle_t *handle, struct inode *inode, 411static int ext4_splice_branch(handle_t *handle,
423 ext4_lblk_t block, Indirect *where, int num, 412 struct ext4_allocation_request *ar,
424 int blks) 413 Indirect *where, int num)
425{ 414{
426 int i; 415 int i;
427 int err = 0; 416 int err = 0;
@@ -446,9 +435,9 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
446 * Update the host buffer_head or inode to point to the just allocated 435 * Update the host buffer_head or inode to point to the just allocated
447 * direct blocks 436 * direct blocks
448 */ 437 */
449 if (num == 0 && blks > 1) { 438 if (num == 0 && ar->len > 1) {
450 current_block = le32_to_cpu(where->key) + 1; 439 current_block = le32_to_cpu(where->key) + 1;
451 for (i = 1; i < blks; i++) 440 for (i = 1; i < ar->len; i++)
452 *(where->p + i) = cpu_to_le32(current_block++); 441 *(where->p + i) = cpu_to_le32(current_block++);
453 } 442 }
454 443
@@ -465,14 +454,14 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
465 */ 454 */
466 jbd_debug(5, "splicing indirect only\n"); 455 jbd_debug(5, "splicing indirect only\n");
467 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); 456 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
468 err = ext4_handle_dirty_metadata(handle, inode, where->bh); 457 err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh);
469 if (err) 458 if (err)
470 goto err_out; 459 goto err_out;
471 } else { 460 } else {
472 /* 461 /*
473 * OK, we spliced it into the inode itself on a direct block. 462 * OK, we spliced it into the inode itself on a direct block.
474 */ 463 */
475 ext4_mark_inode_dirty(handle, inode); 464 ext4_mark_inode_dirty(handle, ar->inode);
476 jbd_debug(5, "splicing direct\n"); 465 jbd_debug(5, "splicing direct\n");
477 } 466 }
478 return err; 467 return err;
@@ -484,11 +473,11 @@ err_out:
484 * need to revoke the block, which is why we don't 473 * need to revoke the block, which is why we don't
485 * need to set EXT4_FREE_BLOCKS_METADATA. 474 * need to set EXT4_FREE_BLOCKS_METADATA.
486 */ 475 */
487 ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 476 ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1,
488 EXT4_FREE_BLOCKS_FORGET); 477 EXT4_FREE_BLOCKS_FORGET);
489 } 478 }
490 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), 479 ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key),
491 blks, 0); 480 ar->len, 0);
492 481
493 return err; 482 return err;
494} 483}
@@ -525,11 +514,11 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
525 struct ext4_map_blocks *map, 514 struct ext4_map_blocks *map,
526 int flags) 515 int flags)
527{ 516{
517 struct ext4_allocation_request ar;
528 int err = -EIO; 518 int err = -EIO;
529 ext4_lblk_t offsets[4]; 519 ext4_lblk_t offsets[4];
530 Indirect chain[4]; 520 Indirect chain[4];
531 Indirect *partial; 521 Indirect *partial;
532 ext4_fsblk_t goal;
533 int indirect_blks; 522 int indirect_blks;
534 int blocks_to_boundary = 0; 523 int blocks_to_boundary = 0;
535 int depth; 524 int depth;
@@ -579,7 +568,16 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
579 return -ENOSPC; 568 return -ENOSPC;
580 } 569 }
581 570
582 goal = ext4_find_goal(inode, map->m_lblk, partial); 571 /* Set up for the direct block allocation */
572 memset(&ar, 0, sizeof(ar));
573 ar.inode = inode;
574 ar.logical = map->m_lblk;
575 if (S_ISREG(inode->i_mode))
576 ar.flags = EXT4_MB_HINT_DATA;
577 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
578 ar.flags |= EXT4_MB_DELALLOC_RESERVED;
579
580 ar.goal = ext4_find_goal(inode, map->m_lblk, partial);
583 581
584 /* the number of blocks need to allocate for [d,t]indirect blocks */ 582 /* the number of blocks need to allocate for [d,t]indirect blocks */
585 indirect_blks = (chain + depth) - partial - 1; 583 indirect_blks = (chain + depth) - partial - 1;
@@ -588,13 +586,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
588 * Next look up the indirect map to count the total number of 586 * Next look up the indirect map to count the total number of
589 * direct blocks to allocate for this branch. 587 * direct blocks to allocate for this branch.
590 */ 588 */
591 count = ext4_blks_to_allocate(partial, indirect_blks, 589 ar.len = ext4_blks_to_allocate(partial, indirect_blks,
592 map->m_len, blocks_to_boundary); 590 map->m_len, blocks_to_boundary);
591
593 /* 592 /*
594 * Block out ext4_truncate while we alter the tree 593 * Block out ext4_truncate while we alter the tree
595 */ 594 */
596 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, 595 err = ext4_alloc_branch(handle, &ar, indirect_blks,
597 &count, goal,
598 offsets + (partial - chain), partial); 596 offsets + (partial - chain), partial);
599 597
600 /* 598 /*
@@ -605,14 +603,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
605 * may need to return -EAGAIN upwards in the worst case. --sct 603 * may need to return -EAGAIN upwards in the worst case. --sct
606 */ 604 */
607 if (!err) 605 if (!err)
608 err = ext4_splice_branch(handle, inode, map->m_lblk, 606 err = ext4_splice_branch(handle, &ar, partial, indirect_blks);
609 partial, indirect_blks, count);
610 if (err) 607 if (err)
611 goto cleanup; 608 goto cleanup;
612 609
613 map->m_flags |= EXT4_MAP_NEW; 610 map->m_flags |= EXT4_MAP_NEW;
614 611
615 ext4_update_inode_fsync_trans(handle, inode, 1); 612 ext4_update_inode_fsync_trans(handle, inode, 1);
613 count = ar.len;
616got_it: 614got_it:
617 map->m_flags |= EXT4_MAP_MAPPED; 615 map->m_flags |= EXT4_MAP_MAPPED;
618 map->m_pblk = le32_to_cpu(chain[depth-1].key); 616 map->m_pblk = le32_to_cpu(chain[depth-1].key);
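
The indirect.c refactor above threads a single struct ext4_allocation_request through ext4_alloc_branch() and ext4_splice_branch() instead of passing inode, goal, and block counts as separate scalars; the request is built once in ext4_ind_map_blocks(), and the callee's update of ar->len (how many blocks were actually granted) is visible to the caller without the old *blks out-parameter. A sketch of the pattern with hypothetical names:

#include <stdio.h>
#include <string.h>

/* Hypothetical request struct bundling what used to be separate
 * parameters (inode, logical block, goal, length, flags). */
struct alloc_request {
	const char *inode;		/* stand-in for struct inode * */
	unsigned int logical;
	unsigned long long goal;
	unsigned int len;
	unsigned int flags;
};

/* The callee reads and updates the shared request instead of taking
 * five scalars; its adjustment of ar->len is seen by the caller. */
static int alloc_branch(struct alloc_request *ar)
{
	if (ar->len > 8)
		ar->len = 8;	/* pretend only 8 blocks were free */
	return 0;
}

int main(void)
{
	struct alloc_request ar;

	memset(&ar, 0, sizeof(ar));	/* built once, as in the patch */
	ar.inode = "inode#12";
	ar.logical = 100;
	ar.goal = 4096;
	ar.len = 32;

	if (alloc_branch(&ar) == 0)
		printf("%s: allocated %u blocks at goal %llu\n",
		       ar.inode, ar.len, ar.goal);
	return 0;
}
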
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index bea662bd0ca6..3ea62695abce 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -594,6 +594,7 @@ retry:
594 if (ret) { 594 if (ret) {
595 unlock_page(page); 595 unlock_page(page);
596 page_cache_release(page); 596 page_cache_release(page);
597 page = NULL;
597 ext4_orphan_add(handle, inode); 598 ext4_orphan_add(handle, inode);
598 up_write(&EXT4_I(inode)->xattr_sem); 599 up_write(&EXT4_I(inode)->xattr_sem);
599 sem_held = 0; 600 sem_held = 0;
@@ -613,7 +614,8 @@ retry:
613 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 614 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
614 goto retry; 615 goto retry;
615 616
616 block_commit_write(page, from, to); 617 if (page)
618 block_commit_write(page, from, to);
617out: 619out:
618 if (page) { 620 if (page) {
619 unlock_page(page); 621 unlock_page(page);
@@ -1126,8 +1128,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle,
1126 memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, 1128 memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
1127 inline_size - EXT4_INLINE_DOTDOT_SIZE); 1129 inline_size - EXT4_INLINE_DOTDOT_SIZE);
1128 1130
1129 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 1131 if (ext4_has_metadata_csum(inode->i_sb))
1130 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1131 csum_size = sizeof(struct ext4_dir_entry_tail); 1132 csum_size = sizeof(struct ext4_dir_entry_tail);
1132 1133
1133 inode->i_size = inode->i_sb->s_blocksize; 1134 inode->i_size = inode->i_sb->s_blocksize;
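
The inline.c fix is a use-after-release guard: once the page is unlocked and released on the error path, the local pointer is cleared so the shared block_commit_write() call further down can test it instead of touching a released page. The same shape in standalone C:

#include <stdio.h>
#include <stdlib.h>

static void commit(char *page) { printf("commit %p\n", (void *)page); }

int main(void)
{
	char *page = malloc(64);
	int err = 1;		/* pretend the conversion failed */

	if (err) {
		free(page);
		page = NULL;	/* the one-line fix */
	}

	if (page)		/* guarded, as in the patch */
		commit(page);

	free(page);		/* free(NULL) is a no-op */
	return 0;
}
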
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3aa26e9117c4..3356ab5395f4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -83,8 +83,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
83 83
84 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 84 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
85 cpu_to_le32(EXT4_OS_LINUX) || 85 cpu_to_le32(EXT4_OS_LINUX) ||
86 !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 86 !ext4_has_metadata_csum(inode->i_sb))
87 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
88 return 1; 87 return 1;
89 88
90 provided = le16_to_cpu(raw->i_checksum_lo); 89 provided = le16_to_cpu(raw->i_checksum_lo);
@@ -105,8 +104,7 @@ static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
105 104
106 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 105 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
107 cpu_to_le32(EXT4_OS_LINUX) || 106 cpu_to_le32(EXT4_OS_LINUX) ||
108 !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 107 !ext4_has_metadata_csum(inode->i_sb))
109 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
110 return; 108 return;
111 109
112 csum = ext4_inode_csum(inode, raw, ei); 110 csum = ext4_inode_csum(inode, raw, ei);
@@ -224,16 +222,15 @@ void ext4_evict_inode(struct inode *inode)
224 goto no_delete; 222 goto no_delete;
225 } 223 }
226 224
227 if (!is_bad_inode(inode)) 225 if (is_bad_inode(inode))
228 dquot_initialize(inode); 226 goto no_delete;
227 dquot_initialize(inode);
229 228
230 if (ext4_should_order_data(inode)) 229 if (ext4_should_order_data(inode))
231 ext4_begin_ordered_truncate(inode, 0); 230 ext4_begin_ordered_truncate(inode, 0);
232 truncate_inode_pages_final(&inode->i_data); 231 truncate_inode_pages_final(&inode->i_data);
233 232
234 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); 233 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
235 if (is_bad_inode(inode))
236 goto no_delete;
237 234
238 /* 235 /*
239 * Protect us against freezing - iput() caller didn't have to have any 236 * Protect us against freezing - iput() caller didn't have to have any
@@ -590,20 +587,12 @@ found:
590 /* 587 /*
591 * New blocks allocate and/or writing to unwritten extent 588 * New blocks allocate and/or writing to unwritten extent
592 * will possibly result in updating i_data, so we take 589 * will possibly result in updating i_data, so we take
593 * the write lock of i_data_sem, and call get_blocks() 590 * the write lock of i_data_sem, and call get_block()
594 * with create == 1 flag. 591 * with create == 1 flag.
595 */ 592 */
596 down_write(&EXT4_I(inode)->i_data_sem); 593 down_write(&EXT4_I(inode)->i_data_sem);
597 594
598 /* 595 /*
599 * if the caller is from delayed allocation writeout path
600 * we have already reserved fs blocks for allocation
601 * let the underlying get_block() function know to
602 * avoid double accounting
603 */
604 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
605 ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
606 /*
607 * We need to check for EXT4 here because migrate 596 * We need to check for EXT4 here because migrate
608 * could have changed the inode type in between 597 * could have changed the inode type in between
609 */ 598 */
@@ -631,8 +620,6 @@ found:
631 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) 620 (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
632 ext4_da_update_reserve_space(inode, retval, 1); 621 ext4_da_update_reserve_space(inode, retval, 1);
633 } 622 }
634 if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
635 ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
636 623
637 if (retval > 0) { 624 if (retval > 0) {
638 unsigned int status; 625 unsigned int status;
@@ -734,11 +721,11 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
734 * `handle' can be NULL if create is zero 721 * `handle' can be NULL if create is zero
735 */ 722 */
736struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 723struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
737 ext4_lblk_t block, int create, int *errp) 724 ext4_lblk_t block, int create)
738{ 725{
739 struct ext4_map_blocks map; 726 struct ext4_map_blocks map;
740 struct buffer_head *bh; 727 struct buffer_head *bh;
741 int fatal = 0, err; 728 int err;
742 729
743 J_ASSERT(handle != NULL || create == 0); 730 J_ASSERT(handle != NULL || create == 0);
744 731
@@ -747,21 +734,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
747 err = ext4_map_blocks(handle, inode, &map, 734 err = ext4_map_blocks(handle, inode, &map,
748 create ? EXT4_GET_BLOCKS_CREATE : 0); 735 create ? EXT4_GET_BLOCKS_CREATE : 0);
749 736
750 /* ensure we send some value back into *errp */ 737 if (err == 0)
751 *errp = 0; 738 return create ? ERR_PTR(-ENOSPC) : NULL;
752
753 if (create && err == 0)
754 err = -ENOSPC; /* should never happen */
755 if (err < 0) 739 if (err < 0)
756 *errp = err; 740 return ERR_PTR(err);
757 if (err <= 0)
758 return NULL;
759 741
760 bh = sb_getblk(inode->i_sb, map.m_pblk); 742 bh = sb_getblk(inode->i_sb, map.m_pblk);
761 if (unlikely(!bh)) { 743 if (unlikely(!bh))
762 *errp = -ENOMEM; 744 return ERR_PTR(-ENOMEM);
763 return NULL;
764 }
765 if (map.m_flags & EXT4_MAP_NEW) { 745 if (map.m_flags & EXT4_MAP_NEW) {
766 J_ASSERT(create != 0); 746 J_ASSERT(create != 0);
767 J_ASSERT(handle != NULL); 747 J_ASSERT(handle != NULL);
@@ -775,44 +755,44 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
775 */ 755 */
776 lock_buffer(bh); 756 lock_buffer(bh);
777 BUFFER_TRACE(bh, "call get_create_access"); 757 BUFFER_TRACE(bh, "call get_create_access");
778 fatal = ext4_journal_get_create_access(handle, bh); 758 err = ext4_journal_get_create_access(handle, bh);
779 if (!fatal && !buffer_uptodate(bh)) { 759 if (unlikely(err)) {
760 unlock_buffer(bh);
761 goto errout;
762 }
763 if (!buffer_uptodate(bh)) {
780 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 764 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
781 set_buffer_uptodate(bh); 765 set_buffer_uptodate(bh);
782 } 766 }
783 unlock_buffer(bh); 767 unlock_buffer(bh);
784 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 768 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
785 err = ext4_handle_dirty_metadata(handle, inode, bh); 769 err = ext4_handle_dirty_metadata(handle, inode, bh);
786 if (!fatal) 770 if (unlikely(err))
787 fatal = err; 771 goto errout;
788 } else { 772 } else
789 BUFFER_TRACE(bh, "not a new buffer"); 773 BUFFER_TRACE(bh, "not a new buffer");
790 }
791 if (fatal) {
792 *errp = fatal;
793 brelse(bh);
794 bh = NULL;
795 }
796 return bh; 774 return bh;
775errout:
776 brelse(bh);
777 return ERR_PTR(err);
797} 778}
798 779
799struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 780struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
800 ext4_lblk_t block, int create, int *err) 781 ext4_lblk_t block, int create)
801{ 782{
802 struct buffer_head *bh; 783 struct buffer_head *bh;
803 784
804 bh = ext4_getblk(handle, inode, block, create, err); 785 bh = ext4_getblk(handle, inode, block, create);
805 if (!bh) 786 if (IS_ERR(bh))
806 return bh; 787 return bh;
807 if (buffer_uptodate(bh)) 788 if (!bh || buffer_uptodate(bh))
808 return bh; 789 return bh;
809 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); 790 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
810 wait_on_buffer(bh); 791 wait_on_buffer(bh);
811 if (buffer_uptodate(bh)) 792 if (buffer_uptodate(bh))
812 return bh; 793 return bh;
813 put_bh(bh); 794 put_bh(bh);
814 *err = -EIO; 795 return ERR_PTR(-EIO);
815 return NULL;
816} 796}
817 797
818int ext4_walk_page_buffers(handle_t *handle, 798int ext4_walk_page_buffers(handle_t *handle,
@@ -1536,7 +1516,7 @@ out_unlock:
1536} 1516}
1537 1517
1538/* 1518/*
1539 * This is a special get_blocks_t callback which is used by 1519 * This is a special get_block_t callback which is used by
1540 * ext4_da_write_begin(). It will either return mapped block or 1520 * ext4_da_write_begin(). It will either return mapped block or
1541 * reserve space for a single block. 1521 * reserve space for a single block.
1542 * 1522 *
@@ -2011,12 +1991,10 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
2011 * in data loss. So use reserved blocks to allocate metadata if 1991 * in data loss. So use reserved blocks to allocate metadata if
2012 * possible. 1992 * possible.
2013 * 1993 *
2014 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks 1994 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if
2015 * in question are delalloc blocks. This affects functions in many 1995 * the blocks in question are delalloc blocks. This indicates
2016 different parts of the allocation call path. This flag exists 1996 that the blocks and quotas have already been checked when
2017 * primarily because we don't want to change *many* call functions, so 1997 * the data was copied into the page cache.
2018 * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
2019 * once the inode's allocation semaphore is taken.
2020 */ 1998 */
2021 get_blocks_flags = EXT4_GET_BLOCKS_CREATE | 1999 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
2022 EXT4_GET_BLOCKS_METADATA_NOFAIL; 2000 EXT4_GET_BLOCKS_METADATA_NOFAIL;
@@ -2515,6 +2493,20 @@ static int ext4_nonda_switch(struct super_block *sb)
2515 return 0; 2493 return 0;
2516} 2494}
2517 2495
2496/* We always reserve for an inode update; the superblock could be there too */
2497static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
2498{
2499 if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
2500 EXT4_FEATURE_RO_COMPAT_LARGE_FILE)))
2501 return 1;
2502
2503 if (pos + len <= 0x7fffffffULL)
2504 return 1;
2505
2506 /* We might need to update the superblock to set LARGE_FILE */
2507 return 2;
2508}
2509
2518static int ext4_da_write_begin(struct file *file, struct address_space *mapping, 2510static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
2519 loff_t pos, unsigned len, unsigned flags, 2511 loff_t pos, unsigned len, unsigned flags,
2520 struct page **pagep, void **fsdata) 2512 struct page **pagep, void **fsdata)
@@ -2565,7 +2557,8 @@ retry_grab:
2565 * of file which has an already mapped buffer. 2557 * of file which has an already mapped buffer.
2566 */ 2558 */
2567retry_journal: 2559retry_journal:
2568 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1); 2560 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
2561 ext4_da_write_credits(inode, pos, len));
2569 if (IS_ERR(handle)) { 2562 if (IS_ERR(handle)) {
2570 page_cache_release(page); 2563 page_cache_release(page);
2571 return PTR_ERR(handle); 2564 return PTR_ERR(handle);
@@ -2658,10 +2651,7 @@ static int ext4_da_write_end(struct file *file,
2658 if (copied && new_i_size > EXT4_I(inode)->i_disksize) { 2651 if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
2659 if (ext4_has_inline_data(inode) || 2652 if (ext4_has_inline_data(inode) ||
2660 ext4_da_should_update_i_disksize(page, end)) { 2653 ext4_da_should_update_i_disksize(page, end)) {
2661 down_write(&EXT4_I(inode)->i_data_sem); 2654 ext4_update_i_disksize(inode, new_i_size);
2662 if (new_i_size > EXT4_I(inode)->i_disksize)
2663 EXT4_I(inode)->i_disksize = new_i_size;
2664 up_write(&EXT4_I(inode)->i_data_sem);
2665 /* We need to mark inode dirty even if 2655 /* We need to mark inode dirty even if
2666 * new_i_size is less than inode->i_size 2656 * new_i_size is less than inode->i_size
2667 * but greater than i_disksize. (hint: delalloc) 2657 * but greater than i_disksize. (hint: delalloc)
@@ -3936,8 +3926,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
3936 ei->i_extra_isize = 0; 3926 ei->i_extra_isize = 0;
3937 3927
3938 /* Precompute checksum seed for inode metadata */ 3928 /* Precompute checksum seed for inode metadata */
3939 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 3929 if (ext4_has_metadata_csum(sb)) {
3940 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
3941 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3930 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
3942 __u32 csum; 3931 __u32 csum;
3943 __le32 inum = cpu_to_le32(inode->i_ino); 3932 __le32 inum = cpu_to_le32(inode->i_ino);
@@ -4127,6 +4116,13 @@ bad_inode:
4127 return ERR_PTR(ret); 4116 return ERR_PTR(ret);
4128} 4117}
4129 4118
4119struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
4120{
4121 if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
4122 return ERR_PTR(-EIO);
4123 return ext4_iget(sb, ino);
4124}
4125
4130static int ext4_inode_blocks_set(handle_t *handle, 4126static int ext4_inode_blocks_set(handle_t *handle,
4131 struct ext4_inode *raw_inode, 4127 struct ext4_inode *raw_inode,
4132 struct ext4_inode_info *ei) 4128 struct ext4_inode_info *ei)
@@ -4226,7 +4222,8 @@ static int ext4_do_update_inode(handle_t *handle,
4226 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); 4222 EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
4227 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); 4223 EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
4228 4224
4229 if (ext4_inode_blocks_set(handle, raw_inode, ei)) { 4225 err = ext4_inode_blocks_set(handle, raw_inode, ei);
4226 if (err) {
4230 spin_unlock(&ei->i_raw_lock); 4227 spin_unlock(&ei->i_raw_lock);
4231 goto out_brelse; 4228 goto out_brelse;
4232 } 4229 }
@@ -4536,8 +4533,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4536 ext4_orphan_del(NULL, inode); 4533 ext4_orphan_del(NULL, inode);
4537 goto err_out; 4534 goto err_out;
4538 } 4535 }
4539 } else 4536 } else {
4537 loff_t oldsize = inode->i_size;
4538
4540 i_size_write(inode, attr->ia_size); 4539 i_size_write(inode, attr->ia_size);
4540 pagecache_isize_extended(inode, oldsize, inode->i_size);
4541 }
4541 4542
4542 /* 4543 /*
4543 * Blocks are going to be removed from the inode. Wait 4544 * Blocks are going to be removed from the inode. Wait
@@ -4958,7 +4959,12 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4958 if (val) 4959 if (val)
4959 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 4960 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
4960 else { 4961 else {
4961 jbd2_journal_flush(journal); 4962 err = jbd2_journal_flush(journal);
4963 if (err < 0) {
4964 jbd2_journal_unlock_updates(journal);
4965 ext4_inode_resume_unlocked_dio(inode);
4966 return err;
4967 }
4962 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 4968 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
4963 } 4969 }
4964 ext4_set_aops(inode); 4970 ext4_set_aops(inode);
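
The largest interface change in inode.c is the calling convention of ext4_getblk() and ext4_bread(): the int *errp out-parameter is gone and errors travel in the returned pointer, so callers distinguish IS_ERR(bh) (failure), NULL (a hole when create == 0), and a valid buffer_head. A userspace re-creation of the pointer-encoded-error idiom (the real ERR_PTR()/IS_ERR()/PTR_ERR() live in <linux/err.h>; getblk() below is a toy):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

/* Userspace re-creation of the <linux/err.h> idiom: errno values are
 * encoded as pointers into the last page of the address space. */
static inline void *ERR_PTR(long err) { return (void *)err; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

/* Toy getblk() with the new ext4_getblk() contract:
 *   ERR_PTR(-E...)  hard failure
 *   NULL            a hole, only possible when create == 0
 *   anything else   a valid buffer
 */
static void *getblk(int block, int create)
{
	if (block < 0)
		return ERR_PTR(-EIO);
	if (block == 0)
		return create ? ERR_PTR(-ENOSPC) : NULL;
	return malloc(16);
}

int main(void)
{
	void *bh = getblk(-1, 0);

	if (IS_ERR(bh))
		printf("error: %ld\n", PTR_ERR(bh));
	else if (!bh)
		printf("hole\n");
	else
		free(bh);
	return 0;
}

Encoding errno values in the return pointer keeps the error and success cases in one value, which is why the new ext4_bread() can simply pass an IS_ERR(bh) result straight through.
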
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 0f2252ec274d..bfda18a15592 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -331,8 +331,7 @@ flags_out:
331 if (!inode_owner_or_capable(inode)) 331 if (!inode_owner_or_capable(inode))
332 return -EPERM; 332 return -EPERM;
333 333
334 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 334 if (ext4_has_metadata_csum(inode->i_sb)) {
335 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
336 ext4_warning(sb, "Setting inode version is not " 335 ext4_warning(sb, "Setting inode version is not "
337 "supported with metadata_csum enabled."); 336 "supported with metadata_csum enabled.");
338 return -ENOTTY; 337 return -ENOTTY;
@@ -532,9 +531,17 @@ group_add_out:
532 } 531 }
533 532
534 case EXT4_IOC_SWAP_BOOT: 533 case EXT4_IOC_SWAP_BOOT:
534 {
535 int err;
535 if (!(filp->f_mode & FMODE_WRITE)) 536 if (!(filp->f_mode & FMODE_WRITE))
536 return -EBADF; 537 return -EBADF;
537 return swap_inode_boot_loader(sb, inode); 538 err = mnt_want_write_file(filp);
539 if (err)
540 return err;
541 err = swap_inode_boot_loader(sb, inode);
542 mnt_drop_write_file(filp);
543 return err;
544 }
538 545
539 case EXT4_IOC_RESIZE_FS: { 546 case EXT4_IOC_RESIZE_FS: {
540 ext4_fsblk_t n_blocks_count; 547 ext4_fsblk_t n_blocks_count;
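
EXT4_IOC_SWAP_BOOT previously called swap_inode_boot_loader() without taking a write reference on the mount; the fix brackets the operation with mnt_want_write_file()/mnt_drop_write_file() like the other modifying ioctls, dropping the reference on success and failure alike. The acquire/operate/release shape, with a hypothetical counter standing in for the mount write count:

#include <stdio.h>

/* Stand-ins for mnt_want_write_file()/mnt_drop_write_file(): take a
 * write reference before the operation, drop it on every exit path. */
static int writers;
static int want_write(void)  { writers++; return 0; }
static void drop_write(void) { writers--; }

static int swap_boot(void) { return 0; }	/* pretend it worked */

static int ioctl_swap_boot(void)
{
	int err = want_write();

	if (err)
		return err;
	err = swap_boot();
	drop_write();		/* paired on success and failure alike */
	return err;
}

int main(void)
{
	printf("ioctl: %d (writers now %d)\n", ioctl_swap_boot(), writers);
	return 0;
}
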
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 748c9136a60a..dbfe15c2533c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3155,9 +3155,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3155 "start %lu, size %lu, fe_logical %lu", 3155 "start %lu, size %lu, fe_logical %lu",
3156 (unsigned long) start, (unsigned long) size, 3156 (unsigned long) start, (unsigned long) size,
3157 (unsigned long) ac->ac_o_ex.fe_logical); 3157 (unsigned long) ac->ac_o_ex.fe_logical);
3158 BUG();
3158 } 3159 }
3159 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3160 start > ac->ac_o_ex.fe_logical);
3161 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 3160 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3162 3161
3163 /* now prepare goal request */ 3162 /* now prepare goal request */
@@ -4410,14 +4409,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4410 if (IS_NOQUOTA(ar->inode)) 4409 if (IS_NOQUOTA(ar->inode))
4411 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; 4410 ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
4412 4411
4413 /* 4412 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
4414 * For delayed allocation, we could skip the ENOSPC and
4415 * EDQUOT check, as blocks and quotas have been already
4416 * reserved when data being copied into pagecache.
4417 */
4418 if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
4419 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4420 else {
4421 /* Without delayed allocation we need to verify 4413 /* Without delayed allocation we need to verify
4422 * there is enough free blocks to do block allocation 4414 * there is enough free blocks to do block allocation
4423 * and verify allocation doesn't exceed the quota limits. 4415 * and verify allocation doesn't exceed the quota limits.
@@ -4528,8 +4520,7 @@ out:
4528 if (inquota && ar->len < inquota) 4520 if (inquota && ar->len < inquota)
4529 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); 4521 dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
4530 if (!ar->len) { 4522 if (!ar->len) {
4531 if (!ext4_test_inode_state(ar->inode, 4523 if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
4532 EXT4_STATE_DELALLOC_RESERVED))
4533 /* release all the reserved blocks if non delalloc */ 4524 /* release all the reserved blocks if non delalloc */
4534 percpu_counter_sub(&sbi->s_dirtyclusters_counter, 4525 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
4535 reserv_clstrs); 4526 reserv_clstrs);
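
This pair of hunks completes a theme of the series: instead of ext4_map_blocks() setting and clearing the hidden EXT4_STATE_DELALLOC_RESERVED inode flag for mballoc to sniff, the caller now says so explicitly with EXT4_MB_DELALLOC_RESERVED in ar->flags (see the matching hunks in inode.c and indirect.c above). Explicit request flags instead of ambient per-inode state, in miniature (names are illustrative):

#include <stdio.h>

#define MB_DELALLOC_RESERVED	(1u << 0)	/* hypothetical flag bit */

struct request { unsigned int flags; int len; };

/* The allocator no longer consults per-inode state; the caller's
 * flags say whether space and quota were already reserved at the
 * time the data was copied into the page cache. */
static int new_blocks(struct request *ar)
{
	if (!(ar->flags & MB_DELALLOC_RESERVED)) {
		/* verify free space and quota here */
	}
	return ar->len;
}

int main(void)
{
	struct request ar = { .flags = MB_DELALLOC_RESERVED, .len = 4 };

	printf("allocated %d blocks\n", new_blocks(&ar));
	return 0;
}
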
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index d3567f27bae7..a432634f2e6a 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -41,8 +41,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
41 ext4_ext_store_pblock(&newext, lb->first_pblock); 41 ext4_ext_store_pblock(&newext, lb->first_pblock);
42 /* Locking only for convenience since we are operating on temp inode */ 42 /* Locking only for convenience since we are operating on temp inode */
43 down_write(&EXT4_I(inode)->i_data_sem); 43 down_write(&EXT4_I(inode)->i_data_sem);
44 path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); 44 path = ext4_find_extent(inode, lb->first_block, NULL, 0);
45
46 if (IS_ERR(path)) { 45 if (IS_ERR(path)) {
47 retval = PTR_ERR(path); 46 retval = PTR_ERR(path);
48 path = NULL; 47 path = NULL;
@@ -81,13 +80,11 @@ static int finish_range(handle_t *handle, struct inode *inode,
81 goto err_out; 80 goto err_out;
82 } 81 }
83 } 82 }
84 retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); 83 retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0);
85err_out: 84err_out:
86 up_write((&EXT4_I(inode)->i_data_sem)); 85 up_write((&EXT4_I(inode)->i_data_sem));
87 if (path) { 86 ext4_ext_drop_refs(path);
88 ext4_ext_drop_refs(path); 87 kfree(path);
89 kfree(path);
90 }
91 lb->first_pblock = 0; 88 lb->first_pblock = 0;
92 return retval; 89 return retval;
93} 90}
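
The cleanup in finish_range() relies on both helpers tolerating NULL: kfree(NULL) is a no-op, and with this series ext4_ext_drop_refs() accepts a NULL path as well, so the surrounding if (path) guard can go. The same simplification in plain C:

#include <stdio.h>
#include <stdlib.h>

/* If the cleanup helpers tolerate NULL (as free() and kfree() do),
 * the caller's "if (path)" guard is dead weight and can go. */
static void drop_refs(int *path)
{
	if (!path)		/* NULL-tolerant, like ext4_ext_drop_refs */
		return;
	/* release per-level references here */
}

int main(void)
{
	int *path = NULL;	/* lookup failed; path never allocated */

	drop_refs(path);	/* safe without a guard */
	free(path);		/* free(NULL) is defined to do nothing */
	printf("cleanup ok\n");
	return 0;
}
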
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 32bce844c2e1..8313ca3324ec 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -20,8 +20,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
20 20
21static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) 21static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
22{ 22{
23 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 23 if (!ext4_has_metadata_csum(sb))
24 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
25 return 1; 24 return 1;
26 25
27 return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); 26 return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
@@ -29,8 +28,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
29 28
30static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) 29static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
31{ 30{
32 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 31 if (!ext4_has_metadata_csum(sb))
33 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
34 return; 32 return;
35 33
36 mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); 34 mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 671a74b14fd7..9f2311bc9c4f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -27,120 +27,26 @@
27 * @lblock: logical block number to find an extent path 27 * @lblock: logical block number to find an extent path
28 * @path: pointer to an extent path pointer (for output) 28 * @path: pointer to an extent path pointer (for output)
29 * 29 *
30 * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value 30 * ext4_find_extent wrapper. Return 0 on success, or a negative error value
31 * on failure. 31 * on failure.
32 */ 32 */
33static inline int 33static inline int
34get_ext_path(struct inode *inode, ext4_lblk_t lblock, 34get_ext_path(struct inode *inode, ext4_lblk_t lblock,
35 struct ext4_ext_path **orig_path) 35 struct ext4_ext_path **ppath)
36{ 36{
37 int ret = 0;
38 struct ext4_ext_path *path; 37 struct ext4_ext_path *path;
39 38
40 path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE); 39 path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE);
41 if (IS_ERR(path)) 40 if (IS_ERR(path))
42 ret = PTR_ERR(path); 41 return PTR_ERR(path);
43 else if (path[ext_depth(inode)].p_ext == NULL) 42 if (path[ext_depth(inode)].p_ext == NULL) {
44 ret = -ENODATA; 43 ext4_ext_drop_refs(path);
45 else 44 kfree(path);
46 *orig_path = path; 45 *ppath = NULL;
47 46 return -ENODATA;
48 return ret;
49}
50
51/**
52 * copy_extent_status - Copy the extent's initialization status
53 *
54 * @src: an extent for getting initialize status
55 * @dest: an extent to be set the status
56 */
57static void
58copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
59{
60 if (ext4_ext_is_unwritten(src))
61 ext4_ext_mark_unwritten(dest);
62 else
63 dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
64}
65
66/**
67 * mext_next_extent - Search for the next extent and set it to "extent"
68 *
69 * @inode: inode which is searched
70 * @path: this will obtain data for the next extent
71 * @extent: pointer to the next extent we have just gotten
72 *
73 * Search the next extent in the array of ext4_ext_path structure (@path)
74 * and set it to ext4_extent structure (@extent). In addition, the member of
75 * @path (->p_ext) also points to the next extent. Return 0 on success, 1 if
76 * ext4_ext_path structure refers to the last extent, or a negative error
77 * value on failure.
78 */
79int
80mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
81 struct ext4_extent **extent)
82{
83 struct ext4_extent_header *eh;
84 int ppos, leaf_ppos = path->p_depth;
85
86 ppos = leaf_ppos;
87 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
88 /* leaf block */
89 *extent = ++path[ppos].p_ext;
90 path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
91 return 0;
92 }
93
94 while (--ppos >= 0) {
95 if (EXT_LAST_INDEX(path[ppos].p_hdr) >
96 path[ppos].p_idx) {
97 int cur_ppos = ppos;
98
99 /* index block */
100 path[ppos].p_idx++;
101 path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
102 if (path[ppos+1].p_bh)
103 brelse(path[ppos+1].p_bh);
104 path[ppos+1].p_bh =
105 sb_bread(inode->i_sb, path[ppos].p_block);
106 if (!path[ppos+1].p_bh)
107 return -EIO;
108 path[ppos+1].p_hdr =
109 ext_block_hdr(path[ppos+1].p_bh);
110
111 /* Halfway index block */
112 while (++cur_ppos < leaf_ppos) {
113 path[cur_ppos].p_idx =
114 EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
115 path[cur_ppos].p_block =
116 ext4_idx_pblock(path[cur_ppos].p_idx);
117 if (path[cur_ppos+1].p_bh)
118 brelse(path[cur_ppos+1].p_bh);
119 path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
120 path[cur_ppos].p_block);
121 if (!path[cur_ppos+1].p_bh)
122 return -EIO;
123 path[cur_ppos+1].p_hdr =
124 ext_block_hdr(path[cur_ppos+1].p_bh);
125 }
126
127 path[leaf_ppos].p_ext = *extent = NULL;
128
129 eh = path[leaf_ppos].p_hdr;
130 if (le16_to_cpu(eh->eh_entries) == 0)
131 /* empty leaf is found */
132 return -ENODATA;
133
134 /* leaf block */
135 path[leaf_ppos].p_ext = *extent =
136 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
137 path[leaf_ppos].p_block =
138 ext4_ext_pblock(path[leaf_ppos].p_ext);
139 return 0;
140 }
141 } 47 }
142 /* We found the last extent */ 48 *ppath = path;
143 return 1; 49 return 0;
144} 50}
145 51
146/** 52/**
@@ -178,417 +84,6 @@ ext4_double_up_write_data_sem(struct inode *orig_inode,
178} 84}
179 85
180/** 86/**
181 * mext_insert_across_blocks - Insert extents across leaf block
182 *
183 * @handle: journal handle
184 * @orig_inode: original inode
185 * @o_start: first original extent to be changed
186 * @o_end: last original extent to be changed
187 * @start_ext: first new extent to be inserted
188 * @new_ext: middle of new extent to be inserted
189 * @end_ext: last new extent to be inserted
190 *
191 * Allocate a new leaf block and insert extents into it. Return 0 on success,
192 * or a negative error value on failure.
193 */
194static int
195mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
196 struct ext4_extent *o_start, struct ext4_extent *o_end,
197 struct ext4_extent *start_ext, struct ext4_extent *new_ext,
198 struct ext4_extent *end_ext)
199{
200 struct ext4_ext_path *orig_path = NULL;
201 ext4_lblk_t eblock = 0;
202 int new_flag = 0;
203 int end_flag = 0;
204 int err = 0;
205
206 if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
207 if (o_start == o_end) {
208
209 /* start_ext new_ext end_ext
210 * donor |---------|-----------|--------|
211 * orig |------------------------------|
212 */
213 end_flag = 1;
214 } else {
215
216 /* start_ext new_ext end_ext
217 * donor |---------|----------|---------|
218 * orig |---------------|--------------|
219 */
220 o_end->ee_block = end_ext->ee_block;
221 o_end->ee_len = end_ext->ee_len;
222 ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
223 }
224
225 o_start->ee_len = start_ext->ee_len;
226 eblock = le32_to_cpu(start_ext->ee_block);
227 new_flag = 1;
228
229 } else if (start_ext->ee_len && new_ext->ee_len &&
230 !end_ext->ee_len && o_start == o_end) {
231
232 /* start_ext new_ext
233 * donor |--------------|---------------|
234 * orig |------------------------------|
235 */
236 o_start->ee_len = start_ext->ee_len;
237 eblock = le32_to_cpu(start_ext->ee_block);
238 new_flag = 1;
239
240 } else if (!start_ext->ee_len && new_ext->ee_len &&
241 end_ext->ee_len && o_start == o_end) {
242
243 /* new_ext end_ext
244 * donor |--------------|---------------|
245 * orig |------------------------------|
246 */
247 o_end->ee_block = end_ext->ee_block;
248 o_end->ee_len = end_ext->ee_len;
249 ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
250
251 /*
252 * Leave eblock at 0 if new_ext starts at
253 * block 0 (i.e. it is the first block).
254 */
255 if (new_ext->ee_block)
256 eblock = le32_to_cpu(new_ext->ee_block);
257
258 new_flag = 1;
259 } else {
260 ext4_debug("ext4 move extent: Unexpected insert case\n");
261 return -EIO;
262 }
263
264 if (new_flag) {
265 err = get_ext_path(orig_inode, eblock, &orig_path);
266 if (err)
267 goto out;
268
269 if (ext4_ext_insert_extent(handle, orig_inode,
270 orig_path, new_ext, 0))
271 goto out;
272 }
273
274 if (end_flag) {
275 err = get_ext_path(orig_inode,
276 le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
277 if (err)
278 goto out;
279
280 if (ext4_ext_insert_extent(handle, orig_inode,
281 orig_path, end_ext, 0))
282 goto out;
283 }
284out:
285 if (orig_path) {
286 ext4_ext_drop_refs(orig_path);
287 kfree(orig_path);
288 }
289
290 return err;
291
292}
293
294/**
295 * mext_insert_inside_block - Insert new extent to the extent block
296 *
297 * @o_start: first original extent to be moved
298 * @o_end: last original extent to be moved
299 * @start_ext: first new extent to be inserted
300 * @new_ext: middle of new extent to be inserted
301 * @end_ext: last new extent to be inserted
302 * @eh: extent header of target leaf block
303 * @range_to_move: used to decide how to insert extent
304 *
305 * Insert extents into the leaf block. The extent (@o_start) is overwritten
306 * by inserted extents.
307 */
308static void
309mext_insert_inside_block(struct ext4_extent *o_start,
310 struct ext4_extent *o_end,
311 struct ext4_extent *start_ext,
312 struct ext4_extent *new_ext,
313 struct ext4_extent *end_ext,
314 struct ext4_extent_header *eh,
315 int range_to_move)
316{
317 int i = 0;
318 unsigned long len;
319
320 /* Move the existing extents */
321 if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
322 len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
323 (unsigned long)(o_end + 1);
324 memmove(o_end + 1 + range_to_move, o_end + 1, len);
325 }
326
327 /* Insert start entry */
328 if (start_ext->ee_len)
329 o_start[i++].ee_len = start_ext->ee_len;
330
331 /* Insert new entry */
332 if (new_ext->ee_len) {
333 o_start[i] = *new_ext;
334 ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
335 }
336
337 /* Insert end entry */
338 if (end_ext->ee_len)
339 o_start[i] = *end_ext;
340
341 /* Increment the total entries counter on the extent block */
342 le16_add_cpu(&eh->eh_entries, range_to_move);
343}
344
345/**
346 * mext_insert_extents - Insert new extent
347 *
348 * @handle: journal handle
349 * @orig_inode: original inode
350 * @orig_path: path indicates first extent to be changed
351 * @o_start: first original extent to be changed
352 * @o_end: last original extent to be changed
353 * @start_ext: first new extent to be inserted
354 * @new_ext: middle of new extent to be inserted
355 * @end_ext: last new extent to be inserted
356 *
357 * Call the function to insert extents. If we cannot add more extents into
358 * the leaf block, we call mext_insert_across_blocks() to create a
359 * new leaf block. Otherwise call mext_insert_inside_block(). Return 0
360 * on success, or a negative error value on failure.
361 */
362static int
363mext_insert_extents(handle_t *handle, struct inode *orig_inode,
364 struct ext4_ext_path *orig_path,
365 struct ext4_extent *o_start,
366 struct ext4_extent *o_end,
367 struct ext4_extent *start_ext,
368 struct ext4_extent *new_ext,
369 struct ext4_extent *end_ext)
370{
371 struct ext4_extent_header *eh;
372 unsigned long need_slots, slots_range;
373 int range_to_move, depth, ret;
374
375 /*
376 * The extents need to be inserted
377 * start_extent + new_extent + end_extent.
378 */
379 need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) +
380 (new_ext->ee_len ? 1 : 0);
381
382 /* The number of slots between start and end */
383 slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
384 / sizeof(struct ext4_extent);
385
386 /* Range to move the end of extent */
387 range_to_move = need_slots - slots_range;
388 depth = orig_path->p_depth;
389 orig_path += depth;
390 eh = orig_path->p_hdr;
391
392 if (depth) {
393 /* Register to journal */
394 BUFFER_TRACE(orig_path->p_bh, "get_write_access");
395 ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
396 if (ret)
397 return ret;
398 }
399
400 /* Expansion */
401 if (range_to_move > 0 &&
402 (range_to_move > le16_to_cpu(eh->eh_max)
403 - le16_to_cpu(eh->eh_entries))) {
404
405 ret = mext_insert_across_blocks(handle, orig_inode, o_start,
406 o_end, start_ext, new_ext, end_ext);
407 if (ret < 0)
408 return ret;
409 } else
410 mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
411 end_ext, eh, range_to_move);
412
413 return ext4_ext_dirty(handle, orig_inode, orig_path);
414}
415
416/**
417 * mext_leaf_block - Move one leaf extent block into the inode.
418 *
419 * @handle: journal handle
420 * @orig_inode: original inode
421 * @orig_path: path indicates first extent to be changed
422 * @dext: donor extent
423 * @from: start offset on the target file
424 *
425 * In order to insert extents into the leaf block, we must divide the extent
426 * in the leaf block into three extents. One covers the range where the new
427 * extents are inserted, and the others lie around it.
428 *
429 * Therefore, this function creates structures to save extents of the leaf
430 * block, and inserts extents by calling mext_insert_extents() with
431 * created extents. Return 0 on success, or a negative error value on failure.
432 */
433static int
434mext_leaf_block(handle_t *handle, struct inode *orig_inode,
435 struct ext4_ext_path *orig_path, struct ext4_extent *dext,
436 ext4_lblk_t *from)
437{
438 struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
439 struct ext4_extent new_ext, start_ext, end_ext;
440 ext4_lblk_t new_ext_end;
441 int oext_alen, new_ext_alen, end_ext_alen;
442 int depth = ext_depth(orig_inode);
443 int ret;
444
445 start_ext.ee_block = end_ext.ee_block = 0;
446 o_start = o_end = oext = orig_path[depth].p_ext;
447 oext_alen = ext4_ext_get_actual_len(oext);
448 start_ext.ee_len = end_ext.ee_len = 0;
449
450 new_ext.ee_block = cpu_to_le32(*from);
451 ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
452 new_ext.ee_len = dext->ee_len;
453 new_ext_alen = ext4_ext_get_actual_len(&new_ext);
454 new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
455
456 /*
457 * Case: original extent is first
458 * oext |--------|
459 * new_ext |--|
460 * start_ext |--|
461 */
462 if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
463 le32_to_cpu(new_ext.ee_block) <
464 le32_to_cpu(oext->ee_block) + oext_alen) {
465 start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
466 le32_to_cpu(oext->ee_block));
467 start_ext.ee_block = oext->ee_block;
468 copy_extent_status(oext, &start_ext);
469 } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
470 prev_ext = oext - 1;
471 /*
472 * We can merge new_ext into previous extent,
473 * if these are contiguous and same extent type.
474 */
475 if (ext4_can_extents_be_merged(orig_inode, prev_ext,
476 &new_ext)) {
477 o_start = prev_ext;
478 start_ext.ee_len = cpu_to_le16(
479 ext4_ext_get_actual_len(prev_ext) +
480 new_ext_alen);
481 start_ext.ee_block = oext->ee_block;
482 copy_extent_status(prev_ext, &start_ext);
483 new_ext.ee_len = 0;
484 }
485 }
486
487 /*
488 * Case: new_ext_end must be less than oext
489 * oext |-----------|
490 * new_ext |-------|
491 */
492 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
493 EXT4_ERROR_INODE(orig_inode,
494 "new_ext_end(%u) should be less than or equal to "
495 "oext->ee_block(%u) + oext_alen(%d) - 1",
496 new_ext_end, le32_to_cpu(oext->ee_block),
497 oext_alen);
498 ret = -EIO;
499 goto out;
500 }
501
502 /*
503 * Case: new_ext is smaller than original extent
504 * oext |---------------|
505 * new_ext |-----------|
506 * end_ext |---|
507 */
508 if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
509 new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
510 end_ext.ee_len =
511 cpu_to_le16(le32_to_cpu(oext->ee_block) +
512 oext_alen - 1 - new_ext_end);
513 copy_extent_status(oext, &end_ext);
514 end_ext_alen = ext4_ext_get_actual_len(&end_ext);
515 ext4_ext_store_pblock(&end_ext,
516 (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
517 end_ext.ee_block =
518 cpu_to_le32(le32_to_cpu(o_end->ee_block) +
519 oext_alen - end_ext_alen);
520 }
521
522 ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
523 o_end, &start_ext, &new_ext, &end_ext);
524out:
525 return ret;
526}
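
Concretely, when the donor range lands strictly inside one original extent,
the original is carved into a start piece, the inserted piece, and an end
piece whose lengths follow directly from the block numbers. A sketch of that
arithmetic with hypothetical values (plain unsigned ints, not the on-disk
little-endian fields):

    #include <stdio.h>

    int main(void)
    {
        unsigned oext_block = 100, oext_len = 50;   /* original covers [100, 149] */
        unsigned new_block  = 120, new_len  = 10;   /* donor maps     [120, 129] */
        unsigned new_end = new_block + new_len - 1;

        unsigned start_len = new_block - oext_block;            /* [100, 119] -> 20 */
        unsigned end_len = oext_block + oext_len - 1 - new_end; /* [130, 149] -> 20 */

        printf("start_ext %u, new_ext %u, end_ext %u blocks\n",
               start_len, new_len, end_len);
        return 0;
    }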
527
528/**
529 * mext_calc_swap_extents - Calculate extents for extent swapping.
530 *
531 * @tmp_dext: the extent that will belong to the original inode
532 * @tmp_oext: the extent that will belong to the donor inode
533 * @orig_off: block offset of original inode
534 * @donor_off: block offset of donor inode
535 * @max_count: the maximum length of extents
536 *
537 * Return 0 on success, or a negative error value on failure.
538 */
539static int
540mext_calc_swap_extents(struct ext4_extent *tmp_dext,
541 struct ext4_extent *tmp_oext,
542 ext4_lblk_t orig_off, ext4_lblk_t donor_off,
543 ext4_lblk_t max_count)
544{
545 ext4_lblk_t diff, orig_diff;
546 struct ext4_extent dext_old, oext_old;
547
548 BUG_ON(orig_off != donor_off);
549
550 /* original and donor extents have to cover the same block offset */
551 if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
552 le32_to_cpu(tmp_oext->ee_block) +
553 ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
554 return -ENODATA;
555
556 if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
557 le32_to_cpu(tmp_dext->ee_block) +
558 ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
559 return -ENODATA;
560
561 dext_old = *tmp_dext;
562 oext_old = *tmp_oext;
563
564 /* When tmp_dext is too large, pick up the target range. */
565 diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
566
567 ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
568 le32_add_cpu(&tmp_dext->ee_block, diff);
569 le16_add_cpu(&tmp_dext->ee_len, -diff);
570
571 if (max_count < ext4_ext_get_actual_len(tmp_dext))
572 tmp_dext->ee_len = cpu_to_le16(max_count);
573
574 orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
575 ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
576
577 /* Adjust extent length if donor extent is larger than orig */
578 if (ext4_ext_get_actual_len(tmp_dext) >
579 ext4_ext_get_actual_len(tmp_oext) - orig_diff)
580 tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
581 orig_diff);
582
583 tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
584
585 copy_extent_status(&oext_old, tmp_dext);
586 copy_extent_status(&dext_old, tmp_oext);
587
588 return 0;
589}
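
In other words, mext_calc_swap_extents() trims both extents until they
describe the same logical range: each is advanced to start exactly at the
requested offset, and the donor's length is capped by max_count and by what
remains of the original. A userspace model with hypothetical values:

    #include <stdio.h>

    int main(void)
    {
        unsigned donor_off = 35, dext_block = 30, dext_len = 20; /* donor [30, 49] */
        unsigned orig_off  = 35, oext_block = 32, oext_len = 10; /* orig  [32, 41] */
        unsigned max_count = 100;

        /* Advance the donor extent so it starts exactly at donor_off. */
        unsigned diff = donor_off - dext_block;
        dext_block += diff;
        dext_len -= diff;                      /* now [35, 49], len 15 */
        if (dext_len > max_count)
            dext_len = max_count;

        /* Advance the original likewise, then cap the donor by what is left. */
        unsigned orig_diff = orig_off - oext_block;          /* 3 */
        if (dext_len > oext_len - orig_diff)
            dext_len = oext_len - orig_diff;                 /* 7 */
        oext_len = dext_len;

        printf("swap %u blocks at offset %u (both extents now %u long)\n",
               dext_len, orig_off, oext_len);
        return 0;
    }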
590
591/**
592 * mext_check_coverage - Check that all extents in range have the same type 87 * mext_check_coverage - Check that all extents in range have the same type
593 * 88 *
594 * @inode: inode in question 89 * @inode: inode in question
@@ -619,171 +114,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
619 } 114 }
620 ret = 1; 115 ret = 1;
621out: 116out:
622 if (path) { 117 ext4_ext_drop_refs(path);
623 ext4_ext_drop_refs(path); 118 kfree(path);
624 kfree(path);
625 }
626 return ret; 119 return ret;
627} 120}
628 121
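
The shorter cleanup path works because the helpers tolerate NULL:
kfree(NULL) is a no-op by contract, and ext4_ext_drop_refs() presumably
gained its own NULL check elsewhere in this series, so the if (path) guard
becomes redundant. A userspace sketch of the idiom (toy types):

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_path { void *bh; };

    static void drop_refs(struct toy_path *p)
    {
        if (!p)         /* tolerate NULL so callers need no guard */
            return;
        /* ...release per-level buffers here... */
    }

    int main(void)
    {
        struct toy_path *path = NULL;   /* e.g. the lookup failed early */
        drop_refs(path);                /* safe even on the error path */
        free(path);                     /* free(NULL), like kfree(NULL), is a no-op */
        puts("cleanup without a NULL check at the call site");
        return 0;
    }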
629/** 122/**
630 * mext_replace_branches - Replace original extents with new extents
631 *
632 * @handle: journal handle
633 * @orig_inode: original inode
634 * @donor_inode: donor inode
635 * @from: block offset of orig_inode
636 * @count: block count to be replaced
637 * @err: pointer to save return value
638 *
639 * Replace original inode extents and donor inode extents page by page.
640 * We implement this replacement in the following three steps:
641 * 1. Save the block information of original and donor inodes into
642 * dummy extents.
643 * 2. Change the block information of original inode to point at the
644 * donor inode blocks.
645 * 3. Change the block information of donor inode to point at the saved
646 * original inode blocks in the dummy extents.
647 *
648 * Return replaced block count.
649 */
650static int
651mext_replace_branches(handle_t *handle, struct inode *orig_inode,
652 struct inode *donor_inode, ext4_lblk_t from,
653 ext4_lblk_t count, int *err)
654{
655 struct ext4_ext_path *orig_path = NULL;
656 struct ext4_ext_path *donor_path = NULL;
657 struct ext4_extent *oext, *dext;
658 struct ext4_extent tmp_dext, tmp_oext;
659 ext4_lblk_t orig_off = from, donor_off = from;
660 int depth;
661 int replaced_count = 0;
662 int dext_alen;
663
664 *err = ext4_es_remove_extent(orig_inode, from, count);
665 if (*err)
666 goto out;
667
668 *err = ext4_es_remove_extent(donor_inode, from, count);
669 if (*err)
670 goto out;
671
672 /* Get the original extent for the block "orig_off" */
673 *err = get_ext_path(orig_inode, orig_off, &orig_path);
674 if (*err)
675 goto out;
676
677 /* Get the donor extent for the head */
678 *err = get_ext_path(donor_inode, donor_off, &donor_path);
679 if (*err)
680 goto out;
681 depth = ext_depth(orig_inode);
682 oext = orig_path[depth].p_ext;
683 tmp_oext = *oext;
684
685 depth = ext_depth(donor_inode);
686 dext = donor_path[depth].p_ext;
687 if (unlikely(!dext))
688 goto missing_donor_extent;
689 tmp_dext = *dext;
690
691 *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
692 donor_off, count);
693 if (*err)
694 goto out;
695
696 /* Loop for the donor extents */
697 while (1) {
698 /* The extent for donor must be found. */
699 if (unlikely(!dext)) {
700 missing_donor_extent:
701 EXT4_ERROR_INODE(donor_inode,
702 "The extent for donor must be found");
703 *err = -EIO;
704 goto out;
705 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
706 EXT4_ERROR_INODE(donor_inode,
707 "Donor offset(%u) and the first block of donor "
708 "extent(%u) should be equal",
709 donor_off,
710 le32_to_cpu(tmp_dext.ee_block));
711 *err = -EIO;
712 goto out;
713 }
714
715 /* Set donor extent to orig extent */
716 *err = mext_leaf_block(handle, orig_inode,
717 orig_path, &tmp_dext, &orig_off);
718 if (*err)
719 goto out;
720
721 /* Set orig extent to donor extent */
722 *err = mext_leaf_block(handle, donor_inode,
723 donor_path, &tmp_oext, &donor_off);
724 if (*err)
725 goto out;
726
727 dext_alen = ext4_ext_get_actual_len(&tmp_dext);
728 replaced_count += dext_alen;
729 donor_off += dext_alen;
730 orig_off += dext_alen;
731
732 BUG_ON(replaced_count > count);
733 /* Already moved the expected blocks */
734 if (replaced_count >= count)
735 break;
736
737 if (orig_path)
738 ext4_ext_drop_refs(orig_path);
739 *err = get_ext_path(orig_inode, orig_off, &orig_path);
740 if (*err)
741 goto out;
742 depth = ext_depth(orig_inode);
743 oext = orig_path[depth].p_ext;
744 tmp_oext = *oext;
745
746 if (donor_path)
747 ext4_ext_drop_refs(donor_path);
748 *err = get_ext_path(donor_inode, donor_off, &donor_path);
749 if (*err)
750 goto out;
751 depth = ext_depth(donor_inode);
752 dext = donor_path[depth].p_ext;
753 tmp_dext = *dext;
754
755 *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
756 donor_off, count - replaced_count);
757 if (*err)
758 goto out;
759 }
760
761out:
762 if (orig_path) {
763 ext4_ext_drop_refs(orig_path);
764 kfree(orig_path);
765 }
766 if (donor_path) {
767 ext4_ext_drop_refs(donor_path);
768 kfree(donor_path);
769 }
770
771 return replaced_count;
772}
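
The comment above describes the replacement scheme of this (now removed)
function: both block mappings are first saved into dummy extents, then
cross-assigned. A toy userspace model of the three steps (hypothetical
block numbers, simplified extent type):

    #include <stdio.h>

    struct toy_extent { unsigned ee_block, ee_start, ee_len; };

    int main(void)
    {
        struct toy_extent orig  = { .ee_block = 0, .ee_start = 1000, .ee_len = 8 };
        struct toy_extent donor = { .ee_block = 0, .ee_start = 2000, .ee_len = 8 };

        /* 1. Save both block mappings in dummy extents. */
        struct toy_extent tmp_oext = orig, tmp_dext = donor;
        /* 2. Point the original file at the donor's blocks. */
        orig.ee_start = tmp_dext.ee_start;
        /* 3. Point the donor file at the saved original blocks. */
        donor.ee_start = tmp_oext.ee_start;

        printf("orig now at block %u, donor now at block %u\n",
               orig.ee_start, donor.ee_start);
        return 0;
    }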
773
774/**
775 * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 123 * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
776 * 124 *
777 * @inode1: the inode structure 125 * @inode1: the inode structure
778 * @inode2: the inode structure 126 * @inode2: the inode structure
779 * @index: page index 127 * @index1: page index
128 * @index2: page index
780 * @page: result page vector 129 * @page: result page vector
781 * 130 *
782 * Grab two locked pages for the inodes, in inode order 131 * Grab two locked pages for the inodes, in inode order
783 */ 132 */
784static int 133static int
785mext_page_double_lock(struct inode *inode1, struct inode *inode2, 134mext_page_double_lock(struct inode *inode1, struct inode *inode2,
786 pgoff_t index, struct page *page[2]) 135 pgoff_t index1, pgoff_t index2, struct page *page[2])
787{ 136{
788 struct address_space *mapping[2]; 137 struct address_space *mapping[2];
789 unsigned fl = AOP_FLAG_NOFS; 138 unsigned fl = AOP_FLAG_NOFS;
@@ -793,15 +142,18 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
793 mapping[0] = inode1->i_mapping; 142 mapping[0] = inode1->i_mapping;
794 mapping[1] = inode2->i_mapping; 143 mapping[1] = inode2->i_mapping;
795 } else { 144 } else {
145 pgoff_t tmp = index1;
146 index1 = index2;
147 index2 = tmp;
796 mapping[0] = inode2->i_mapping; 148 mapping[0] = inode2->i_mapping;
797 mapping[1] = inode1->i_mapping; 149 mapping[1] = inode1->i_mapping;
798 } 150 }
799 151
800 page[0] = grab_cache_page_write_begin(mapping[0], index, fl); 152 page[0] = grab_cache_page_write_begin(mapping[0], index1, fl);
801 if (!page[0]) 153 if (!page[0])
802 return -ENOMEM; 154 return -ENOMEM;
803 155
804 page[1] = grab_cache_page_write_begin(mapping[1], index, fl); 156 page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
805 if (!page[1]) { 157 if (!page[1]) {
806 unlock_page(page[0]); 158 unlock_page(page[0]);
807 page_cache_release(page[0]); 159 page_cache_release(page[0]);
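
Taking the two page locks in a fixed inode order is what prevents an ABBA
deadlock when two tasks move extents between the same pair of files; the
hunk above also swaps index1/index2 together with the mappings so each lock
is taken at the correct offset. A userspace sketch of the ordering rule
using pthread mutexes (a model of the rule, not the kernel locking API):

    #include <pthread.h>
    #include <stdio.h>

    static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
    {
        if (a > b) {                /* mirrors the inode1 < inode2 check */
            pthread_mutex_t *t = a;
            a = b;
            b = t;
        }
        pthread_mutex_lock(a);      /* always lower address first... */
        pthread_mutex_lock(b);      /* ...so no two tasks cross-wait */
    }

    int main(void)
    {
        pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        lock_pair(&m1, &m2);
        puts("both page locks held, in address order");
        pthread_mutex_unlock(&m2);
        pthread_mutex_unlock(&m1);
        return 0;
    }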
@@ -893,25 +245,27 @@ out:
893 * @o_filp: file structure of original file 245 * @o_filp: file structure of original file
894 * @donor_inode: donor inode 246 * @donor_inode: donor inode
895 * @orig_page_offset: page index on original file 247 * @orig_page_offset: page index on original file
248 * @donor_page_offset: page index on donor file
896 * @data_offset_in_page: block index where data swapping starts 249 * @data_offset_in_page: block index where data swapping starts
897 * @block_len_in_page: the number of blocks to be swapped 250 * @block_len_in_page: the number of blocks to be swapped
898 * @unwritten: orig extent is unwritten or not 251 * @unwritten: orig extent is unwritten or not
899 * @err: pointer to save return value 252 * @err: pointer to save return value
900 * 253 *
901 * Save the data in original inode blocks and replace original inode extents 254 * Save the data in original inode blocks and replace original inode extents
902 * with donor inode extents by calling mext_replace_branches(). 255 * with donor inode extents by calling ext4_swap_extents().
903 * Finally, write out the saved data in new original inode blocks. Return 256 * Finally, write out the saved data in new original inode blocks. Return
904 * replaced block count. 257 * replaced block count.
905 */ 258 */
906static int 259static int
907move_extent_per_page(struct file *o_filp, struct inode *donor_inode, 260move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
908 pgoff_t orig_page_offset, int data_offset_in_page, 261 pgoff_t orig_page_offset, pgoff_t donor_page_offset,
909 int block_len_in_page, int unwritten, int *err) 262 int data_offset_in_page,
263 int block_len_in_page, int unwritten, int *err)
910{ 264{
911 struct inode *orig_inode = file_inode(o_filp); 265 struct inode *orig_inode = file_inode(o_filp);
912 struct page *pagep[2] = {NULL, NULL}; 266 struct page *pagep[2] = {NULL, NULL};
913 handle_t *handle; 267 handle_t *handle;
914 ext4_lblk_t orig_blk_offset; 268 ext4_lblk_t orig_blk_offset, donor_blk_offset;
915 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 269 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
916 unsigned int w_flags = 0; 270 unsigned int w_flags = 0;
917 unsigned int tmp_data_size, data_size, replaced_size; 271 unsigned int tmp_data_size, data_size, replaced_size;
@@ -939,6 +293,9 @@ again:
939 orig_blk_offset = orig_page_offset * blocks_per_page + 293 orig_blk_offset = orig_page_offset * blocks_per_page +
940 data_offset_in_page; 294 data_offset_in_page;
941 295
296 donor_blk_offset = donor_page_offset * blocks_per_page +
297 data_offset_in_page;
298
942 /* Calculate data_size */ 299 /* Calculate data_size */
943 if ((orig_blk_offset + block_len_in_page - 1) == 300 if ((orig_blk_offset + block_len_in_page - 1) ==
944 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 301 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
@@ -959,7 +316,7 @@ again:
959 replaced_size = data_size; 316 replaced_size = data_size;
960 317
961 *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, 318 *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
962 pagep); 319 donor_page_offset, pagep);
963 if (unlikely(*err < 0)) 320 if (unlikely(*err < 0))
964 goto stop_journal; 321 goto stop_journal;
965 /* 322 /*
@@ -978,7 +335,7 @@ again:
978 if (*err) 335 if (*err)
979 goto drop_data_sem; 336 goto drop_data_sem;
980 337
981 unwritten &= mext_check_coverage(donor_inode, orig_blk_offset, 338 unwritten &= mext_check_coverage(donor_inode, donor_blk_offset,
982 block_len_in_page, 1, err); 339 block_len_in_page, 1, err);
983 if (*err) 340 if (*err)
984 goto drop_data_sem; 341 goto drop_data_sem;
@@ -994,9 +351,10 @@ again:
994 *err = -EBUSY; 351 *err = -EBUSY;
995 goto drop_data_sem; 352 goto drop_data_sem;
996 } 353 }
997 replaced_count = mext_replace_branches(handle, orig_inode, 354 replaced_count = ext4_swap_extents(handle, orig_inode,
998 donor_inode, orig_blk_offset, 355 donor_inode, orig_blk_offset,
999 block_len_in_page, err); 356 donor_blk_offset,
357 block_len_in_page, 1, err);
1000 drop_data_sem: 358 drop_data_sem:
1001 ext4_double_up_write_data_sem(orig_inode, donor_inode); 359 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1002 goto unlock_pages; 360 goto unlock_pages;
@@ -1014,9 +372,9 @@ data_copy:
1014 goto unlock_pages; 372 goto unlock_pages;
1015 } 373 }
1016 ext4_double_down_write_data_sem(orig_inode, donor_inode); 374 ext4_double_down_write_data_sem(orig_inode, donor_inode);
1017 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, 375 replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
1018 orig_blk_offset, 376 orig_blk_offset, donor_blk_offset,
1019 block_len_in_page, err); 377 block_len_in_page, 1, err);
1020 ext4_double_up_write_data_sem(orig_inode, donor_inode); 378 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1021 if (*err) { 379 if (*err) {
1022 if (replaced_count) { 380 if (replaced_count) {
@@ -1061,9 +419,9 @@ repair_branches:
1061 * Try to swap extents back to their original places 419 * Try to swap extents back to their original places
1062 */ 420 */
1063 ext4_double_down_write_data_sem(orig_inode, donor_inode); 421 ext4_double_down_write_data_sem(orig_inode, donor_inode);
1064 replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, 422 replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode,
1065 orig_blk_offset, 423 orig_blk_offset, donor_blk_offset,
1066 block_len_in_page, &err2); 424 block_len_in_page, 0, &err2);
1067 ext4_double_up_write_data_sem(orig_inode, donor_inode); 425 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1068 if (replaced_count != block_len_in_page) { 426 if (replaced_count != block_len_in_page) {
1069 EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), 427 EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
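
The repair path above undoes a swap whose data copy failed by swapping the
same range back; if the undo moves fewer blocks than the original swap, the
mapping cannot be restored and the error is reported. A toy model of that
undo step (hypothetical helper, plain arrays standing in for extents):

    #include <stdio.h>

    static int swap_blocks(int *a, int *b, int n)
    {
        for (int i = 0; i < n; i++) {
            int t = a[i]; a[i] = b[i]; b[i] = t;
        }
        return n;                   /* blocks actually swapped */
    }

    int main(void)
    {
        int orig[4]  = { 1000, 1001, 1002, 1003 };
        int donor[4] = { 2000, 2001, 2002, 2003 };

        int replaced = swap_blocks(orig, donor, 4);
        int write_failed = 1;       /* pretend writing the copied data failed */
        if (write_failed) {
            int undone = swap_blocks(orig, donor, replaced);
            if (undone != replaced)
                fprintf(stderr, "could not restore the original mapping\n");
        }
        printf("orig[0] back to %d\n", orig[0]);
        return 0;
    }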
@@ -1093,10 +451,14 @@ mext_check_arguments(struct inode *orig_inode,
1093 struct inode *donor_inode, __u64 orig_start, 451 struct inode *donor_inode, __u64 orig_start,
1094 __u64 donor_start, __u64 *len) 452 __u64 donor_start, __u64 *len)
1095{ 453{
1096 ext4_lblk_t orig_blocks, donor_blocks; 454 __u64 orig_eof, donor_eof;
1097 unsigned int blkbits = orig_inode->i_blkbits; 455 unsigned int blkbits = orig_inode->i_blkbits;
1098 unsigned int blocksize = 1 << blkbits; 456 unsigned int blocksize = 1 << blkbits;
1099 457
458 orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits;
459 donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits;
460
461
1100 if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { 462 if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
1101 ext4_debug("ext4 move extent: suid or sgid is set" 463 ext4_debug("ext4 move extent: suid or sgid is set"
1102 " to donor file [ino:orig %lu, donor %lu]\n", 464 " to donor file [ino:orig %lu, donor %lu]\n",
@@ -1112,7 +474,7 @@ mext_check_arguments(struct inode *orig_inode,
1112 ext4_debug("ext4 move extent: The argument files should " 474 ext4_debug("ext4 move extent: The argument files should "
1113 "not be swapfile [ino:orig %lu, donor %lu]\n", 475 "not be swapfile [ino:orig %lu, donor %lu]\n",
1114 orig_inode->i_ino, donor_inode->i_ino); 476 orig_inode->i_ino, donor_inode->i_ino);
1115 return -EINVAL; 477 return -EBUSY;
1116 } 478 }
1117 479
1118 /* Ext4 move extent supports only extent based file */ 480 /* Ext4 move extent supports only extent based file */
@@ -1132,67 +494,28 @@ mext_check_arguments(struct inode *orig_inode,
1132 } 494 }
1133 495
1134 /* Start offset should be same */ 496 /* Start offset should be same */
1135 if (orig_start != donor_start) { 497 if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
498 (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
1136 ext4_debug("ext4 move extent: orig and donor's start " 499 ext4_debug("ext4 move extent: orig and donor's start "
1137 "offset are not same [ino:orig %lu, donor %lu]\n", 500 "offset are not alligned [ino:orig %lu, donor %lu]\n",
1138 orig_inode->i_ino, donor_inode->i_ino); 501 orig_inode->i_ino, donor_inode->i_ino);
1139 return -EINVAL; 502 return -EINVAL;
1140 } 503 }
1141 504
1142 if ((orig_start >= EXT_MAX_BLOCKS) || 505 if ((orig_start >= EXT_MAX_BLOCKS) ||
506 (donor_start >= EXT_MAX_BLOCKS) ||
1143 (*len > EXT_MAX_BLOCKS) || 507 (*len > EXT_MAX_BLOCKS) ||
508 (donor_start + *len >= EXT_MAX_BLOCKS) ||
1144 (orig_start + *len >= EXT_MAX_BLOCKS)) { 509 (orig_start + *len >= EXT_MAX_BLOCKS)) {
1145 ext4_debug("ext4 move extent: Can't handle over [%u] blocks " 510 ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
1146 "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, 511 "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
1147 orig_inode->i_ino, donor_inode->i_ino); 512 orig_inode->i_ino, donor_inode->i_ino);
1148 return -EINVAL; 513 return -EINVAL;
1149 } 514 }
1150 515 if (orig_eof < orig_start + *len - 1)
1151 if (orig_inode->i_size > donor_inode->i_size) { 516 *len = orig_eof - orig_start;
1152 donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; 517 if (donor_eof < donor_start + *len - 1)
1153 /* TODO: eliminate this artificial restriction */ 518 *len = donor_eof - donor_start;
1154 if (orig_start >= donor_blocks) {
1155 ext4_debug("ext4 move extent: orig start offset "
1156 "[%llu] should be less than donor file blocks "
1157 "[%u] [ino:orig %lu, donor %lu]\n",
1158 orig_start, donor_blocks,
1159 orig_inode->i_ino, donor_inode->i_ino);
1160 return -EINVAL;
1161 }
1162
1163 /* TODO: eliminate this artificial restriction */
1164 if (orig_start + *len > donor_blocks) {
1165 ext4_debug("ext4 move extent: End offset [%llu] should "
1166 "be less than donor file blocks [%u]."
1167 "So adjust length from %llu to %llu "
1168 "[ino:orig %lu, donor %lu]\n",
1169 orig_start + *len, donor_blocks,
1170 *len, donor_blocks - orig_start,
1171 orig_inode->i_ino, donor_inode->i_ino);
1172 *len = donor_blocks - orig_start;
1173 }
1174 } else {
1175 orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
1176 if (orig_start >= orig_blocks) {
1177 ext4_debug("ext4 move extent: start offset [%llu] "
1178 "should be less than original file blocks "
1179 "[%u] [ino:orig %lu, donor %lu]\n",
1180 orig_start, orig_blocks,
1181 orig_inode->i_ino, donor_inode->i_ino);
1182 return -EINVAL;
1183 }
1184
1185 if (orig_start + *len > orig_blocks) {
1186 ext4_debug("ext4 move extent: Adjust length "
1187 "from %llu to %llu. Because it should be "
1188 "less than original file blocks "
1189 "[ino:orig %lu, donor %lu]\n",
1190 *len, orig_blocks - orig_start,
1191 orig_inode->i_ino, donor_inode->i_ino);
1192 *len = orig_blocks - orig_start;
1193 }
1194 }
1195
1196 if (!*len) { 519 if (!*len) {
1197 ext4_debug("ext4 move extent: len should not be 0 " 520 ext4_debug("ext4 move extent: len should not be 0 "
1198 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, 521 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
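
The rewritten check replaces the old asymmetric branches with uniform EOF
clamping: both file sizes are rounded up to whole blocks and *len is trimmed
so neither range runs past its file's last block. A worked example with
hypothetical sizes (blkbits = 12, i.e. 4 KiB blocks):

    #include <stdio.h>

    int main(void)
    {
        unsigned blkbits = 12, blocksize = 1u << blkbits;
        unsigned long long i_size = 18 * 4096ULL + 100;  /* 18 blocks + a tail */

        /* Round the size up to blocks, as mext_check_arguments() now does. */
        unsigned long long eof = (i_size + blocksize - 1) >> blkbits;  /* 19 */
        unsigned long long start = 10, len = 30;

        if (eof < start + len - 1)   /* request runs past EOF: trim it */
            len = eof - start;       /* 9 blocks remain movable */

        printf("eof = %llu blocks, clamped len = %llu\n", eof, len);
        return 0;
    }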
@@ -1208,60 +531,26 @@ mext_check_arguments(struct inode *orig_inode,
1208 * 531 *
1209 * @o_filp: file structure of the original file 532 * @o_filp: file structure of the original file
1210 * @d_filp: file structure of the donor file 533 * @d_filp: file structure of the donor file
1211 * @orig_start: start offset in block for orig 534 * @orig_blk: start offset in block for orig
1212 * @donor_start: start offset in block for donor 535 * @donor_blk: start offset in block for donor
1213 * @len: the number of blocks to be moved 536 * @len: the number of blocks to be moved
1214 * @moved_len: moved block length 537 * @moved_len: moved block length
1215 * 538 *
1216 * This function returns 0 and moved block length is set in moved_len 539 * This function returns 0 and moved block length is set in moved_len
1217 * if it succeeds; otherwise it returns an error value. 540 * if it succeeds; otherwise it returns an error value.
1218 * 541 *
1219 * Note: ext4_move_extents() proceeds in the following order.
1220 * 1:ext4_move_extents() calculates the last block number of the extents
1221 * to be moved from the start block number (orig_start) and the number
1222 * of blocks to be moved (len) given as arguments.
1223 * If {orig, donor}_start points into a hole, ext_cur (the current
1224 * extent), holecheck_path, and orig_path are set to the first extent
1225 * after the hole.
1226 * 2:Repeat steps 3 to 5 until holecheck_path points to the last extent,
1227 * or ext_cur exceeds block_end, the last logical block number.
1228 * 3:To get the length of the contiguous area, call mext_next_extent()
1229 * repeatedly, starting from ext_cur (initially holecheck_path), until
1230 * it finds a discontiguous extent, the start logical block number
1231 * exceeds block_end, or the extent points to the last extent.
1232 * 4:Exchange the original inode's data with the donor inode's data
1233 * from orig_page_offset to seq_end_page.
1234 * The start indexes of the data are specified as arguments.
1235 * That of the original inode is orig_page_offset,
1236 * and that of the donor inode is also orig_page_offset
1237 * (to easily handle the blocksize != pagesize case, the offset for
1238 * the donor inode is kept in block units).
1239 * 5:Update holecheck_path and orig_path to point to the next extent,
1240 * then return to step 2.
1241 * 6:Release holecheck_path and orig_path, and set moved_len to the
1242 * number of moved blocks.
1243 * The moved_len lets the caller compute the file offset at which the
1244 * next move extent ioctl should start.
1245 * 7:Return 0 on success, or a negative error value on failure.
1246 */ 542 */
1247int 543int
1248ext4_move_extents(struct file *o_filp, struct file *d_filp, 544ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
1249 __u64 orig_start, __u64 donor_start, __u64 len, 545 __u64 donor_blk, __u64 len, __u64 *moved_len)
1250 __u64 *moved_len)
1251{ 546{
1252 struct inode *orig_inode = file_inode(o_filp); 547 struct inode *orig_inode = file_inode(o_filp);
1253 struct inode *donor_inode = file_inode(d_filp); 548 struct inode *donor_inode = file_inode(d_filp);
1254 struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL; 549 struct ext4_ext_path *path = NULL;
1255 struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
1256 ext4_lblk_t block_start = orig_start;
1257 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
1258 ext4_lblk_t rest_blocks;
1259 pgoff_t orig_page_offset = 0, seq_end_page;
1260 int ret, depth, last_extent = 0;
1261 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 550 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
1262 int data_offset_in_page; 551 ext4_lblk_t o_end, o_start = orig_blk;
1263 int block_len_in_page; 552 ext4_lblk_t d_start = donor_blk;
1264 int unwritten; 553 int ret;
1265 554
1266 if (orig_inode->i_sb != donor_inode->i_sb) { 555 if (orig_inode->i_sb != donor_inode->i_sb) {
1267 ext4_debug("ext4 move extent: The argument files " 556 ext4_debug("ext4 move extent: The argument files "
@@ -1303,121 +592,58 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1303 /* Protect extent tree against block allocations via delalloc */ 592 /* Protect extent tree against block allocations via delalloc */
1304 ext4_double_down_write_data_sem(orig_inode, donor_inode); 593 ext4_double_down_write_data_sem(orig_inode, donor_inode);
1305 /* Check the filesystem environment whether move_extent can be done */ 594 /* Check the filesystem environment whether move_extent can be done */
1306 ret = mext_check_arguments(orig_inode, donor_inode, orig_start, 595 ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
1307 donor_start, &len); 596 donor_blk, &len);
1308 if (ret) 597 if (ret)
1309 goto out; 598 goto out;
599 o_end = o_start + len;
1310 600
1311 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; 601 while (o_start < o_end) {
1312 block_end = block_start + len - 1; 602 struct ext4_extent *ex;
1313 if (file_end < block_end) 603 ext4_lblk_t cur_blk, next_blk;
1314 len -= block_end - file_end; 604 pgoff_t orig_page_index, donor_page_index;
605 int offset_in_page;
606 int unwritten, cur_len;
1315 607
1316 ret = get_ext_path(orig_inode, block_start, &orig_path); 608 ret = get_ext_path(orig_inode, o_start, &path);
1317 if (ret) 609 if (ret)
1318 goto out;
1319
1320 /* Get path structure to check the hole */
1321 ret = get_ext_path(orig_inode, block_start, &holecheck_path);
1322 if (ret)
1323 goto out;
1324
1325 depth = ext_depth(orig_inode);
1326 ext_cur = holecheck_path[depth].p_ext;
1327
1328 /*
1329 * Get proper starting location of block replacement if block_start was
1330 * within the hole.
1331 */
1332 if (le32_to_cpu(ext_cur->ee_block) +
1333 ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
1334 /*
1335 * The hole exists between extents or the tail of
1336 * original file.
1337 */
1338 last_extent = mext_next_extent(orig_inode,
1339 holecheck_path, &ext_cur);
1340 if (last_extent < 0) {
1341 ret = last_extent;
1342 goto out;
1343 }
1344 last_extent = mext_next_extent(orig_inode, orig_path,
1345 &ext_dummy);
1346 if (last_extent < 0) {
1347 ret = last_extent;
1348 goto out; 610 goto out;
1349 } 611 ex = path[path->p_depth].p_ext;
1350 seq_start = le32_to_cpu(ext_cur->ee_block); 612 next_blk = ext4_ext_next_allocated_block(path);
1351 } else if (le32_to_cpu(ext_cur->ee_block) > block_start) 613 cur_blk = le32_to_cpu(ex->ee_block);
1352 /* The hole exists at the beginning of original file. */ 614 cur_len = ext4_ext_get_actual_len(ex);
1353 seq_start = le32_to_cpu(ext_cur->ee_block); 615 /* Check hole before the start pos */
1354 else 616 if (cur_blk + cur_len - 1 < o_start) {
1355 seq_start = block_start; 617 if (next_blk == EXT_MAX_BLOCKS) {
1356 618 o_start = o_end;
1357 /* No blocks within the specified range. */ 619 ret = -ENODATA;
1358 if (le32_to_cpu(ext_cur->ee_block) > block_end) { 620 goto out;
1359 ext4_debug("ext4 move extent: The specified range of file " 621 }
1360 "may be the hole\n"); 622 d_start += next_blk - o_start;
1361 ret = -EINVAL; 623 o_start = next_blk;
1362 goto out;
1363 }
1364
1365 /* Adjust start blocks */
1366 add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
1367 ext4_ext_get_actual_len(ext_cur), block_end + 1) -
1368 max(le32_to_cpu(ext_cur->ee_block), block_start);
1369
1370 while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
1371 seq_blocks += add_blocks;
1372
1373 /* Adjust tail blocks */
1374 if (seq_start + seq_blocks - 1 > block_end)
1375 seq_blocks = block_end - seq_start + 1;
1376
1377 ext_prev = ext_cur;
1378 last_extent = mext_next_extent(orig_inode, holecheck_path,
1379 &ext_cur);
1380 if (last_extent < 0) {
1381 ret = last_extent;
1382 break;
1383 }
1384 add_blocks = ext4_ext_get_actual_len(ext_cur);
1385
1386 /*
1387 * Extend the length of contiguous block (seq_blocks)
1388 * if extents are contiguous.
1389 */
1390 if (ext4_can_extents_be_merged(orig_inode,
1391 ext_prev, ext_cur) &&
1392 block_end >= le32_to_cpu(ext_cur->ee_block) &&
1393 !last_extent)
1394 continue; 624 continue;
1395 625 /* Check hole after the start pos */
1396 /* Is original extent is unwritten */ 626 } else if (cur_blk > o_start) {
1397 unwritten = ext4_ext_is_unwritten(ext_prev); 627 /* Skip hole */
1398 628 d_start += cur_blk - o_start;
1399 data_offset_in_page = seq_start % blocks_per_page; 629 o_start = cur_blk;
1400 630 /* Extent inside requested range? */
1401 /* 631 if (cur_blk >= o_end)
1402 * Calculate data blocks count that should be swapped 632 goto out;
1403 * at the first page. 633 } else { /* in_range(o_start, o_blk, o_len) */
1404 */ 634 cur_len += cur_blk - o_start;
1405 if (data_offset_in_page + seq_blocks > blocks_per_page) {
1406 /* Swapped blocks are across pages */
1407 block_len_in_page =
1408 blocks_per_page - data_offset_in_page;
1409 } else {
1410 /* Swapped blocks are in a page */
1411 block_len_in_page = seq_blocks;
1412 } 635 }
1413 636 unwritten = ext4_ext_is_unwritten(ex);
1414 orig_page_offset = seq_start >> 637 if (o_end - o_start < cur_len)
1415 (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); 638 cur_len = o_end - o_start;
1416 seq_end_page = (seq_start + seq_blocks - 1) >> 639
1417 (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); 640 orig_page_index = o_start >> (PAGE_CACHE_SHIFT -
1418 seq_start = le32_to_cpu(ext_cur->ee_block); 641 orig_inode->i_blkbits);
1419 rest_blocks = seq_blocks; 642 donor_page_index = d_start >> (PAGE_CACHE_SHIFT -
1420 643 donor_inode->i_blkbits);
644 offset_in_page = o_start % blocks_per_page;
645 if (cur_len > blocks_per_page - offset_in_page)
646 cur_len = blocks_per_page - offset_in_page;
1421 /* 647 /*
1422 * Up semaphore to avoid following problems: 648 * Up semaphore to avoid following problems:
1423 * a. transaction deadlock among ext4_journal_start, 649 * a. transaction deadlock among ext4_journal_start,
@@ -1426,77 +652,29 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1426 * in move_extent_per_page 652 * in move_extent_per_page
1427 */ 653 */
1428 ext4_double_up_write_data_sem(orig_inode, donor_inode); 654 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1429 655 /* Swap original branches with new branches */
1430 while (orig_page_offset <= seq_end_page) { 656 move_extent_per_page(o_filp, donor_inode,
1431 657 orig_page_index, donor_page_index,
1432 /* Swap original branches with new branches */ 658 offset_in_page, cur_len,
1433 block_len_in_page = move_extent_per_page( 659 unwritten, &ret);
1434 o_filp, donor_inode,
1435 orig_page_offset,
1436 data_offset_in_page,
1437 block_len_in_page,
1438 unwritten, &ret);
1439
1440 /* Count how many blocks we have exchanged */
1441 *moved_len += block_len_in_page;
1442 if (ret < 0)
1443 break;
1444 if (*moved_len > len) {
1445 EXT4_ERROR_INODE(orig_inode,
1446 "We replaced blocks too much! "
1447 "sum of replaced: %llu requested: %llu",
1448 *moved_len, len);
1449 ret = -EIO;
1450 break;
1451 }
1452
1453 orig_page_offset++;
1454 data_offset_in_page = 0;
1455 rest_blocks -= block_len_in_page;
1456 if (rest_blocks > blocks_per_page)
1457 block_len_in_page = blocks_per_page;
1458 else
1459 block_len_in_page = rest_blocks;
1460 }
1461
1462 ext4_double_down_write_data_sem(orig_inode, donor_inode); 660 ext4_double_down_write_data_sem(orig_inode, donor_inode);
1463 if (ret < 0) 661 if (ret < 0)
1464 break; 662 break;
1465 663 o_start += cur_len;
1466 /* Decrease buffer counter */ 664 d_start += cur_len;
1467 if (holecheck_path)
1468 ext4_ext_drop_refs(holecheck_path);
1469 ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
1470 if (ret)
1471 break;
1472 depth = holecheck_path->p_depth;
1473
1474 /* Decrease buffer counter */
1475 if (orig_path)
1476 ext4_ext_drop_refs(orig_path);
1477 ret = get_ext_path(orig_inode, seq_start, &orig_path);
1478 if (ret)
1479 break;
1480
1481 ext_cur = holecheck_path[depth].p_ext;
1482 add_blocks = ext4_ext_get_actual_len(ext_cur);
1483 seq_blocks = 0;
1484
1485 } 665 }
666 *moved_len = o_start - orig_blk;
667 if (*moved_len > len)
668 *moved_len = len;
669
1486out: 670out:
1487 if (*moved_len) { 671 if (*moved_len) {
1488 ext4_discard_preallocations(orig_inode); 672 ext4_discard_preallocations(orig_inode);
1489 ext4_discard_preallocations(donor_inode); 673 ext4_discard_preallocations(donor_inode);
1490 } 674 }
1491 675
1492 if (orig_path) { 676 ext4_ext_drop_refs(path);
1493 ext4_ext_drop_refs(orig_path); 677 kfree(path);
1494 kfree(orig_path);
1495 }
1496 if (holecheck_path) {
1497 ext4_ext_drop_refs(holecheck_path);
1498 kfree(holecheck_path);
1499 }
1500 ext4_double_up_write_data_sem(orig_inode, donor_inode); 678 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1501 ext4_inode_resume_unlocked_dio(orig_inode); 679 ext4_inode_resume_unlocked_dio(orig_inode);
1502 ext4_inode_resume_unlocked_dio(donor_inode); 680 ext4_inode_resume_unlocked_dio(donor_inode);
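
The rewritten main loop above replaces the holecheck_path machinery: it
walks the original file's extents, keeps o_start and d_start advancing in
lockstep across holes, clamps each chunk to the requested range and to one
page, and swaps it via move_extent_per_page(). A compact userspace model of
the walk (toy extent table, hypothetical offsets):

    #include <stdio.h>

    struct toy_extent { unsigned block, len; };

    int main(void)
    {
        /* Extents of the original file, with a hole at blocks [4, 9]. */
        struct toy_extent map[] = { { 0, 4 }, { 10, 8 } };
        unsigned o_start = 2, o_end = 14, d_start = 2;

        for (unsigned i = 0; i < 2 && o_start < o_end; ) {
            unsigned cur_blk = map[i].block, cur_len = map[i].len;

            if (cur_blk + cur_len - 1 < o_start) {  /* extent before start */
                i++;
                continue;
            }
            if (cur_blk > o_start) {                /* hole: skip on both files */
                d_start += cur_blk - o_start;
                o_start = cur_blk;
            } else {                                /* start inside this extent */
                cur_len -= o_start - cur_blk;
            }
            if (o_end - o_start < cur_len)          /* clamp to the request */
                cur_len = o_end - o_start;

            printf("swap %u blocks: orig %u <-> donor %u\n",
                   cur_len, o_start, d_start);
            o_start += cur_len;
            d_start += cur_len;
            if (o_start >= map[i].block + map[i].len)
                i++;
        }
        return 0;
    }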
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 603e4ebbd0ac..426211882f72 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -53,7 +53,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
53 ext4_lblk_t *block) 53 ext4_lblk_t *block)
54{ 54{
55 struct buffer_head *bh; 55 struct buffer_head *bh;
56 int err = 0; 56 int err;
57 57
58 if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && 58 if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
59 ((inode->i_size >> 10) >= 59 ((inode->i_size >> 10) >=
@@ -62,9 +62,9 @@ static struct buffer_head *ext4_append(handle_t *handle,
62 62
63 *block = inode->i_size >> inode->i_sb->s_blocksize_bits; 63 *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
64 64
65 bh = ext4_bread(handle, inode, *block, 1, &err); 65 bh = ext4_bread(handle, inode, *block, 1);
66 if (!bh) 66 if (IS_ERR(bh))
67 return ERR_PTR(err); 67 return bh;
68 inode->i_size += inode->i_sb->s_blocksize; 68 inode->i_size += inode->i_sb->s_blocksize;
69 EXT4_I(inode)->i_disksize = inode->i_size; 69 EXT4_I(inode)->i_disksize = inode->i_size;
70 BUFFER_TRACE(bh, "get_write_access"); 70 BUFFER_TRACE(bh, "get_write_access");
@@ -94,20 +94,20 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
94{ 94{
95 struct buffer_head *bh; 95 struct buffer_head *bh;
96 struct ext4_dir_entry *dirent; 96 struct ext4_dir_entry *dirent;
97 int err = 0, is_dx_block = 0; 97 int is_dx_block = 0;
98 98
99 bh = ext4_bread(NULL, inode, block, 0, &err); 99 bh = ext4_bread(NULL, inode, block, 0);
100 if (!bh) { 100 if (IS_ERR(bh)) {
101 if (err == 0) {
102 ext4_error_inode(inode, __func__, line, block,
103 "Directory hole found");
104 return ERR_PTR(-EIO);
105 }
106 __ext4_warning(inode->i_sb, __func__, line, 101 __ext4_warning(inode->i_sb, __func__, line,
107 "error reading directory block " 102 "error %ld reading directory block "
108 "(ino %lu, block %lu)", inode->i_ino, 103 "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino,
109 (unsigned long) block); 104 (unsigned long) block);
110 return ERR_PTR(err); 105
106 return bh;
107 }
108 if (!bh) {
109 ext4_error_inode(inode, __func__, line, block, "Directory hole found");
110 return ERR_PTR(-EIO);
111 } 111 }
112 dirent = (struct ext4_dir_entry *) bh->b_data; 112 dirent = (struct ext4_dir_entry *) bh->b_data;
113 /* Determine whether or not we have an index block */ 113 /* Determine whether or not we have an index block */
@@ -124,8 +124,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
124 "directory leaf block found instead of index block"); 124 "directory leaf block found instead of index block");
125 return ERR_PTR(-EIO); 125 return ERR_PTR(-EIO);
126 } 126 }
127 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 127 if (!ext4_has_metadata_csum(inode->i_sb) ||
128 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) ||
129 buffer_verified(bh)) 128 buffer_verified(bh))
130 return bh; 129 return bh;
131 130
@@ -253,8 +252,7 @@ static unsigned dx_node_limit(struct inode *dir);
253static struct dx_frame *dx_probe(const struct qstr *d_name, 252static struct dx_frame *dx_probe(const struct qstr *d_name,
254 struct inode *dir, 253 struct inode *dir,
255 struct dx_hash_info *hinfo, 254 struct dx_hash_info *hinfo,
256 struct dx_frame *frame, 255 struct dx_frame *frame);
257 int *err);
258static void dx_release(struct dx_frame *frames); 256static void dx_release(struct dx_frame *frames);
259static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, 257static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
260 struct dx_hash_info *hinfo, struct dx_map_entry map[]); 258 struct dx_hash_info *hinfo, struct dx_map_entry map[]);
@@ -270,8 +268,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
270 __u32 *start_hash); 268 __u32 *start_hash);
271static struct buffer_head * ext4_dx_find_entry(struct inode *dir, 269static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
272 const struct qstr *d_name, 270 const struct qstr *d_name,
273 struct ext4_dir_entry_2 **res_dir, 271 struct ext4_dir_entry_2 **res_dir);
274 int *err);
275static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, 272static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
276 struct inode *inode); 273 struct inode *inode);
277 274
@@ -340,8 +337,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
340{ 337{
341 struct ext4_dir_entry_tail *t; 338 struct ext4_dir_entry_tail *t;
342 339
343 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 340 if (!ext4_has_metadata_csum(inode->i_sb))
344 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
345 return 1; 341 return 1;
346 342
347 t = get_dirent_tail(inode, dirent); 343 t = get_dirent_tail(inode, dirent);
@@ -362,8 +358,7 @@ static void ext4_dirent_csum_set(struct inode *inode,
362{ 358{
363 struct ext4_dir_entry_tail *t; 359 struct ext4_dir_entry_tail *t;
364 360
365 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 361 if (!ext4_has_metadata_csum(inode->i_sb))
366 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
367 return; 362 return;
368 363
369 t = get_dirent_tail(inode, dirent); 364 t = get_dirent_tail(inode, dirent);
@@ -438,8 +433,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
438 struct dx_tail *t; 433 struct dx_tail *t;
439 int count_offset, limit, count; 434 int count_offset, limit, count;
440 435
441 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 436 if (!ext4_has_metadata_csum(inode->i_sb))
442 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
443 return 1; 437 return 1;
444 438
445 c = get_dx_countlimit(inode, dirent, &count_offset); 439 c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -468,8 +462,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
468 struct dx_tail *t; 462 struct dx_tail *t;
469 int count_offset, limit, count; 463 int count_offset, limit, count;
470 464
471 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 465 if (!ext4_has_metadata_csum(inode->i_sb))
472 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
473 return; 466 return;
474 467
475 c = get_dx_countlimit(inode, dirent, &count_offset); 468 c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -557,8 +550,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
557 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - 550 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
558 EXT4_DIR_REC_LEN(2) - infosize; 551 EXT4_DIR_REC_LEN(2) - infosize;
559 552
560 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 553 if (ext4_has_metadata_csum(dir->i_sb))
561 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
562 entry_space -= sizeof(struct dx_tail); 554 entry_space -= sizeof(struct dx_tail);
563 return entry_space / sizeof(struct dx_entry); 555 return entry_space / sizeof(struct dx_entry);
564} 556}
@@ -567,8 +559,7 @@ static inline unsigned dx_node_limit(struct inode *dir)
567{ 559{
568 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); 560 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
569 561
570 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 562 if (ext4_has_metadata_csum(dir->i_sb))
571 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
572 entry_space -= sizeof(struct dx_tail); 563 entry_space -= sizeof(struct dx_tail);
573 return entry_space / sizeof(struct dx_entry); 564 return entry_space / sizeof(struct dx_entry);
574} 565}
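
Each of these hunks collapses the repeated EXT4_HAS_RO_COMPAT_FEATURE(...,
METADATA_CSUM) test into ext4_has_metadata_csum(), presumably added in
fs/ext4/ext4.h, which this series also touches. A userspace sketch of what
such a wrapper plausibly checks, the feature bit plus a loaded checksum
driver (toy superblock type and constant; an assumption about the real
helper's shape):

    #include <stdio.h>

    struct toy_sb { unsigned ro_compat; void *chksum_driver; };
    #define TOY_RO_COMPAT_METADATA_CSUM 0x0400

    static int has_metadata_csum(const struct toy_sb *sb)
    {
        return (sb->ro_compat & TOY_RO_COMPAT_METADATA_CSUM) &&
               sb->chksum_driver != NULL;
    }

    int main(void)
    {
        struct toy_sb sb = { .ro_compat = TOY_RO_COMPAT_METADATA_CSUM,
                             .chksum_driver = (void *)1 };
        printf("metadata_csum enabled: %d\n", has_metadata_csum(&sb));
        return 0;
    }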
@@ -641,7 +632,9 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
641 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; 632 u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
642 struct stats stats; 633 struct stats stats;
643 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); 634 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
644 if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue; 635 bh = ext4_bread(NULL,dir, block, 0);
636 if (!bh || IS_ERR(bh))
637 continue;
645 stats = levels? 638 stats = levels?
646 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): 639 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
647 dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); 640 dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
@@ -669,29 +662,25 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
669 */ 662 */
670static struct dx_frame * 663static struct dx_frame *
671dx_probe(const struct qstr *d_name, struct inode *dir, 664dx_probe(const struct qstr *d_name, struct inode *dir,
672 struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) 665 struct dx_hash_info *hinfo, struct dx_frame *frame_in)
673{ 666{
674 unsigned count, indirect; 667 unsigned count, indirect;
675 struct dx_entry *at, *entries, *p, *q, *m; 668 struct dx_entry *at, *entries, *p, *q, *m;
676 struct dx_root *root; 669 struct dx_root *root;
677 struct buffer_head *bh;
678 struct dx_frame *frame = frame_in; 670 struct dx_frame *frame = frame_in;
671 struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
679 u32 hash; 672 u32 hash;
680 673
681 frame->bh = NULL; 674 frame->bh = ext4_read_dirblock(dir, 0, INDEX);
682 bh = ext4_read_dirblock(dir, 0, INDEX); 675 if (IS_ERR(frame->bh))
683 if (IS_ERR(bh)) { 676 return (struct dx_frame *) frame->bh;
684 *err = PTR_ERR(bh); 677
685 goto fail; 678 root = (struct dx_root *) frame->bh->b_data;
686 }
687 root = (struct dx_root *) bh->b_data;
688 if (root->info.hash_version != DX_HASH_TEA && 679 if (root->info.hash_version != DX_HASH_TEA &&
689 root->info.hash_version != DX_HASH_HALF_MD4 && 680 root->info.hash_version != DX_HASH_HALF_MD4 &&
690 root->info.hash_version != DX_HASH_LEGACY) { 681 root->info.hash_version != DX_HASH_LEGACY) {
691 ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", 682 ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
692 root->info.hash_version); 683 root->info.hash_version);
693 brelse(bh);
694 *err = ERR_BAD_DX_DIR;
695 goto fail; 684 goto fail;
696 } 685 }
697 hinfo->hash_version = root->info.hash_version; 686 hinfo->hash_version = root->info.hash_version;
@@ -705,16 +694,12 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
705 if (root->info.unused_flags & 1) { 694 if (root->info.unused_flags & 1) {
706 ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", 695 ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
707 root->info.unused_flags); 696 root->info.unused_flags);
708 brelse(bh);
709 *err = ERR_BAD_DX_DIR;
710 goto fail; 697 goto fail;
711 } 698 }
712 699
713 if ((indirect = root->info.indirect_levels) > 1) { 700 if ((indirect = root->info.indirect_levels) > 1) {
714 ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", 701 ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
715 root->info.indirect_levels); 702 root->info.indirect_levels);
716 brelse(bh);
717 *err = ERR_BAD_DX_DIR;
718 goto fail; 703 goto fail;
719 } 704 }
720 705
@@ -724,27 +709,21 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
724 if (dx_get_limit(entries) != dx_root_limit(dir, 709 if (dx_get_limit(entries) != dx_root_limit(dir,
725 root->info.info_length)) { 710 root->info.info_length)) {
726 ext4_warning(dir->i_sb, "dx entry: limit != root limit"); 711 ext4_warning(dir->i_sb, "dx entry: limit != root limit");
727 brelse(bh);
728 *err = ERR_BAD_DX_DIR;
729 goto fail; 712 goto fail;
730 } 713 }
731 714
732 dxtrace(printk("Look up %x", hash)); 715 dxtrace(printk("Look up %x", hash));
733 while (1) 716 while (1) {
734 {
735 count = dx_get_count(entries); 717 count = dx_get_count(entries);
736 if (!count || count > dx_get_limit(entries)) { 718 if (!count || count > dx_get_limit(entries)) {
737 ext4_warning(dir->i_sb, 719 ext4_warning(dir->i_sb,
738 "dx entry: no count or count > limit"); 720 "dx entry: no count or count > limit");
739 brelse(bh); 721 goto fail;
740 *err = ERR_BAD_DX_DIR;
741 goto fail2;
742 } 722 }
743 723
744 p = entries + 1; 724 p = entries + 1;
745 q = entries + count - 1; 725 q = entries + count - 1;
746 while (p <= q) 726 while (p <= q) {
747 {
748 m = p + (q - p)/2; 727 m = p + (q - p)/2;
749 dxtrace(printk(".")); 728 dxtrace(printk("."));
750 if (dx_get_hash(m) > hash) 729 if (dx_get_hash(m) > hash)
@@ -753,8 +732,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
753 p = m + 1; 732 p = m + 1;
754 } 733 }
755 734
756 if (0) // linear search cross check 735 if (0) { // linear search cross check
757 {
758 unsigned n = count - 1; 736 unsigned n = count - 1;
759 at = entries; 737 at = entries;
760 while (n--) 738 while (n--)
@@ -771,38 +749,35 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
771 749
772 at = p - 1; 750 at = p - 1;
773 dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); 751 dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
774 frame->bh = bh;
775 frame->entries = entries; 752 frame->entries = entries;
776 frame->at = at; 753 frame->at = at;
777 if (!indirect--) return frame; 754 if (!indirect--)
778 bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); 755 return frame;
779 if (IS_ERR(bh)) { 756 frame++;
780 *err = PTR_ERR(bh); 757 frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
781 goto fail2; 758 if (IS_ERR(frame->bh)) {
759 ret_err = (struct dx_frame *) frame->bh;
760 frame->bh = NULL;
761 goto fail;
782 } 762 }
783 entries = ((struct dx_node *) bh->b_data)->entries; 763 entries = ((struct dx_node *) frame->bh->b_data)->entries;
784 764
785 if (dx_get_limit(entries) != dx_node_limit (dir)) { 765 if (dx_get_limit(entries) != dx_node_limit (dir)) {
786 ext4_warning(dir->i_sb, 766 ext4_warning(dir->i_sb,
787 "dx entry: limit != node limit"); 767 "dx entry: limit != node limit");
788 brelse(bh); 768 goto fail;
789 *err = ERR_BAD_DX_DIR;
790 goto fail2;
791 } 769 }
792 frame++;
793 frame->bh = NULL;
794 } 770 }
795fail2: 771fail:
796 while (frame >= frame_in) { 772 while (frame >= frame_in) {
797 brelse(frame->bh); 773 brelse(frame->bh);
798 frame--; 774 frame--;
799 } 775 }
800fail: 776 if (ret_err == ERR_PTR(ERR_BAD_DX_DIR))
801 if (*err == ERR_BAD_DX_DIR)
802 ext4_warning(dir->i_sb, 777 ext4_warning(dir->i_sb,
803 "Corrupt dir inode %lu, running e2fsck is " 778 "Corrupt dir inode %lu, running e2fsck is "
804 "recommended.", dir->i_ino); 779 "recommended.", dir->i_ino);
805 return NULL; 780 return ret_err;
806} 781}
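
dx_probe() now stores each level's buffer directly in frame->bh and reports
failure through an ERR_PTR return instead of NULL plus an out-parameter, so
one unwind loop can release every frame from the current one back to
frame_in. A toy model of that unwind (brelse(), like kfree(), tolerates a
NULL buffer, so half-filled frames are fine):

    #include <stdio.h>

    struct toy_frame { const char *bh; };

    int main(void)
    {
        struct toy_frame frames[3] = { { "root" }, { "node" }, { NULL } };
        struct toy_frame *frame_in = frames, *frame = &frames[1];

        /* Failure while reading level 2: release what we hold so far. */
        while (frame >= frame_in) {
            if (frame->bh)
                printf("brelse(%s)\n", frame->bh);
            frame--;
        }
        return 0;
    }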
807 782
808static void dx_release (struct dx_frame *frames) 783static void dx_release (struct dx_frame *frames)
@@ -988,9 +963,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
988 } 963 }
989 hinfo.hash = start_hash; 964 hinfo.hash = start_hash;
990 hinfo.minor_hash = 0; 965 hinfo.minor_hash = 0;
991 frame = dx_probe(NULL, dir, &hinfo, frames, &err); 966 frame = dx_probe(NULL, dir, &hinfo, frames);
992 if (!frame) 967 if (IS_ERR(frame))
993 return err; 968 return PTR_ERR(frame);
994 969
995 /* Add '.' and '..' from the htree header */ 970 /* Add '.' and '..' from the htree header */
996 if (!start_hash && !start_minor_hash) { 971 if (!start_hash && !start_minor_hash) {
@@ -1227,8 +1202,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
1227 buffer */ 1202 buffer */
1228 int num = 0; 1203 int num = 0;
1229 ext4_lblk_t nblocks; 1204 ext4_lblk_t nblocks;
1230 int i, err = 0; 1205 int i, namelen;
1231 int namelen;
1232 1206
1233 *res_dir = NULL; 1207 *res_dir = NULL;
1234 sb = dir->i_sb; 1208 sb = dir->i_sb;
@@ -1258,17 +1232,13 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
1258 goto restart; 1232 goto restart;
1259 } 1233 }
1260 if (is_dx(dir)) { 1234 if (is_dx(dir)) {
1261 bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); 1235 bh = ext4_dx_find_entry(dir, d_name, res_dir);
1262 /* 1236 /*
1263 * On success, or if the error was file not found, 1237 * On success, or if the error was file not found,
1264 * return. Otherwise, fall back to doing a search the 1238 * return. Otherwise, fall back to doing a search the
1265 * old-fashioned way. 1239 * old-fashioned way.
1266 */ 1240 */
1267 if (err == -ENOENT) 1241 if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR)
1268 return NULL;
1269 if (err && err != ERR_BAD_DX_DIR)
1270 return ERR_PTR(err);
1271 if (bh)
1272 return bh; 1242 return bh;
1273 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " 1243 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
1274 "falling back\n")); 1244 "falling back\n"));
@@ -1298,10 +1268,10 @@ restart:
1298 break; 1268 break;
1299 } 1269 }
1300 num++; 1270 num++;
1301 bh = ext4_getblk(NULL, dir, b++, 0, &err); 1271 bh = ext4_getblk(NULL, dir, b++, 0);
1302 if (unlikely(err)) { 1272 if (unlikely(IS_ERR(bh))) {
1303 if (ra_max == 0) 1273 if (ra_max == 0)
1304 return ERR_PTR(err); 1274 return bh;
1305 break; 1275 break;
1306 } 1276 }
1307 bh_use[ra_max] = bh; 1277 bh_use[ra_max] = bh;
@@ -1366,7 +1336,7 @@ cleanup_and_exit:
1366} 1336}
1367 1337
1368static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, 1338static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
1369 struct ext4_dir_entry_2 **res_dir, int *err) 1339 struct ext4_dir_entry_2 **res_dir)
1370{ 1340{
1371 struct super_block * sb = dir->i_sb; 1341 struct super_block * sb = dir->i_sb;
1372 struct dx_hash_info hinfo; 1342 struct dx_hash_info hinfo;
@@ -1375,25 +1345,23 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1375 ext4_lblk_t block; 1345 ext4_lblk_t block;
1376 int retval; 1346 int retval;
1377 1347
1378 if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) 1348 frame = dx_probe(d_name, dir, &hinfo, frames);
1379 return NULL; 1349 if (IS_ERR(frame))
1350 return (struct buffer_head *) frame;
1380 do { 1351 do {
1381 block = dx_get_block(frame->at); 1352 block = dx_get_block(frame->at);
1382 bh = ext4_read_dirblock(dir, block, DIRENT); 1353 bh = ext4_read_dirblock(dir, block, DIRENT);
1383 if (IS_ERR(bh)) { 1354 if (IS_ERR(bh))
1384 *err = PTR_ERR(bh);
1385 goto errout; 1355 goto errout;
1386 } 1356
1387 retval = search_dirblock(bh, dir, d_name, 1357 retval = search_dirblock(bh, dir, d_name,
1388 block << EXT4_BLOCK_SIZE_BITS(sb), 1358 block << EXT4_BLOCK_SIZE_BITS(sb),
1389 res_dir); 1359 res_dir);
1390 if (retval == 1) { /* Success! */ 1360 if (retval == 1)
1391 dx_release(frames); 1361 goto success;
1392 return bh;
1393 }
1394 brelse(bh); 1362 brelse(bh);
1395 if (retval == -1) { 1363 if (retval == -1) {
1396 *err = ERR_BAD_DX_DIR; 1364 bh = ERR_PTR(ERR_BAD_DX_DIR);
1397 goto errout; 1365 goto errout;
1398 } 1366 }
1399 1367
@@ -1402,18 +1370,19 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1402 frames, NULL); 1370 frames, NULL);
1403 if (retval < 0) { 1371 if (retval < 0) {
1404 ext4_warning(sb, 1372 ext4_warning(sb,
1405 "error reading index page in directory #%lu", 1373 "error %d reading index page in directory #%lu",
1406 dir->i_ino); 1374 retval, dir->i_ino);
1407 *err = retval; 1375 bh = ERR_PTR(retval);
1408 goto errout; 1376 goto errout;
1409 } 1377 }
1410 } while (retval == 1); 1378 } while (retval == 1);
1411 1379
1412 *err = -ENOENT; 1380 bh = NULL;
1413errout: 1381errout:
1414 dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); 1382 dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
1415 dx_release (frames); 1383success:
1416 return NULL; 1384 dx_release(frames);
1385 return bh;
1417} 1386}
1418 1387
1419static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 1388static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@ -1441,7 +1410,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
1441 dentry); 1410 dentry);
1442 return ERR_PTR(-EIO); 1411 return ERR_PTR(-EIO);
1443 } 1412 }
1444 inode = ext4_iget(dir->i_sb, ino); 1413 inode = ext4_iget_normal(dir->i_sb, ino);
1445 if (inode == ERR_PTR(-ESTALE)) { 1414 if (inode == ERR_PTR(-ESTALE)) {
1446 EXT4_ERROR_INODE(dir, 1415 EXT4_ERROR_INODE(dir,
1447 "deleted inode referenced: %u", 1416 "deleted inode referenced: %u",
@@ -1474,7 +1443,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
1474 return ERR_PTR(-EIO); 1443 return ERR_PTR(-EIO);
1475 } 1444 }
1476 1445
1477 return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); 1446 return d_obtain_alias(ext4_iget_normal(child->d_inode->i_sb, ino));
1478} 1447}
1479 1448
1480/* 1449/*
@@ -1533,7 +1502,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
1533 */ 1502 */
1534static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, 1503static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1535 struct buffer_head **bh,struct dx_frame *frame, 1504 struct buffer_head **bh,struct dx_frame *frame,
1536 struct dx_hash_info *hinfo, int *error) 1505 struct dx_hash_info *hinfo)
1537{ 1506{
1538 unsigned blocksize = dir->i_sb->s_blocksize; 1507 unsigned blocksize = dir->i_sb->s_blocksize;
1539 unsigned count, continued; 1508 unsigned count, continued;
@@ -1548,16 +1517,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1548 int csum_size = 0; 1517 int csum_size = 0;
1549 int err = 0, i; 1518 int err = 0, i;
1550 1519
1551 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 1520 if (ext4_has_metadata_csum(dir->i_sb))
1552 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1553 csum_size = sizeof(struct ext4_dir_entry_tail); 1521 csum_size = sizeof(struct ext4_dir_entry_tail);
1554 1522
1555 bh2 = ext4_append(handle, dir, &newblock); 1523 bh2 = ext4_append(handle, dir, &newblock);
1556 if (IS_ERR(bh2)) { 1524 if (IS_ERR(bh2)) {
1557 brelse(*bh); 1525 brelse(*bh);
1558 *bh = NULL; 1526 *bh = NULL;
1559 *error = PTR_ERR(bh2); 1527 return (struct ext4_dir_entry_2 *) bh2;
1560 return NULL;
1561 } 1528 }
1562 1529
1563 BUFFER_TRACE(*bh, "get_write_access"); 1530 BUFFER_TRACE(*bh, "get_write_access");
@@ -1617,8 +1584,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1617 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); 1584 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1618 1585
1619 /* Which block gets the new entry? */ 1586 /* Which block gets the new entry? */
1620 if (hinfo->hash >= hash2) 1587 if (hinfo->hash >= hash2) {
1621 {
1622 swap(*bh, bh2); 1588 swap(*bh, bh2);
1623 de = de2; 1589 de = de2;
1624 } 1590 }
@@ -1638,8 +1604,7 @@ journal_error:
1638 brelse(bh2); 1604 brelse(bh2);
1639 *bh = NULL; 1605 *bh = NULL;
1640 ext4_std_error(dir->i_sb, err); 1606 ext4_std_error(dir->i_sb, err);
1641 *error = err; 1607 return ERR_PTR(err);
1642 return NULL;
1643} 1608}
1644 1609
1645int ext4_find_dest_de(struct inode *dir, struct inode *inode, 1610int ext4_find_dest_de(struct inode *dir, struct inode *inode,
@@ -1718,8 +1683,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1718 int csum_size = 0; 1683 int csum_size = 0;
1719 int err; 1684 int err;
1720 1685
1721 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 1686 if (ext4_has_metadata_csum(inode->i_sb))
1722 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1723 csum_size = sizeof(struct ext4_dir_entry_tail); 1687 csum_size = sizeof(struct ext4_dir_entry_tail);
1724 1688
1725 if (!de) { 1689 if (!de) {
@@ -1786,8 +1750,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1786 struct fake_dirent *fde; 1750 struct fake_dirent *fde;
1787 int csum_size = 0; 1751 int csum_size = 0;
1788 1752
1789 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 1753 if (ext4_has_metadata_csum(inode->i_sb))
1790 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1791 csum_size = sizeof(struct ext4_dir_entry_tail); 1754 csum_size = sizeof(struct ext4_dir_entry_tail);
1792 1755
1793 blocksize = dir->i_sb->s_blocksize; 1756 blocksize = dir->i_sb->s_blocksize;
@@ -1853,31 +1816,39 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1853 hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; 1816 hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
1854 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 1817 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
1855 ext4fs_dirhash(name, namelen, &hinfo); 1818 ext4fs_dirhash(name, namelen, &hinfo);
1819 memset(frames, 0, sizeof(frames));
1856 frame = frames; 1820 frame = frames;
1857 frame->entries = entries; 1821 frame->entries = entries;
1858 frame->at = entries; 1822 frame->at = entries;
1859 frame->bh = bh; 1823 frame->bh = bh;
1860 bh = bh2; 1824 bh = bh2;
1861 1825
1862 ext4_handle_dirty_dx_node(handle, dir, frame->bh); 1826 retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
1863 ext4_handle_dirty_dirent_node(handle, dir, bh); 1827 if (retval)
1828 goto out_frames;
1829 retval = ext4_handle_dirty_dirent_node(handle, dir, bh);
1830 if (retval)
1831 goto out_frames;
1864 1832
1865 de = do_split(handle,dir, &bh, frame, &hinfo, &retval); 1833 de = do_split(handle,dir, &bh, frame, &hinfo);
1866 if (!de) { 1834 if (IS_ERR(de)) {
1867 /* 1835 retval = PTR_ERR(de);
1868 * Even if the block split failed, we have to properly write 1836 goto out_frames;
1869 * out all the changes we did so far. Otherwise we can end up
1870 * with corrupted filesystem.
1871 */
1872 ext4_mark_inode_dirty(handle, dir);
1873 dx_release(frames);
1874 return retval;
1875 } 1837 }
1876 dx_release(frames); 1838 dx_release(frames);
1877 1839
1878 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1840 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1879 brelse(bh); 1841 brelse(bh);
1880 return retval; 1842 return retval;
1843out_frames:
1844 /*
1845 * Even if the block split failed, we have to properly write
1846 * out all the changes we did so far. Otherwise we can end up
1847 * with corrupted filesystem.
1848 */
1849 ext4_mark_inode_dirty(handle, dir);
1850 dx_release(frames);
1851 return retval;
1881} 1852}
1882 1853
1883/* 1854/*
@@ -1904,8 +1875,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1904 ext4_lblk_t block, blocks; 1875 ext4_lblk_t block, blocks;
1905 int csum_size = 0; 1876 int csum_size = 0;
1906 1877
1907 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 1878 if (ext4_has_metadata_csum(inode->i_sb))
1908 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1909 csum_size = sizeof(struct ext4_dir_entry_tail); 1879 csum_size = sizeof(struct ext4_dir_entry_tail);
1910 1880
1911 sb = dir->i_sb; 1881 sb = dir->i_sb;
@@ -1982,9 +1952,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1982 struct ext4_dir_entry_2 *de; 1952 struct ext4_dir_entry_2 *de;
1983 int err; 1953 int err;
1984 1954
1985 frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); 1955 frame = dx_probe(&dentry->d_name, dir, &hinfo, frames);
1986 if (!frame) 1956 if (IS_ERR(frame))
1987 return err; 1957 return PTR_ERR(frame);
1988 entries = frame->entries; 1958 entries = frame->entries;
1989 at = frame->at; 1959 at = frame->at;
1990 bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT); 1960 bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT);
@@ -2095,9 +2065,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
2095 goto cleanup; 2065 goto cleanup;
2096 } 2066 }
2097 } 2067 }
2098 de = do_split(handle, dir, &bh, frame, &hinfo, &err); 2068 de = do_split(handle, dir, &bh, frame, &hinfo);
2099 if (!de) 2069 if (IS_ERR(de)) {
2070 err = PTR_ERR(de);
2100 goto cleanup; 2071 goto cleanup;
2072 }
2101 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 2073 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
2102 goto cleanup; 2074 goto cleanup;
2103 2075
@@ -2167,8 +2139,7 @@ static int ext4_delete_entry(handle_t *handle,
2167 return err; 2139 return err;
2168 } 2140 }
2169 2141
2170 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 2142 if (ext4_has_metadata_csum(dir->i_sb))
2171 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2172 csum_size = sizeof(struct ext4_dir_entry_tail); 2143 csum_size = sizeof(struct ext4_dir_entry_tail);
2173 2144
2174 BUFFER_TRACE(bh, "get_write_access"); 2145 BUFFER_TRACE(bh, "get_write_access");
@@ -2387,8 +2358,7 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
2387 int csum_size = 0; 2358 int csum_size = 0;
2388 int err; 2359 int err;
2389 2360
2390 if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, 2361 if (ext4_has_metadata_csum(dir->i_sb))
2391 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
2392 csum_size = sizeof(struct ext4_dir_entry_tail); 2362 csum_size = sizeof(struct ext4_dir_entry_tail);
2393 2363
2394 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { 2364 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
@@ -2403,10 +2373,6 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
2403 dir_block = ext4_append(handle, inode, &block); 2373 dir_block = ext4_append(handle, inode, &block);
2404 if (IS_ERR(dir_block)) 2374 if (IS_ERR(dir_block))
2405 return PTR_ERR(dir_block); 2375 return PTR_ERR(dir_block);
2406 BUFFER_TRACE(dir_block, "get_write_access");
2407 err = ext4_journal_get_write_access(handle, dir_block);
2408 if (err)
2409 goto out;
2410 de = (struct ext4_dir_entry_2 *)dir_block->b_data; 2376 de = (struct ext4_dir_entry_2 *)dir_block->b_data;
2411 ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); 2377 ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
2412 set_nlink(inode, 2); 2378 set_nlink(inode, 2);
@@ -2573,7 +2539,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
2573 int err = 0, rc; 2539 int err = 0, rc;
2574 bool dirty = false; 2540 bool dirty = false;
2575 2541
2576 if (!sbi->s_journal) 2542 if (!sbi->s_journal || is_bad_inode(inode))
2577 return 0; 2543 return 0;
2578 2544
2579 WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && 2545 WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
@@ -3190,6 +3156,39 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
3190 } 3156 }
3191} 3157}
3192 3158
3159static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent,
3160 int credits, handle_t **h)
3161{
3162 struct inode *wh;
3163 handle_t *handle;
3164 int retries = 0;
3165
3166 /*
3167 * for inode block, sb block, group summaries,
3168 * and inode bitmap
3169 */
3170 credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) +
3171 EXT4_XATTR_TRANS_BLOCKS + 4);
3172retry:
3173 wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE,
3174 &ent->dentry->d_name, 0, NULL,
3175 EXT4_HT_DIR, credits);
3176
3177 handle = ext4_journal_current_handle();
3178 if (IS_ERR(wh)) {
3179 if (handle)
3180 ext4_journal_stop(handle);
3181 if (PTR_ERR(wh) == -ENOSPC &&
3182 ext4_should_retry_alloc(ent->dir->i_sb, &retries))
3183 goto retry;
3184 } else {
3185 *h = handle;
3186 init_special_inode(wh, wh->i_mode, WHITEOUT_DEV);
3187 wh->i_op = &ext4_special_inode_operations;
3188 }
3189 return wh;
3190}
3191
3193/* 3192/*
3194 * Anybody can rename anything with this: the permission checks are left to the 3193 * Anybody can rename anything with this: the permission checks are left to the
3195 * higher-level routines. 3194 * higher-level routines.
@@ -3199,7 +3198,8 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
3199 * This comes from rename(const char *oldpath, const char *newpath) 3198 * This comes from rename(const char *oldpath, const char *newpath)
3200 */ 3199 */
3201static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, 3200static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3202 struct inode *new_dir, struct dentry *new_dentry) 3201 struct inode *new_dir, struct dentry *new_dentry,
3202 unsigned int flags)
3203{ 3203{
3204 handle_t *handle = NULL; 3204 handle_t *handle = NULL;
3205 struct ext4_renament old = { 3205 struct ext4_renament old = {
@@ -3214,6 +3214,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3214 }; 3214 };
3215 int force_reread; 3215 int force_reread;
3216 int retval; 3216 int retval;
3217 struct inode *whiteout = NULL;
3218 int credits;
3219 u8 old_file_type;
3217 3220
3218 dquot_initialize(old.dir); 3221 dquot_initialize(old.dir);
3219 dquot_initialize(new.dir); 3222 dquot_initialize(new.dir);
@@ -3252,11 +3255,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3252 if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) 3255 if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
3253 ext4_alloc_da_blocks(old.inode); 3256 ext4_alloc_da_blocks(old.inode);
3254 3257
3255 handle = ext4_journal_start(old.dir, EXT4_HT_DIR, 3258 credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
3256 (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + 3259 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
3257 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); 3260 if (!(flags & RENAME_WHITEOUT)) {
3258 if (IS_ERR(handle)) 3261 handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
3259 return PTR_ERR(handle); 3262 if (IS_ERR(handle))
3263 return PTR_ERR(handle);
3264 } else {
3265 whiteout = ext4_whiteout_for_rename(&old, credits, &handle);
3266 if (IS_ERR(whiteout))
3267 return PTR_ERR(whiteout);
3268 }
3260 3269
3261 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) 3270 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
3262 ext4_handle_sync(handle); 3271 ext4_handle_sync(handle);
@@ -3284,13 +3293,26 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3284 */ 3293 */
3285 force_reread = (new.dir->i_ino == old.dir->i_ino && 3294 force_reread = (new.dir->i_ino == old.dir->i_ino &&
3286 ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); 3295 ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
3296
3297 old_file_type = old.de->file_type;
3298 if (whiteout) {
3299 /*
3300 * Do this before adding a new entry, so the old entry is sure
3301 * to be still pointing to the valid old entry.
3302 */
3303 retval = ext4_setent(handle, &old, whiteout->i_ino,
3304 EXT4_FT_CHRDEV);
3305 if (retval)
3306 goto end_rename;
3307 ext4_mark_inode_dirty(handle, whiteout);
3308 }
3287 if (!new.bh) { 3309 if (!new.bh) {
3288 retval = ext4_add_entry(handle, new.dentry, old.inode); 3310 retval = ext4_add_entry(handle, new.dentry, old.inode);
3289 if (retval) 3311 if (retval)
3290 goto end_rename; 3312 goto end_rename;
3291 } else { 3313 } else {
3292 retval = ext4_setent(handle, &new, 3314 retval = ext4_setent(handle, &new,
3293 old.inode->i_ino, old.de->file_type); 3315 old.inode->i_ino, old_file_type);
3294 if (retval) 3316 if (retval)
3295 goto end_rename; 3317 goto end_rename;
3296 } 3318 }
@@ -3305,10 +3327,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3305 old.inode->i_ctime = ext4_current_time(old.inode); 3327 old.inode->i_ctime = ext4_current_time(old.inode);
3306 ext4_mark_inode_dirty(handle, old.inode); 3328 ext4_mark_inode_dirty(handle, old.inode);
3307 3329
3308 /* 3330 if (!whiteout) {
3309 * ok, that's it 3331 /*
3310 */ 3332 * ok, that's it
3311 ext4_rename_delete(handle, &old, force_reread); 3333 */
3334 ext4_rename_delete(handle, &old, force_reread);
3335 }
3312 3336
3313 if (new.inode) { 3337 if (new.inode) {
3314 ext4_dec_count(handle, new.inode); 3338 ext4_dec_count(handle, new.inode);
@@ -3344,6 +3368,12 @@ end_rename:
3344 brelse(old.dir_bh); 3368 brelse(old.dir_bh);
3345 brelse(old.bh); 3369 brelse(old.bh);
3346 brelse(new.bh); 3370 brelse(new.bh);
3371 if (whiteout) {
3372 if (retval)
3373 drop_nlink(whiteout);
3374 unlock_new_inode(whiteout);
3375 iput(whiteout);
3376 }
3347 if (handle) 3377 if (handle)
3348 ext4_journal_stop(handle); 3378 ext4_journal_stop(handle);
3349 return retval; 3379 return retval;
@@ -3476,18 +3506,15 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
3476 struct inode *new_dir, struct dentry *new_dentry, 3506 struct inode *new_dir, struct dentry *new_dentry,
3477 unsigned int flags) 3507 unsigned int flags)
3478{ 3508{
3479 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) 3509 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
3480 return -EINVAL; 3510 return -EINVAL;
3481 3511
3482 if (flags & RENAME_EXCHANGE) { 3512 if (flags & RENAME_EXCHANGE) {
3483 return ext4_cross_rename(old_dir, old_dentry, 3513 return ext4_cross_rename(old_dir, old_dentry,
3484 new_dir, new_dentry); 3514 new_dir, new_dentry);
3485 } 3515 }
3486 /* 3516
3487 * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE" 3517 return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
3488 * is equivalent to regular rename.
3489 */
3490 return ext4_rename(old_dir, old_dentry, new_dir, new_dentry);
3491} 3518}
3492 3519
3493/* 3520/*
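
The namei.c hunks above all follow one conversion: functions such as dx_probe(), do_split() and ext4_bread() stop returning NULL plus a separate "int *err" out-parameter and instead encode the errno in the returned pointer, with callers testing IS_ERR()/PTR_ERR(). A minimal userspace sketch of that idiom follows; in the kernel the helpers live in <linux/err.h>, and probe() here is a hypothetical stand-in, not ext4 code.

/* Userspace sketch of the ERR_PTR error-return convention. */
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* errno values occupy the last page of the address space */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* One return channel: a valid pointer or an encoded errno, never a
 * NULL result paired with a separate "int *err" out-parameter. */
static int *probe(int fail)
{
	int *frame;

	if (fail)
		return ERR_PTR(-ENOENT);
	frame = malloc(sizeof(*frame));
	if (!frame)
		return ERR_PTR(-ENOMEM);
	*frame = 42;
	return frame;
}

int main(void)
{
	int *frame = probe(1);

	if (IS_ERR(frame)) {
		printf("probe failed: %ld\n", PTR_ERR(frame));
		return 1;
	}
	printf("frame: %d\n", *frame);
	free(frame);
	return 0;
}
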
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 1e43b905ff98..ca4588388fc3 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1081,7 +1081,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
1081 break; 1081 break;
1082 1082
1083 if (meta_bg == 0) 1083 if (meta_bg == 0)
1084 backup_block = group * bpg + blk_off; 1084 backup_block = ((ext4_fsblk_t)group) * bpg + blk_off;
1085 else 1085 else
1086 backup_block = (ext4_group_first_block_no(sb, group) + 1086 backup_block = (ext4_group_first_block_no(sb, group) +
1087 ext4_bg_has_super(sb, group)); 1087 ext4_bg_has_super(sb, group));
@@ -1212,8 +1212,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb,
1212{ 1212{
1213 struct buffer_head *bh; 1213 struct buffer_head *bh;
1214 1214
1215 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 1215 if (!ext4_has_metadata_csum(sb))
1216 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
1217 return 0; 1216 return 0;
1218 1217
1219 bh = ext4_get_bitmap(sb, group_data->inode_bitmap); 1218 bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
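
The update_backups() hunk above widens one operand before the multiply. A small sketch of why, with hypothetical group and blocks-per-group values: a u32 * u32 product wraps modulo 2^32 before it is assigned to the 64-bit ext4_fsblk_t, so the cast must come first.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

typedef uint64_t ext4_fsblk_t;	/* 64-bit filesystem block number */

int main(void)
{
	uint32_t group = 200000;	/* hypothetical block group number */
	uint32_t bpg = 32768;		/* blocks per group */
	uint32_t blk_off = 1;

	ext4_fsblk_t wrong = group * bpg + blk_off;		    /* wraps mod 2^32 */
	ext4_fsblk_t right = (ext4_fsblk_t)group * bpg + blk_off;  /* widened first */

	printf("wrong: %" PRIu64 "\nright: %" PRIu64 "\n", wrong, right);
	return 0;
}
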
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 05c159218bc2..2c9e6864abd9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -70,7 +70,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
70static void ext4_clear_journal_err(struct super_block *sb, 70static void ext4_clear_journal_err(struct super_block *sb,
71 struct ext4_super_block *es); 71 struct ext4_super_block *es);
72static int ext4_sync_fs(struct super_block *sb, int wait); 72static int ext4_sync_fs(struct super_block *sb, int wait);
73static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
74static int ext4_remount(struct super_block *sb, int *flags, char *data); 73static int ext4_remount(struct super_block *sb, int *flags, char *data);
75static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); 74static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
76static int ext4_unfreeze(struct super_block *sb); 75static int ext4_unfreeze(struct super_block *sb);
@@ -141,8 +140,7 @@ static __le32 ext4_superblock_csum(struct super_block *sb,
141static int ext4_superblock_csum_verify(struct super_block *sb, 140static int ext4_superblock_csum_verify(struct super_block *sb,
142 struct ext4_super_block *es) 141 struct ext4_super_block *es)
143{ 142{
144 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 143 if (!ext4_has_metadata_csum(sb))
145 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
146 return 1; 144 return 1;
147 145
148 return es->s_checksum == ext4_superblock_csum(sb, es); 146 return es->s_checksum == ext4_superblock_csum(sb, es);
@@ -152,8 +150,7 @@ void ext4_superblock_csum_set(struct super_block *sb)
152{ 150{
153 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 151 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
154 152
155 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, 153 if (!ext4_has_metadata_csum(sb))
156 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
157 return; 154 return;
158 155
159 es->s_checksum = ext4_superblock_csum(sb, es); 156 es->s_checksum = ext4_superblock_csum(sb, es);
@@ -820,10 +817,9 @@ static void ext4_put_super(struct super_block *sb)
820 percpu_counter_destroy(&sbi->s_freeinodes_counter); 817 percpu_counter_destroy(&sbi->s_freeinodes_counter);
821 percpu_counter_destroy(&sbi->s_dirs_counter); 818 percpu_counter_destroy(&sbi->s_dirs_counter);
822 percpu_counter_destroy(&sbi->s_dirtyclusters_counter); 819 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
823 percpu_counter_destroy(&sbi->s_extent_cache_cnt);
824 brelse(sbi->s_sbh); 820 brelse(sbi->s_sbh);
825#ifdef CONFIG_QUOTA 821#ifdef CONFIG_QUOTA
826 for (i = 0; i < MAXQUOTAS; i++) 822 for (i = 0; i < EXT4_MAXQUOTAS; i++)
827 kfree(sbi->s_qf_names[i]); 823 kfree(sbi->s_qf_names[i]);
828#endif 824#endif
829 825
@@ -885,6 +881,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
885 ext4_es_init_tree(&ei->i_es_tree); 881 ext4_es_init_tree(&ei->i_es_tree);
886 rwlock_init(&ei->i_es_lock); 882 rwlock_init(&ei->i_es_lock);
887 INIT_LIST_HEAD(&ei->i_es_lru); 883 INIT_LIST_HEAD(&ei->i_es_lru);
884 ei->i_es_all_nr = 0;
888 ei->i_es_lru_nr = 0; 885 ei->i_es_lru_nr = 0;
889 ei->i_touch_when = 0; 886 ei->i_touch_when = 0;
890 ei->i_reserved_data_blocks = 0; 887 ei->i_reserved_data_blocks = 0;
@@ -1002,7 +999,7 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
1002 * Currently we don't know the generation for parent directory, so 999 * Currently we don't know the generation for parent directory, so
1003 * a generation of 0 means "accept any" 1000 * a generation of 0 means "accept any"
1004 */ 1001 */
1005 inode = ext4_iget(sb, ino); 1002 inode = ext4_iget_normal(sb, ino);
1006 if (IS_ERR(inode)) 1003 if (IS_ERR(inode))
1007 return ERR_CAST(inode); 1004 return ERR_CAST(inode);
1008 if (generation && inode->i_generation != generation) { 1005 if (generation && inode->i_generation != generation) {
@@ -1124,25 +1121,6 @@ static const struct super_operations ext4_sops = {
1124 .bdev_try_to_free_page = bdev_try_to_free_page, 1121 .bdev_try_to_free_page = bdev_try_to_free_page,
1125}; 1122};
1126 1123
1127static const struct super_operations ext4_nojournal_sops = {
1128 .alloc_inode = ext4_alloc_inode,
1129 .destroy_inode = ext4_destroy_inode,
1130 .write_inode = ext4_write_inode,
1131 .dirty_inode = ext4_dirty_inode,
1132 .drop_inode = ext4_drop_inode,
1133 .evict_inode = ext4_evict_inode,
1134 .sync_fs = ext4_sync_fs_nojournal,
1135 .put_super = ext4_put_super,
1136 .statfs = ext4_statfs,
1137 .remount_fs = ext4_remount,
1138 .show_options = ext4_show_options,
1139#ifdef CONFIG_QUOTA
1140 .quota_read = ext4_quota_read,
1141 .quota_write = ext4_quota_write,
1142#endif
1143 .bdev_try_to_free_page = bdev_try_to_free_page,
1144};
1145
1146static const struct export_operations ext4_export_ops = { 1124static const struct export_operations ext4_export_ops = {
1147 .fh_to_dentry = ext4_fh_to_dentry, 1125 .fh_to_dentry = ext4_fh_to_dentry,
1148 .fh_to_parent = ext4_fh_to_parent, 1126 .fh_to_parent = ext4_fh_to_parent,
@@ -1712,13 +1690,6 @@ static int parse_options(char *options, struct super_block *sb,
1712 "not specified"); 1690 "not specified");
1713 return 0; 1691 return 0;
1714 } 1692 }
1715 } else {
1716 if (sbi->s_jquota_fmt) {
1717 ext4_msg(sb, KERN_ERR, "journaled quota format "
1718 "specified with no journaling "
1719 "enabled");
1720 return 0;
1721 }
1722 } 1693 }
1723#endif 1694#endif
1724 if (test_opt(sb, DIOREAD_NOLOCK)) { 1695 if (test_opt(sb, DIOREAD_NOLOCK)) {
@@ -2016,8 +1987,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
2016 __u16 crc = 0; 1987 __u16 crc = 0;
2017 __le32 le_group = cpu_to_le32(block_group); 1988 __le32 le_group = cpu_to_le32(block_group);
2018 1989
2019 if ((sbi->s_es->s_feature_ro_compat & 1990 if (ext4_has_metadata_csum(sbi->s_sb)) {
2020 cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
2021 /* Use new metadata_csum algorithm */ 1991 /* Use new metadata_csum algorithm */
2022 __le16 save_csum; 1992 __le16 save_csum;
2023 __u32 csum32; 1993 __u32 csum32;
@@ -2035,6 +2005,10 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
2035 } 2005 }
2036 2006
2037 /* old crc16 code */ 2007 /* old crc16 code */
2008 if (!(sbi->s_es->s_feature_ro_compat &
2009 cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
2010 return 0;
2011
2038 offset = offsetof(struct ext4_group_desc, bg_checksum); 2012 offset = offsetof(struct ext4_group_desc, bg_checksum);
2039 2013
2040 crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); 2014 crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
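
A sketch of the dispatch this hunk tightens: the new early return makes the checksum 0 when neither feature is enabled, instead of falling through to the old crc16 path unconditionally. The crc helpers below are stubs, not the kernel's crc32c/crc16.

#include <stdio.h>
#include <stdint.h>

#define FEAT_METADATA_CSUM (1u << 0)
#define FEAT_GDT_CSUM      (1u << 1)

/* stand-ins for the kernel's checksum helpers */
static uint16_t crc32c_stub(uint32_t group) { return (uint16_t)(group * 2654435761u >> 16); }
static uint16_t crc16_stub(uint32_t group)  { return (uint16_t)(group * 40503u); }

static uint16_t group_desc_csum(uint32_t features, uint32_t group)
{
	if (features & FEAT_METADATA_CSUM)
		return crc32c_stub(group);	/* new metadata_csum algorithm */
	if (!(features & FEAT_GDT_CSUM))
		return 0;			/* neither feature: no checksum */
	return crc16_stub(group);		/* old crc16 code */
}

int main(void)
{
	printf("metadata_csum: %#x\n", group_desc_csum(FEAT_METADATA_CSUM, 5));
	printf("gdt_csum:      %#x\n", group_desc_csum(FEAT_GDT_CSUM, 5));
	printf("neither:       %#x\n", group_desc_csum(0, 5));
	return 0;
}
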
@@ -2191,7 +2165,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2191 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { 2165 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2192 /* don't clear list on RO mount w/ errors */ 2166 /* don't clear list on RO mount w/ errors */
2193 if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { 2167 if (es->s_last_orphan && !(s_flags & MS_RDONLY)) {
2194 jbd_debug(1, "Errors on filesystem, " 2168 ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
2195 "clearing orphan list.\n"); 2169 "clearing orphan list.\n");
2196 es->s_last_orphan = 0; 2170 es->s_last_orphan = 0;
2197 } 2171 }
@@ -2207,7 +2181,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2207 /* Needed for iput() to work correctly and not trash data */ 2181 /* Needed for iput() to work correctly and not trash data */
2208 sb->s_flags |= MS_ACTIVE; 2182 sb->s_flags |= MS_ACTIVE;
2209 /* Turn on quotas so that they are updated correctly */ 2183 /* Turn on quotas so that they are updated correctly */
2210 for (i = 0; i < MAXQUOTAS; i++) { 2184 for (i = 0; i < EXT4_MAXQUOTAS; i++) {
2211 if (EXT4_SB(sb)->s_qf_names[i]) { 2185 if (EXT4_SB(sb)->s_qf_names[i]) {
2212 int ret = ext4_quota_on_mount(sb, i); 2186 int ret = ext4_quota_on_mount(sb, i);
2213 if (ret < 0) 2187 if (ret < 0)
@@ -2263,7 +2237,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2263 PLURAL(nr_truncates)); 2237 PLURAL(nr_truncates));
2264#ifdef CONFIG_QUOTA 2238#ifdef CONFIG_QUOTA
2265 /* Turn quotas off */ 2239 /* Turn quotas off */
2266 for (i = 0; i < MAXQUOTAS; i++) { 2240 for (i = 0; i < EXT4_MAXQUOTAS; i++) {
2267 if (sb_dqopt(sb)->files[i]) 2241 if (sb_dqopt(sb)->files[i])
2268 dquot_quota_off(sb, i); 2242 dquot_quota_off(sb, i);
2269 } 2243 }
@@ -2548,6 +2522,16 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
2548 return count; 2522 return count;
2549} 2523}
2550 2524
2525static ssize_t es_ui_show(struct ext4_attr *a,
2526 struct ext4_sb_info *sbi, char *buf)
2527{
2528
2529 unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) +
2530 a->u.offset);
2531
2532 return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2533}
2534
2551static ssize_t reserved_clusters_show(struct ext4_attr *a, 2535static ssize_t reserved_clusters_show(struct ext4_attr *a,
2552 struct ext4_sb_info *sbi, char *buf) 2536 struct ext4_sb_info *sbi, char *buf)
2553{ 2537{
@@ -2601,14 +2585,29 @@ static struct ext4_attr ext4_attr_##_name = { \
2601 .offset = offsetof(struct ext4_sb_info, _elname),\ 2585 .offset = offsetof(struct ext4_sb_info, _elname),\
2602 }, \ 2586 }, \
2603} 2587}
2588
2589#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \
2590static struct ext4_attr ext4_attr_##_name = { \
2591 .attr = {.name = __stringify(_name), .mode = _mode }, \
2592 .show = _show, \
2593 .store = _store, \
2594 .u = { \
2595 .offset = offsetof(struct ext4_super_block, _elname), \
2596 }, \
2597}
2598
2604#define EXT4_ATTR(name, mode, show, store) \ 2599#define EXT4_ATTR(name, mode, show, store) \
2605static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) 2600static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2606 2601
2607#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) 2602#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
2608#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) 2603#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
2609#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) 2604#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
2605
2606#define EXT4_RO_ATTR_ES_UI(name, elname) \
2607 EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname)
2610#define EXT4_RW_ATTR_SBI_UI(name, elname) \ 2608#define EXT4_RW_ATTR_SBI_UI(name, elname) \
2611 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) 2609 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
2610
2612#define ATTR_LIST(name) &ext4_attr_##name.attr 2611#define ATTR_LIST(name) &ext4_attr_##name.attr
2613#define EXT4_DEPRECATED_ATTR(_name, _val) \ 2612#define EXT4_DEPRECATED_ATTR(_name, _val) \
2614static struct ext4_attr ext4_attr_##_name = { \ 2613static struct ext4_attr ext4_attr_##_name = { \
@@ -2641,6 +2640,9 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int
2641EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); 2640EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
2642EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); 2641EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
2643EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); 2642EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
2643EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
2644EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
2645EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
2644 2646
2645static struct attribute *ext4_attrs[] = { 2647static struct attribute *ext4_attrs[] = {
2646 ATTR_LIST(delayed_allocation_blocks), 2648 ATTR_LIST(delayed_allocation_blocks),
@@ -2664,6 +2666,9 @@ static struct attribute *ext4_attrs[] = {
2664 ATTR_LIST(warning_ratelimit_burst), 2666 ATTR_LIST(warning_ratelimit_burst),
2665 ATTR_LIST(msg_ratelimit_interval_ms), 2667 ATTR_LIST(msg_ratelimit_interval_ms),
2666 ATTR_LIST(msg_ratelimit_burst), 2668 ATTR_LIST(msg_ratelimit_burst),
2669 ATTR_LIST(errors_count),
2670 ATTR_LIST(first_error_time),
2671 ATTR_LIST(last_error_time),
2667 NULL, 2672 NULL,
2668}; 2673};
2669 2674
@@ -2723,9 +2728,25 @@ static void ext4_feat_release(struct kobject *kobj)
2723 complete(&ext4_feat->f_kobj_unregister); 2728 complete(&ext4_feat->f_kobj_unregister);
2724} 2729}
2725 2730
2731static ssize_t ext4_feat_show(struct kobject *kobj,
2732 struct attribute *attr, char *buf)
2733{
2734 return snprintf(buf, PAGE_SIZE, "supported\n");
2735}
2736
2737/*
2738 * We can not use ext4_attr_show/store because it relies on the kobject
2739 * being embedded in the ext4_sb_info structure which is definitely not
2740 * true in this case.
2741 */
2742static const struct sysfs_ops ext4_feat_ops = {
2743 .show = ext4_feat_show,
2744 .store = NULL,
2745};
2746
2726static struct kobj_type ext4_feat_ktype = { 2747static struct kobj_type ext4_feat_ktype = {
2727 .default_attrs = ext4_feat_attrs, 2748 .default_attrs = ext4_feat_attrs,
2728 .sysfs_ops = &ext4_attr_ops, 2749 .sysfs_ops = &ext4_feat_ops,
2729 .release = ext4_feat_release, 2750 .release = ext4_feat_release,
2730}; 2751};
2731 2752
@@ -3179,8 +3200,7 @@ static int set_journal_csum_feature_set(struct super_block *sb)
3179 int compat, incompat; 3200 int compat, incompat;
3180 struct ext4_sb_info *sbi = EXT4_SB(sb); 3201 struct ext4_sb_info *sbi = EXT4_SB(sb);
3181 3202
3182 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 3203 if (ext4_has_metadata_csum(sb)) {
3183 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
3184 /* journal checksum v3 */ 3204 /* journal checksum v3 */
3185 compat = 0; 3205 compat = 0;
3186 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; 3206 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
@@ -3190,6 +3210,10 @@ static int set_journal_csum_feature_set(struct super_block *sb)
3190 incompat = 0; 3210 incompat = 0;
3191 } 3211 }
3192 3212
3213 jbd2_journal_clear_features(sbi->s_journal,
3214 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
3215 JBD2_FEATURE_INCOMPAT_CSUM_V3 |
3216 JBD2_FEATURE_INCOMPAT_CSUM_V2);
3193 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 3217 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
3194 ret = jbd2_journal_set_features(sbi->s_journal, 3218 ret = jbd2_journal_set_features(sbi->s_journal,
3195 compat, 0, 3219 compat, 0,
@@ -3202,11 +3226,8 @@ static int set_journal_csum_feature_set(struct super_block *sb)
3202 jbd2_journal_clear_features(sbi->s_journal, 0, 0, 3226 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3203 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 3227 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3204 } else { 3228 } else {
3205 jbd2_journal_clear_features(sbi->s_journal, 3229 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3206 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 3230 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3207 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
3208 JBD2_FEATURE_INCOMPAT_CSUM_V3 |
3209 JBD2_FEATURE_INCOMPAT_CSUM_V2);
3210 } 3231 }
3211 3232
3212 return ret; 3233 return ret;
@@ -3436,7 +3457,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3436 logical_sb_block = sb_block; 3457 logical_sb_block = sb_block;
3437 } 3458 }
3438 3459
3439 if (!(bh = sb_bread(sb, logical_sb_block))) { 3460 if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) {
3440 ext4_msg(sb, KERN_ERR, "unable to read superblock"); 3461 ext4_msg(sb, KERN_ERR, "unable to read superblock");
3441 goto out_fail; 3462 goto out_fail;
3442 } 3463 }
@@ -3487,8 +3508,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3487 } 3508 }
3488 3509
3489 /* Precompute checksum seed for all metadata */ 3510 /* Precompute checksum seed for all metadata */
3490 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 3511 if (ext4_has_metadata_csum(sb))
3491 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
3492 sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, 3512 sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
3493 sizeof(es->s_uuid)); 3513 sizeof(es->s_uuid));
3494 3514
@@ -3506,6 +3526,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3506#ifdef CONFIG_EXT4_FS_POSIX_ACL 3526#ifdef CONFIG_EXT4_FS_POSIX_ACL
3507 set_opt(sb, POSIX_ACL); 3527 set_opt(sb, POSIX_ACL);
3508#endif 3528#endif
3529 /* don't forget to enable journal_csum when metadata_csum is enabled. */
3530 if (ext4_has_metadata_csum(sb))
3531 set_opt(sb, JOURNAL_CHECKSUM);
3532
3509 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 3533 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
3510 set_opt(sb, JOURNAL_DATA); 3534 set_opt(sb, JOURNAL_DATA);
3511 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 3535 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
@@ -3519,8 +3543,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3519 set_opt(sb, ERRORS_CONT); 3543 set_opt(sb, ERRORS_CONT);
3520 else 3544 else
3521 set_opt(sb, ERRORS_RO); 3545 set_opt(sb, ERRORS_RO);
3522 if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) 3546 /* block_validity enabled by default; disable with noblock_validity */
3523 set_opt(sb, BLOCK_VALIDITY); 3547 set_opt(sb, BLOCK_VALIDITY);
3524 if (def_mount_opts & EXT4_DEFM_DISCARD) 3548 if (def_mount_opts & EXT4_DEFM_DISCARD)
3525 set_opt(sb, DISCARD); 3549 set_opt(sb, DISCARD);
3526 3550
@@ -3646,7 +3670,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3646 brelse(bh); 3670 brelse(bh);
3647 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; 3671 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
3648 offset = do_div(logical_sb_block, blocksize); 3672 offset = do_div(logical_sb_block, blocksize);
3649 bh = sb_bread(sb, logical_sb_block); 3673 bh = sb_bread_unmovable(sb, logical_sb_block);
3650 if (!bh) { 3674 if (!bh) {
3651 ext4_msg(sb, KERN_ERR, 3675 ext4_msg(sb, KERN_ERR,
3652 "Can't read superblock on 2nd try"); 3676 "Can't read superblock on 2nd try");
@@ -3868,7 +3892,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3868 3892
3869 for (i = 0; i < db_count; i++) { 3893 for (i = 0; i < db_count; i++) {
3870 block = descriptor_loc(sb, logical_sb_block, i); 3894 block = descriptor_loc(sb, logical_sb_block, i);
3871 sbi->s_group_desc[i] = sb_bread(sb, block); 3895 sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
3872 if (!sbi->s_group_desc[i]) { 3896 if (!sbi->s_group_desc[i]) {
3873 ext4_msg(sb, KERN_ERR, 3897 ext4_msg(sb, KERN_ERR,
3874 "can't read group descriptor %d", i); 3898 "can't read group descriptor %d", i);
@@ -3890,13 +3914,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3890 sbi->s_err_report.data = (unsigned long) sb; 3914 sbi->s_err_report.data = (unsigned long) sb;
3891 3915
3892 /* Register extent status tree shrinker */ 3916 /* Register extent status tree shrinker */
3893 ext4_es_register_shrinker(sbi); 3917 if (ext4_es_register_shrinker(sbi))
3894
3895 err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
3896 if (err) {
3897 ext4_msg(sb, KERN_ERR, "insufficient memory");
3898 goto failed_mount3; 3918 goto failed_mount3;
3899 }
3900 3919
3901 sbi->s_stripe = ext4_get_stripe_size(sbi); 3920 sbi->s_stripe = ext4_get_stripe_size(sbi);
3902 sbi->s_extent_max_zeroout_kb = 32; 3921 sbi->s_extent_max_zeroout_kb = 32;
@@ -3904,11 +3923,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3904 /* 3923 /*
3905 * set up enough so that it can read an inode 3924 * set up enough so that it can read an inode
3906 */ 3925 */
3907 if (!test_opt(sb, NOLOAD) && 3926 sb->s_op = &ext4_sops;
3908 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
3909 sb->s_op = &ext4_sops;
3910 else
3911 sb->s_op = &ext4_nojournal_sops;
3912 sb->s_export_op = &ext4_export_ops; 3927 sb->s_export_op = &ext4_export_ops;
3913 sb->s_xattr = ext4_xattr_handlers; 3928 sb->s_xattr = ext4_xattr_handlers;
3914#ifdef CONFIG_QUOTA 3929#ifdef CONFIG_QUOTA
@@ -3932,7 +3947,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3932 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && 3947 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
3933 !(sb->s_flags & MS_RDONLY)) 3948 !(sb->s_flags & MS_RDONLY))
3934 if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) 3949 if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
3935 goto failed_mount3; 3950 goto failed_mount3a;
3936 3951
3937 /* 3952 /*
3938 * The first inode we look at is the journal inode. Don't try 3953 * The first inode we look at is the journal inode. Don't try
@@ -3941,7 +3956,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3941 if (!test_opt(sb, NOLOAD) && 3956 if (!test_opt(sb, NOLOAD) &&
3942 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { 3957 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
3943 if (ext4_load_journal(sb, es, journal_devnum)) 3958 if (ext4_load_journal(sb, es, journal_devnum))
3944 goto failed_mount3; 3959 goto failed_mount3a;
3945 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && 3960 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
3946 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 3961 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
3947 ext4_msg(sb, KERN_ERR, "required journal recovery " 3962 ext4_msg(sb, KERN_ERR, "required journal recovery "
@@ -4229,10 +4244,10 @@ failed_mount_wq:
4229 jbd2_journal_destroy(sbi->s_journal); 4244 jbd2_journal_destroy(sbi->s_journal);
4230 sbi->s_journal = NULL; 4245 sbi->s_journal = NULL;
4231 } 4246 }
4232failed_mount3: 4247failed_mount3a:
4233 ext4_es_unregister_shrinker(sbi); 4248 ext4_es_unregister_shrinker(sbi);
4249failed_mount3:
4234 del_timer_sync(&sbi->s_err_report); 4250 del_timer_sync(&sbi->s_err_report);
4235 percpu_counter_destroy(&sbi->s_extent_cache_cnt);
4236 if (sbi->s_mmp_tsk) 4251 if (sbi->s_mmp_tsk)
4237 kthread_stop(sbi->s_mmp_tsk); 4252 kthread_stop(sbi->s_mmp_tsk);
4238failed_mount2: 4253failed_mount2:
@@ -4247,7 +4262,7 @@ failed_mount:
4247 remove_proc_entry(sb->s_id, ext4_proc_root); 4262 remove_proc_entry(sb->s_id, ext4_proc_root);
4248 } 4263 }
4249#ifdef CONFIG_QUOTA 4264#ifdef CONFIG_QUOTA
4250 for (i = 0; i < MAXQUOTAS; i++) 4265 for (i = 0; i < EXT4_MAXQUOTAS; i++)
4251 kfree(sbi->s_qf_names[i]); 4266 kfree(sbi->s_qf_names[i]);
4252#endif 4267#endif
4253 ext4_blkdev_remove(sbi); 4268 ext4_blkdev_remove(sbi);
@@ -4375,6 +4390,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
4375 goto out_bdev; 4390 goto out_bdev;
4376 } 4391 }
4377 4392
4393 if ((le32_to_cpu(es->s_feature_ro_compat) &
4394 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
4395 es->s_checksum != ext4_superblock_csum(sb, es)) {
4396 ext4_msg(sb, KERN_ERR, "external journal has "
4397 "corrupt superblock");
4398 brelse(bh);
4399 goto out_bdev;
4400 }
4401
4378 if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { 4402 if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
4379 ext4_msg(sb, KERN_ERR, "journal UUID does not match"); 4403 ext4_msg(sb, KERN_ERR, "journal UUID does not match");
4380 brelse(bh); 4404 brelse(bh);
@@ -4677,15 +4701,19 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4677 * being sent at the end of the function. But we can skip it if 4701 * being sent at the end of the function. But we can skip it if
4678 * transaction_commit will do it for us. 4702 * transaction_commit will do it for us.
4679 */ 4703 */
4680 target = jbd2_get_latest_transaction(sbi->s_journal); 4704 if (sbi->s_journal) {
4681 if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && 4705 target = jbd2_get_latest_transaction(sbi->s_journal);
4682 !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) 4706 if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
4707 !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
4708 needs_barrier = true;
4709
4710 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
4711 if (wait)
4712 ret = jbd2_log_wait_commit(sbi->s_journal,
4713 target);
4714 }
4715 } else if (wait && test_opt(sb, BARRIER))
4683 needs_barrier = true; 4716 needs_barrier = true;
4684
4685 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
4686 if (wait)
4687 ret = jbd2_log_wait_commit(sbi->s_journal, target);
4688 }
4689 if (needs_barrier) { 4717 if (needs_barrier) {
4690 int err; 4718 int err;
4691 err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); 4719 err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
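
A simplified sketch of the consolidation above: a single sync path that branches on whether a journal exists, which is what lets the separate ext4_nojournal_sops table and ext4_sync_fs_nojournal() be removed later in this file. The types are stand-ins, and the barrier decision is reduced to its shape.

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct journal { int latest_txn; };

struct sbinfo {
	struct journal *s_journal;	/* NULL on nojournal mounts */
	bool barrier_opt;		/* the BARRIER mount option */
};

static int sync_fs(struct sbinfo *sbi, int wait)
{
	bool needs_barrier = false;

	if (sbi->s_journal) {
		/* journal path: commit, and flush only when the commit
		 * itself will not send a barrier (assumed here) */
		printf("commit transaction %d\n", sbi->s_journal->latest_txn);
		needs_barrier = wait;
	} else if (wait && sbi->barrier_opt) {
		needs_barrier = true;
	}

	if (needs_barrier)
		printf("flush block device\n");
	return 0;
}

int main(void)
{
	struct journal j = { .latest_txn = 7 };
	struct sbinfo with_j = { .s_journal = &j, .barrier_opt = true };
	struct sbinfo no_j = { .s_journal = NULL, .barrier_opt = true };

	sync_fs(&with_j, 1);
	sync_fs(&no_j, 1);
	return 0;
}
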
@@ -4696,19 +4724,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4696 return ret; 4724 return ret;
4697} 4725}
4698 4726
4699static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
4700{
4701 int ret = 0;
4702
4703 trace_ext4_sync_fs(sb, wait);
4704 flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4705 dquot_writeback_dquots(sb, -1);
4706 if (wait && test_opt(sb, BARRIER))
4707 ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
4708
4709 return ret;
4710}
4711
4712/* 4727/*
4713 * LVM calls this function before a (read-only) snapshot is created. This 4728 * LVM calls this function before a (read-only) snapshot is created. This
4714 * gives us a chance to flush the journal completely and mark the fs clean. 4729 * gives us a chance to flush the journal completely and mark the fs clean.
@@ -4727,23 +4742,26 @@ static int ext4_freeze(struct super_block *sb)
4727 4742
4728 journal = EXT4_SB(sb)->s_journal; 4743 journal = EXT4_SB(sb)->s_journal;
4729 4744
4730 /* Now we set up the journal barrier. */ 4745 if (journal) {
4731 jbd2_journal_lock_updates(journal); 4746 /* Now we set up the journal barrier. */
4747 jbd2_journal_lock_updates(journal);
4732 4748
4733 /* 4749 /*
4734 * Don't clear the needs_recovery flag if we failed to flush 4750 * Don't clear the needs_recovery flag if we failed to
4735 * the journal. 4751 * flush the journal.
4736 */ 4752 */
4737 error = jbd2_journal_flush(journal); 4753 error = jbd2_journal_flush(journal);
4738 if (error < 0) 4754 if (error < 0)
4739 goto out; 4755 goto out;
4756 }
4740 4757
4741 /* Journal blocked and flushed, clear needs_recovery flag. */ 4758 /* Journal blocked and flushed, clear needs_recovery flag. */
4742 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 4759 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
4743 error = ext4_commit_super(sb, 1); 4760 error = ext4_commit_super(sb, 1);
4744out: 4761out:
4745 /* we rely on upper layer to stop further updates */ 4762 if (journal)
4746 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 4763 /* we rely on upper layer to stop further updates */
4764 jbd2_journal_unlock_updates(journal);
4747 return error; 4765 return error;
4748} 4766}
4749 4767
@@ -4774,7 +4792,7 @@ struct ext4_mount_options {
4774 u32 s_min_batch_time, s_max_batch_time; 4792 u32 s_min_batch_time, s_max_batch_time;
4775#ifdef CONFIG_QUOTA 4793#ifdef CONFIG_QUOTA
4776 int s_jquota_fmt; 4794 int s_jquota_fmt;
4777 char *s_qf_names[MAXQUOTAS]; 4795 char *s_qf_names[EXT4_MAXQUOTAS];
4778#endif 4796#endif
4779}; 4797};
4780 4798
@@ -4804,7 +4822,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4804 old_opts.s_max_batch_time = sbi->s_max_batch_time; 4822 old_opts.s_max_batch_time = sbi->s_max_batch_time;
4805#ifdef CONFIG_QUOTA 4823#ifdef CONFIG_QUOTA
4806 old_opts.s_jquota_fmt = sbi->s_jquota_fmt; 4824 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
4807 for (i = 0; i < MAXQUOTAS; i++) 4825 for (i = 0; i < EXT4_MAXQUOTAS; i++)
4808 if (sbi->s_qf_names[i]) { 4826 if (sbi->s_qf_names[i]) {
4809 old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], 4827 old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
4810 GFP_KERNEL); 4828 GFP_KERNEL);
@@ -4828,6 +4846,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4828 goto restore_opts; 4846 goto restore_opts;
4829 } 4847 }
4830 4848
4849 if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
4850 test_opt(sb, JOURNAL_CHECKSUM)) {
4851 ext4_msg(sb, KERN_ERR, "changing journal_checksum "
4852 "during remount not supported");
4853 err = -EINVAL;
4854 goto restore_opts;
4855 }
4856
4831 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { 4857 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
4832 if (test_opt2(sb, EXPLICIT_DELALLOC)) { 4858 if (test_opt2(sb, EXPLICIT_DELALLOC)) {
4833 ext4_msg(sb, KERN_ERR, "can't mount with " 4859 ext4_msg(sb, KERN_ERR, "can't mount with "
@@ -4965,7 +4991,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4965 4991
4966#ifdef CONFIG_QUOTA 4992#ifdef CONFIG_QUOTA
4967 /* Release old quota file names */ 4993 /* Release old quota file names */
4968 for (i = 0; i < MAXQUOTAS; i++) 4994 for (i = 0; i < EXT4_MAXQUOTAS; i++)
4969 kfree(old_opts.s_qf_names[i]); 4995 kfree(old_opts.s_qf_names[i]);
4970 if (enable_quota) { 4996 if (enable_quota) {
4971 if (sb_any_quota_suspended(sb)) 4997 if (sb_any_quota_suspended(sb))
@@ -4994,7 +5020,7 @@ restore_opts:
4994 sbi->s_max_batch_time = old_opts.s_max_batch_time; 5020 sbi->s_max_batch_time = old_opts.s_max_batch_time;
4995#ifdef CONFIG_QUOTA 5021#ifdef CONFIG_QUOTA
4996 sbi->s_jquota_fmt = old_opts.s_jquota_fmt; 5022 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
4997 for (i = 0; i < MAXQUOTAS; i++) { 5023 for (i = 0; i < EXT4_MAXQUOTAS; i++) {
4998 kfree(sbi->s_qf_names[i]); 5024 kfree(sbi->s_qf_names[i]);
4999 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 5025 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
5000 } 5026 }
@@ -5197,7 +5223,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
5197{ 5223{
5198 int err; 5224 int err;
5199 struct inode *qf_inode; 5225 struct inode *qf_inode;
5200 unsigned long qf_inums[MAXQUOTAS] = { 5226 unsigned long qf_inums[EXT4_MAXQUOTAS] = {
5201 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), 5227 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
5202 le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) 5228 le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
5203 }; 5229 };
@@ -5225,13 +5251,13 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
5225static int ext4_enable_quotas(struct super_block *sb) 5251static int ext4_enable_quotas(struct super_block *sb)
5226{ 5252{
5227 int type, err = 0; 5253 int type, err = 0;
5228 unsigned long qf_inums[MAXQUOTAS] = { 5254 unsigned long qf_inums[EXT4_MAXQUOTAS] = {
5229 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), 5255 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
5230 le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) 5256 le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
5231 }; 5257 };
5232 5258
5233 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; 5259 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
5234 for (type = 0; type < MAXQUOTAS; type++) { 5260 for (type = 0; type < EXT4_MAXQUOTAS; type++) {
5235 if (qf_inums[type]) { 5261 if (qf_inums[type]) {
5236 err = ext4_quota_enable(sb, type, QFMT_VFS_V1, 5262 err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
5237 DQUOT_USAGE_ENABLED); 5263 DQUOT_USAGE_ENABLED);
@@ -5309,7 +5335,6 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
5309{ 5335{
5310 struct inode *inode = sb_dqopt(sb)->files[type]; 5336 struct inode *inode = sb_dqopt(sb)->files[type];
5311 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 5337 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
5312 int err = 0;
5313 int offset = off & (sb->s_blocksize - 1); 5338 int offset = off & (sb->s_blocksize - 1);
5314 int tocopy; 5339 int tocopy;
5315 size_t toread; 5340 size_t toread;
@@ -5324,9 +5349,9 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
5324 while (toread > 0) { 5349 while (toread > 0) {
5325 tocopy = sb->s_blocksize - offset < toread ? 5350 tocopy = sb->s_blocksize - offset < toread ?
5326 sb->s_blocksize - offset : toread; 5351 sb->s_blocksize - offset : toread;
5327 bh = ext4_bread(NULL, inode, blk, 0, &err); 5352 bh = ext4_bread(NULL, inode, blk, 0);
5328 if (err) 5353 if (IS_ERR(bh))
5329 return err; 5354 return PTR_ERR(bh);
5330 if (!bh) /* A hole? */ 5355 if (!bh) /* A hole? */
5331 memset(data, 0, tocopy); 5356 memset(data, 0, tocopy);
5332 else 5357 else
@@ -5347,8 +5372,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
5347{ 5372{
5348 struct inode *inode = sb_dqopt(sb)->files[type]; 5373 struct inode *inode = sb_dqopt(sb)->files[type];
5349 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); 5374 ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
5350 int err = 0; 5375 int err, offset = off & (sb->s_blocksize - 1);
5351 int offset = off & (sb->s_blocksize - 1);
5352 struct buffer_head *bh; 5376 struct buffer_head *bh;
5353 handle_t *handle = journal_current_handle(); 5377 handle_t *handle = journal_current_handle();
5354 5378
@@ -5369,14 +5393,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
5369 return -EIO; 5393 return -EIO;
5370 } 5394 }
5371 5395
5372 bh = ext4_bread(handle, inode, blk, 1, &err); 5396 bh = ext4_bread(handle, inode, blk, 1);
5397 if (IS_ERR(bh))
5398 return PTR_ERR(bh);
5373 if (!bh) 5399 if (!bh)
5374 goto out; 5400 goto out;
5375 BUFFER_TRACE(bh, "get write access"); 5401 BUFFER_TRACE(bh, "get write access");
5376 err = ext4_journal_get_write_access(handle, bh); 5402 err = ext4_journal_get_write_access(handle, bh);
5377 if (err) { 5403 if (err) {
5378 brelse(bh); 5404 brelse(bh);
5379 goto out; 5405 return err;
5380 } 5406 }
5381 lock_buffer(bh); 5407 lock_buffer(bh);
5382 memcpy(bh->b_data+offset, data, len); 5408 memcpy(bh->b_data+offset, data, len);
@@ -5385,8 +5411,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
5385 err = ext4_handle_dirty_metadata(handle, NULL, bh); 5411 err = ext4_handle_dirty_metadata(handle, NULL, bh);
5386 brelse(bh); 5412 brelse(bh);
5387out: 5413out:
5388 if (err)
5389 return err;
5390 if (inode->i_size < off + len) { 5414 if (inode->i_size < off + len) {
5391 i_size_write(inode, off + len); 5415 i_size_write(inode, off + len);
5392 EXT4_I(inode)->i_disksize = inode->i_size; 5416 EXT4_I(inode)->i_disksize = inode->i_size;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index e7387337060c..1e09fc77395c 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -142,8 +142,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
142 sector_t block_nr, 142 sector_t block_nr,
143 struct ext4_xattr_header *hdr) 143 struct ext4_xattr_header *hdr)
144{ 144{
145 if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 145 if (ext4_has_metadata_csum(inode->i_sb) &&
146 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
147 (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr))) 146 (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr)))
148 return 0; 147 return 0;
149 return 1; 148 return 1;
@@ -153,8 +152,7 @@ static void ext4_xattr_block_csum_set(struct inode *inode,
153 sector_t block_nr, 152 sector_t block_nr,
154 struct ext4_xattr_header *hdr) 153 struct ext4_xattr_header *hdr)
155{ 154{
156 if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, 155 if (!ext4_has_metadata_csum(inode->i_sb))
157 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
158 return; 156 return;
159 157
160 hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr); 158 hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr);
@@ -190,14 +188,28 @@ ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
190} 188}
191 189
192static int 190static int
193ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) 191ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
192 void *value_start)
194{ 193{
195 while (!IS_LAST_ENTRY(entry)) { 194 struct ext4_xattr_entry *e = entry;
196 struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry); 195
196 while (!IS_LAST_ENTRY(e)) {
197 struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
197 if ((void *)next >= end) 198 if ((void *)next >= end)
198 return -EIO; 199 return -EIO;
199 entry = next; 200 e = next;
200 } 201 }
202
203 while (!IS_LAST_ENTRY(entry)) {
204 if (entry->e_value_size != 0 &&
205 (value_start + le16_to_cpu(entry->e_value_offs) <
206 (void *)e + sizeof(__u32) ||
207 value_start + le16_to_cpu(entry->e_value_offs) +
208 le32_to_cpu(entry->e_value_size) > end))
209 return -EIO;
210 entry = EXT4_XATTR_NEXT(entry);
211 }
212
201 return 0; 213 return 0;
202} 214}
203 215
@@ -214,7 +226,8 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
214 return -EIO; 226 return -EIO;
215 if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) 227 if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
216 return -EIO; 228 return -EIO;
217 error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); 229 error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size,
230 bh->b_data);
218 if (!error) 231 if (!error)
219 set_buffer_verified(bh); 232 set_buffer_verified(bh);
220 return error; 233 return error;
@@ -331,7 +344,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
331 header = IHDR(inode, raw_inode); 344 header = IHDR(inode, raw_inode);
332 entry = IFIRST(header); 345 entry = IFIRST(header);
333 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 346 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
334 error = ext4_xattr_check_names(entry, end); 347 error = ext4_xattr_check_names(entry, end, entry);
335 if (error) 348 if (error)
336 goto cleanup; 349 goto cleanup;
337 error = ext4_xattr_find_entry(&entry, name_index, name, 350 error = ext4_xattr_find_entry(&entry, name_index, name,
@@ -463,7 +476,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
463 raw_inode = ext4_raw_inode(&iloc); 476 raw_inode = ext4_raw_inode(&iloc);
464 header = IHDR(inode, raw_inode); 477 header = IHDR(inode, raw_inode);
465 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 478 end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
466 error = ext4_xattr_check_names(IFIRST(header), end); 479 error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header));
467 if (error) 480 if (error)
468 goto cleanup; 481 goto cleanup;
469 error = ext4_xattr_list_entries(dentry, IFIRST(header), 482 error = ext4_xattr_list_entries(dentry, IFIRST(header),
@@ -899,14 +912,8 @@ inserted:
899 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 912 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
900 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 913 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
901 914
902 /*
903 * take i_data_sem because we will test
904 * i_delalloc_reserved_flag in ext4_mb_new_blocks
905 */
906 down_read(&EXT4_I(inode)->i_data_sem);
907 block = ext4_new_meta_blocks(handle, inode, goal, 0, 915 block = ext4_new_meta_blocks(handle, inode, goal, 0,
908 NULL, &error); 916 NULL, &error);
909 up_read((&EXT4_I(inode)->i_data_sem));
910 if (error) 917 if (error)
911 goto cleanup; 918 goto cleanup;
912 919
@@ -986,7 +993,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
986 is->s.here = is->s.first; 993 is->s.here = is->s.first;
987 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 994 is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
988 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { 995 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
989 error = ext4_xattr_check_names(IFIRST(header), is->s.end); 996 error = ext4_xattr_check_names(IFIRST(header), is->s.end,
997 IFIRST(header));
990 if (error) 998 if (error)
991 return error; 999 return error;
992 /* Find the named attribute. */ 1000 /* Find the named attribute. */
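
The xattr.c hunks above extend ext4_xattr_check_names() to validate each entry's value range, not just the entry chain. A sketch of that check with a simplified stand-in layout (entry table at the start of the block, values placed later in the same block): every (offset, size) pair must stay inside those bounds or the block is rejected as corrupt, -EIO in the kernel.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct xentry { uint16_t value_offs; uint32_t value_size; };

static int check_entries(const struct xentry *e, size_t nentries,
			 size_t blocksize)
{
	size_t tab_end = nentries * sizeof(*e);	/* end of the entry table */

	for (size_t i = 0; i < nentries; i++) {
		if (e[i].value_size == 0)
			continue;
		if (e[i].value_offs < tab_end ||	    /* overlaps the table */
		    (size_t)e[i].value_offs + e[i].value_size > blocksize)
			return -1;			    /* runs past the block */
	}
	return 0;
}

int main(void)
{
	struct xentry good = { .value_offs = 1024, .value_size = 16 };
	struct xentry evil = { .value_offs = 4090, .value_size = 64 };

	printf("in bounds:  %d\n", check_entries(&good, 1, 4096));
	printf("past block: %d\n", check_entries(&evil, 1, 4096));
	return 0;
}
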
diff --git a/fs/internal.h b/fs/internal.h
index 9477f8f6aefc..757ba2abf21e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -47,7 +47,6 @@ extern void __init chrdev_init(void);
47/* 47/*
48 * namei.c 48 * namei.c
49 */ 49 */
50extern int __inode_permission(struct inode *, int);
51extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); 50extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
52extern int vfs_path_lookup(struct dentry *, struct vfsmount *, 51extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
53 const char *, unsigned int, struct path *); 52 const char *, unsigned int, struct path *);
@@ -139,12 +138,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
139extern int rw_verify_area(int, struct file *, const loff_t *, size_t); 138extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
140 139
141/* 140/*
142 * splice.c
143 */
144extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
145 loff_t *opos, size_t len, unsigned int flags);
146
147/*
148 * pipe.c 141 * pipe.c
149 */ 142 */
150extern const struct file_operations pipefifo_fops; 143extern const struct file_operations pipefifo_fops;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 881b3bd0143f..fe839b915116 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -29,13 +29,9 @@
 #define BEQUIET
 
 static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
-static int isofs_hash(const struct dentry *parent, struct qstr *qstr);
 static int isofs_dentry_cmpi(const struct dentry *parent,
 		const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
-static int isofs_dentry_cmp(const struct dentry *parent,
-		const struct dentry *dentry,
-		unsigned int len, const char *str, const struct qstr *name);
 
 #ifdef CONFIG_JOLIET
 static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr);
@@ -135,10 +131,6 @@ static const struct super_operations isofs_sops = {
 
 static const struct dentry_operations isofs_dentry_ops[] = {
 	{
-		.d_hash		= isofs_hash,
-		.d_compare	= isofs_dentry_cmp,
-	},
-	{
 		.d_hash		= isofs_hashi,
 		.d_compare	= isofs_dentry_cmpi,
 	},
@@ -258,25 +250,12 @@ static int isofs_dentry_cmp_common(
 }
 
 static int
-isofs_hash(const struct dentry *dentry, struct qstr *qstr)
-{
-	return isofs_hash_common(qstr, 0);
-}
-
-static int
 isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
 {
 	return isofs_hashi_common(qstr, 0);
 }
 
 static int
-isofs_dentry_cmp(const struct dentry *parent, const struct dentry *dentry,
-		unsigned int len, const char *str, const struct qstr *name)
-{
-	return isofs_dentry_cmp_common(len, str, name, 0, 0);
-}
-
-static int
 isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
@@ -930,7 +909,8 @@ root_found:
 	if (opt.check == 'r')
 		table++;
 
-	s->s_d_op = &isofs_dentry_ops[table];
+	if (table)
+		s->s_d_op = &isofs_dentry_ops[table - 1];
 
 	/* get the root dentry */
 	s->s_root = d_make_root(inode);
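The removed callbacks implemented plain exact matching, which is what the VFS already does when a dentry has no d_op, so only the case-insensitive entry needs to survive; `table - 1` re-indexes into the shrunken array and `table == 0` leaves s_d_op NULL. A hedged sketch of the default comparison the VFS then falls back to:

#include <string.h>

struct qstr { unsigned int len; const char *name; };

/* Sketch of the VFS default: exact length plus byte compare, i.e.
 * what the removed isofs_dentry_cmp() reduced to.  Returns 0 on a
 * match, matching d_compare() conventions. */
static int default_compare(unsigned int len, const char *str,
			   const struct qstr *name)
{
	return len != name->len || memcmp(str, name->name, len) != 0;
}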
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 95295640d9c8..7b543e6b6526 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -18,25 +18,10 @@ static int
 isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
 {
 	struct qstr qstr;
-
-	if (!compare)
-		return 1;
-
-	/* check special "." and ".." files */
-	if (dlen == 1) {
-		/* "." */
-		if (compare[0] == 0) {
-			if (!dentry->d_name.len)
-				return 0;
-			compare = ".";
-		} else if (compare[0] == 1) {
-			compare = "..";
-			dlen = 2;
-		}
-	}
-
 	qstr.name = compare;
 	qstr.len = dlen;
+	if (likely(!dentry->d_op))
+		return dentry->d_name.len != dlen || memcmp(dentry->d_name.name, compare, dlen);
 	return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
 }
 
@@ -146,7 +131,8 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
 			(!(de->flags[-sbi->s_high_sierra] & 1))) &&
 			(sbi->s_showassoc ||
 			 (!(de->flags[-sbi->s_high_sierra] & 4)))) {
-			match = (isofs_cmp(dentry, dpnt, dlen) == 0);
+			if (dpnt && (dlen > 1 || dpnt[0] > 1))
+				match = (isofs_cmp(dentry, dpnt, dlen) == 0);
 		}
 		if (match) {
 			isofs_normalize_block_and_offset(de,
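The special-casing moves from isofs_cmp() into its caller: ISO 9660 encodes "." and ".." as one-byte names, and lookup never needs to match those records, so the new dpnt/dlen test simply skips them. A short sketch of the encoding being filtered:

/* ISO 9660 encodes "." as the single byte 0x00 and ".." as 0x01 in a
 * directory record; the caller's new test excludes exactly these, so
 * isofs_cmp() no longer needs to rewrite them into "." and "..". */
static int is_special_dir_record(const char *name, int len)
{
	return len == 1 && (name[0] == 0 || name[0] == 1);
}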
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 06fe11e0abfa..aab8549591e7 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -886,7 +886,7 @@ journal_t * journal_init_inode (struct inode *inode)
 		goto out_err;
 	}
 
-	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
 	if (!bh) {
 		printk(KERN_ERR
 		       "%s: Cannot get buffer for journal superblock\n",
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 8898bbd2b61e..dcead636c33b 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -93,6 +93,7 @@
 #include <linux/bio.h>
 #endif
 #include <linux/log2.h>
+#include <linux/hash.h>
 
 static struct kmem_cache *revoke_record_cache;
 static struct kmem_cache *revoke_table_cache;
@@ -129,15 +130,11 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
 
 /* Utility functions to maintain the revoke table */
 
-/* Borrowed from buffer.c: this is a tried and tested block hash function */
 static inline int hash(journal_t *journal, unsigned int block)
 {
 	struct jbd_revoke_table_s *table = journal->j_revoke;
-	int hash_shift = table->hash_shift;
 
-	return ((block << (hash_shift - 6)) ^
-		(block >> 13) ^
-		(block << (hash_shift - 12))) & (table->hash_size - 1);
+	return hash_32(block, table->hash_shift);
 }
 
 static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
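hash_32() from <linux/hash.h> multiplies by a golden-ratio-derived constant and keeps the top `bits` bits, which is what the open-coded shift/xor mix approximated. A runnable user-space demo of the same idea (the constant is quoted from memory of that era's kernel; treat it as an assumption):

#include <stdint.h>
#include <stdio.h>

/* Sketch of hash_32() circa 3.18: multiply by a golden-ratio prime
 * and take the high "bits" bits as the bucket index. */
static inline uint32_t hash_32_demo(uint32_t val, unsigned int bits)
{
	return (val * 0x9e370001u) >> (32 - bits);
}

int main(void)
{
	/* bucket for block 12345 in a 256-entry revoke table (shift = 8) */
	printf("bucket %u\n", hash_32_demo(12345, 8));
	return 0;
}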
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 7f34f4716165..988b32ed4c87 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -96,15 +96,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
 
 	if (jh->b_transaction == NULL && !buffer_locked(bh) &&
 	    !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
-		/*
-		 * Get our reference so that bh cannot be freed before
-		 * we unlock it
-		 */
-		get_bh(bh);
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
 		ret = __jbd2_journal_remove_checkpoint(jh) + 1;
-		BUFFER_TRACE(bh, "release");
-		__brelse(bh);
 	}
 	return ret;
 }
@@ -122,8 +115,6 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 
 	nblocks = jbd2_space_needed(journal);
 	while (jbd2_log_space_left(journal) < nblocks) {
-		if (journal->j_flags & JBD2_ABORT)
-			return;
 		write_unlock(&journal->j_state_lock);
 		mutex_lock(&journal->j_checkpoint_mutex);
 
@@ -139,6 +130,10 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 		 * trace for forensic evidence.
 		 */
 		write_lock(&journal->j_state_lock);
+		if (journal->j_flags & JBD2_ABORT) {
+			mutex_unlock(&journal->j_checkpoint_mutex);
+			return;
+		}
 		spin_lock(&journal->j_list_lock);
 		nblocks = jbd2_space_needed(journal);
 		space_left = jbd2_log_space_left(journal);
@@ -183,58 +178,6 @@ void __jbd2_log_wait_for_space(journal_t *journal)
 	}
 }
 
-/*
- * Clean up transaction's list of buffers submitted for io.
- * We wait for any pending IO to complete and remove any clean
- * buffers. Note that we take the buffers in the opposite ordering
- * from the one in which they were submitted for IO.
- *
- * Return 0 on success, and return <0 if some buffers have failed
- * to be written out.
- *
- * Called with j_list_lock held.
- */
-static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
-{
-	struct journal_head *jh;
-	struct buffer_head *bh;
-	tid_t this_tid;
-	int released = 0;
-	int ret = 0;
-
-	this_tid = transaction->t_tid;
-restart:
-	/* Did somebody clean up the transaction in the meanwhile? */
-	if (journal->j_checkpoint_transactions != transaction ||
-	    transaction->t_tid != this_tid)
-		return ret;
-	while (!released && transaction->t_checkpoint_io_list) {
-		jh = transaction->t_checkpoint_io_list;
-		bh = jh2bh(jh);
-		get_bh(bh);
-		if (buffer_locked(bh)) {
-			spin_unlock(&journal->j_list_lock);
-			wait_on_buffer(bh);
-			/* the journal_head may have gone by now */
-			BUFFER_TRACE(bh, "brelse");
-			__brelse(bh);
-			spin_lock(&journal->j_list_lock);
-			goto restart;
-		}
-		if (unlikely(buffer_write_io_error(bh)))
-			ret = -EIO;
-
-		/*
-		 * Now in whatever state the buffer currently is, we know that
-		 * it has been written out and so we can drop it from the list
-		 */
-		released = __jbd2_journal_remove_checkpoint(jh);
-		__brelse(bh);
-	}
-
-	return ret;
-}
-
 static void
 __flush_batch(journal_t *journal, int *batch_count)
 {
@@ -255,81 +198,6 @@ __flush_batch(journal_t *journal, int *batch_count)
 }
 
 /*
- * Try to flush one buffer from the checkpoint list to disk.
- *
- * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list. Return <0 if the buffer has failed to
- * be written out.
- *
- * Called with j_list_lock held and drops it if 1 is returned
- */
-static int __process_buffer(journal_t *journal, struct journal_head *jh,
-			    int *batch_count, transaction_t *transaction)
-{
-	struct buffer_head *bh = jh2bh(jh);
-	int ret = 0;
-
-	if (buffer_locked(bh)) {
-		get_bh(bh);
-		spin_unlock(&journal->j_list_lock);
-		wait_on_buffer(bh);
-		/* the journal_head may have gone by now */
-		BUFFER_TRACE(bh, "brelse");
-		__brelse(bh);
-		ret = 1;
-	} else if (jh->b_transaction != NULL) {
-		transaction_t *t = jh->b_transaction;
-		tid_t tid = t->t_tid;
-
-		transaction->t_chp_stats.cs_forced_to_close++;
-		spin_unlock(&journal->j_list_lock);
-		if (unlikely(journal->j_flags & JBD2_UNMOUNT))
-			/*
-			 * The journal thread is dead; so starting and
-			 * waiting for a commit to finish will cause
-			 * us to wait for a _very_ long time.
-			 */
-			printk(KERN_ERR "JBD2: %s: "
-			       "Waiting for Godot: block %llu\n",
-			       journal->j_devname,
-			       (unsigned long long) bh->b_blocknr);
-		jbd2_log_start_commit(journal, tid);
-		jbd2_log_wait_commit(journal, tid);
-		ret = 1;
-	} else if (!buffer_dirty(bh)) {
-		ret = 1;
-		if (unlikely(buffer_write_io_error(bh)))
-			ret = -EIO;
-		get_bh(bh);
-		BUFFER_TRACE(bh, "remove from checkpoint");
-		__jbd2_journal_remove_checkpoint(jh);
-		spin_unlock(&journal->j_list_lock);
-		__brelse(bh);
-	} else {
-		/*
-		 * Important: we are about to write the buffer, and
-		 * possibly block, while still holding the journal lock.
-		 * We cannot afford to let the transaction logic start
-		 * messing around with this buffer before we write it to
-		 * disk, as that would break recoverability.
-		 */
-		BUFFER_TRACE(bh, "queue");
-		get_bh(bh);
-		J_ASSERT_BH(bh, !buffer_jwrite(bh));
-		journal->j_chkpt_bhs[*batch_count] = bh;
-		__buffer_relink_io(jh);
-		transaction->t_chp_stats.cs_written++;
-		(*batch_count)++;
-		if (*batch_count == JBD2_NR_BATCH) {
-			spin_unlock(&journal->j_list_lock);
-			__flush_batch(journal, batch_count);
-			ret = 1;
-		}
-	}
-	return ret;
-}
-
-/*
  * Perform an actual checkpoint. We take the first transaction on the
  * list of transactions to be checkpointed and send all its buffers
  * to disk. We submit larger chunks of data at once.
@@ -339,9 +207,11 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
  */
 int jbd2_log_do_checkpoint(journal_t *journal)
 {
-	transaction_t *transaction;
-	tid_t this_tid;
-	int result;
+	struct journal_head	*jh;
+	struct buffer_head	*bh;
+	transaction_t		*transaction;
+	tid_t			this_tid;
+	int			result, batch_count = 0;
 
 	jbd_debug(1, "Start checkpoint\n");
 
@@ -374,45 +244,117 @@ restart:
 	 * done (maybe it's a new transaction, but it fell at the same
 	 * address).
 	 */
-	if (journal->j_checkpoint_transactions == transaction &&
-	    transaction->t_tid == this_tid) {
-		int batch_count = 0;
-		struct journal_head *jh;
-		int retry = 0, err;
-
-		while (!retry && transaction->t_checkpoint_list) {
-			jh = transaction->t_checkpoint_list;
-			retry = __process_buffer(journal, jh, &batch_count,
-						 transaction);
-			if (retry < 0 && !result)
-				result = retry;
-			if (!retry && (need_resched() ||
-				spin_needbreak(&journal->j_list_lock))) {
-				spin_unlock(&journal->j_list_lock);
-				retry = 1;
-				break;
-			}
-		}
-
-		if (batch_count) {
-			if (!retry) {
-				spin_unlock(&journal->j_list_lock);
-				retry = 1;
-			}
-			__flush_batch(journal, &batch_count);
-		}
-
-		if (retry) {
-			spin_lock(&journal->j_list_lock);
-			goto restart;
-		}
-		/*
-		 * Now we have cleaned up the first transaction's checkpoint
-		 * list. Let's clean up the second one
-		 */
-		err = __wait_cp_io(journal, transaction);
-		if (!result)
-			result = err;
-	}
+	if (journal->j_checkpoint_transactions != transaction ||
+	    transaction->t_tid != this_tid)
+		goto out;
+
+	/* checkpoint all of the transaction's buffers */
+	while (transaction->t_checkpoint_list) {
+		jh = transaction->t_checkpoint_list;
+		bh = jh2bh(jh);
+
+		if (buffer_locked(bh)) {
+			spin_unlock(&journal->j_list_lock);
+			get_bh(bh);
+			wait_on_buffer(bh);
+			/* the journal_head may have gone by now */
+			BUFFER_TRACE(bh, "brelse");
+			__brelse(bh);
+			goto retry;
+		}
+		if (jh->b_transaction != NULL) {
+			transaction_t *t = jh->b_transaction;
+			tid_t tid = t->t_tid;
+
+			transaction->t_chp_stats.cs_forced_to_close++;
+			spin_unlock(&journal->j_list_lock);
+			if (unlikely(journal->j_flags & JBD2_UNMOUNT))
+				/*
+				 * The journal thread is dead; so
+				 * starting and waiting for a commit
+				 * to finish will cause us to wait for
+				 * a _very_ long time.
+				 */
+				printk(KERN_ERR
+				       "JBD2: %s: Waiting for Godot: block %llu\n",
+				       journal->j_devname, (unsigned long long) bh->b_blocknr);
+
+			jbd2_log_start_commit(journal, tid);
+			jbd2_log_wait_commit(journal, tid);
+			goto retry;
+		}
+		if (!buffer_dirty(bh)) {
+			if (unlikely(buffer_write_io_error(bh)) && !result)
+				result = -EIO;
+			BUFFER_TRACE(bh, "remove from checkpoint");
+			if (__jbd2_journal_remove_checkpoint(jh))
+				/* The transaction was released; we're done */
+				goto out;
+			continue;
+		}
+		/*
+		 * Important: we are about to write the buffer, and
+		 * possibly block, while still holding the journal
+		 * lock. We cannot afford to let the transaction
+		 * logic start messing around with this buffer before
+		 * we write it to disk, as that would break
+		 * recoverability.
+		 */
+		BUFFER_TRACE(bh, "queue");
+		get_bh(bh);
+		J_ASSERT_BH(bh, !buffer_jwrite(bh));
+		journal->j_chkpt_bhs[batch_count++] = bh;
+		__buffer_relink_io(jh);
+		transaction->t_chp_stats.cs_written++;
+		if ((batch_count == JBD2_NR_BATCH) ||
+		    need_resched() ||
+		    spin_needbreak(&journal->j_list_lock))
+			goto unlock_and_flush;
+	}
+
+	if (batch_count) {
+		unlock_and_flush:
+			spin_unlock(&journal->j_list_lock);
+		retry:
+			if (batch_count)
+				__flush_batch(journal, &batch_count);
+			spin_lock(&journal->j_list_lock);
+			goto restart;
+	}
+
+	/*
+	 * Now we issued all of the transaction's buffers, let's deal
+	 * with the buffers that are out for I/O.
+	 */
+restart2:
+	/* Did somebody clean up the transaction in the meanwhile? */
+	if (journal->j_checkpoint_transactions != transaction ||
+	    transaction->t_tid != this_tid)
+		goto out;
+
+	while (transaction->t_checkpoint_io_list) {
+		jh = transaction->t_checkpoint_io_list;
+		bh = jh2bh(jh);
+		if (buffer_locked(bh)) {
+			spin_unlock(&journal->j_list_lock);
+			get_bh(bh);
+			wait_on_buffer(bh);
+			/* the journal_head may have gone by now */
+			BUFFER_TRACE(bh, "brelse");
+			__brelse(bh);
+			spin_lock(&journal->j_list_lock);
+			goto restart2;
+		}
+		if (unlikely(buffer_write_io_error(bh)) && !result)
+			result = -EIO;
+
+		/*
+		 * Now in whatever state the buffer currently is, we
+		 * know that it has been written out and so we can
+		 * drop it from the list
+		 */
+		if (__jbd2_journal_remove_checkpoint(jh))
+			break;
+	}
 out:
 	spin_unlock(&journal->j_list_lock);
@@ -478,18 +420,16 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
  * Find all the written-back checkpoint buffers in the given list and
  * release them.
  *
- * Called with the journal locked.
  * Called with j_list_lock held.
- * Returns number of buffers reaped (for debug)
+ * Returns 1 if we freed the transaction, 0 otherwise.
  */
-
-static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
+static int journal_clean_one_cp_list(struct journal_head *jh)
 {
 	struct journal_head *last_jh;
 	struct journal_head *next_jh = jh;
-	int ret, freed = 0;
+	int ret;
+	int freed = 0;
 
-	*released = 0;
 	if (!jh)
 		return 0;
 
@@ -498,13 +438,11 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
 		jh = next_jh;
 		next_jh = jh->b_cpnext;
 		ret = __try_to_free_cp_buf(jh);
-		if (ret) {
-			freed++;
-			if (ret == 2) {
-				*released = 1;
-				return freed;
-			}
-		}
+		if (!ret)
+			return freed;
+		if (ret == 2)
+			return 1;
+		freed = 1;
 		/*
 		 * This function only frees up some memory
 		 * if possible so we dont have an obligation
@@ -523,49 +461,49 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released)
  *
  * Find all the written-back checkpoint buffers in the journal and release them.
  *
- * Called with the journal locked.
  * Called with j_list_lock held.
- * Returns number of buffers reaped (for debug)
  */
-
-int __jbd2_journal_clean_checkpoint_list(journal_t *journal)
+void __jbd2_journal_clean_checkpoint_list(journal_t *journal)
 {
 	transaction_t *transaction, *last_transaction, *next_transaction;
-	int ret = 0;
-	int released;
+	int ret;
 
 	transaction = journal->j_checkpoint_transactions;
 	if (!transaction)
-		goto out;
+		return;
 
 	last_transaction = transaction->t_cpprev;
 	next_transaction = transaction;
 	do {
 		transaction = next_transaction;
 		next_transaction = transaction->t_cpnext;
-		ret += journal_clean_one_cp_list(transaction->
-				t_checkpoint_list, &released);
+		ret = journal_clean_one_cp_list(transaction->t_checkpoint_list);
 		/*
 		 * This function only frees up some memory if possible so we
 		 * dont have an obligation to finish processing. Bail out if
 		 * preemption requested:
 		 */
		if (need_resched())
-			goto out;
-		if (released)
+			return;
+		if (ret)
 			continue;
 		/*
 		 * It is essential that we are as careful as in the case of
 		 * t_checkpoint_list with removing the buffer from the list as
 		 * we can possibly see not yet submitted buffers on io_list
 		 */
-		ret += journal_clean_one_cp_list(transaction->
-				t_checkpoint_io_list, &released);
+		ret = journal_clean_one_cp_list(transaction->
+				t_checkpoint_io_list);
 		if (need_resched())
-			goto out;
+			return;
+		/*
+		 * Stop scanning if we couldn't free the transaction. This
+		 * avoids pointless scanning of transactions which still
+		 * weren't checkpointed.
+		 */
+		if (!ret)
+			return;
 	} while (transaction != last_transaction);
-out:
-	return ret;
 }
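Across these hunks __wait_cp_io() and __process_buffer() are folded into jbd2_log_do_checkpoint(), and journal_clean_one_cp_list() switches from counting reaped buffers to reporting whether the whole transaction was freed. A hedged sketch of why that return-value change lets the caller stop early (the loop mirrors the new __jbd2_journal_clean_checkpoint_list(); locking omitted):

/* Sketch: scan checkpoint transactions oldest-first and stop at the
 * first one that cannot be freed -- transactions behind it have not
 * been checkpointed either, so scanning on would be pointless. */
static void clean_checkpoint_list_sketch(transaction_t *first,
					 transaction_t *last)
{
	transaction_t *t = first;

	do {
		transaction_t *next = t->t_cpnext;

		/* returns 1 only if the whole transaction went away */
		if (!journal_clean_one_cp_list(t->t_checkpoint_list))
			return;
		t = next;
	} while (t != last);
}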
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 19d74d86d99c..e4dc74713a43 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1237,7 +1237,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 		goto out_err;
 	}
 
-	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize);
 	if (!bh) {
 		printk(KERN_ERR
 		       "%s: Cannot get buffer for journal superblock\n",
@@ -1522,14 +1522,6 @@ static int journal_get_superblock(journal_t *journal)
 		goto out;
 	}
 
-	if (jbd2_journal_has_csum_v2or3(journal) &&
-	    JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
-		/* Can't have checksum v1 and v2 on at the same time! */
-		printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 "
-		       "at the same time!\n");
-		goto out;
-	}
-
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
 	    JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
 		/* Can't have checksum v2 and v3 at the same time! */
@@ -1538,6 +1530,14 @@ static int journal_get_superblock(journal_t *journal)
 		goto out;
 	}
 
+	if (jbd2_journal_has_csum_v2or3(journal) &&
+	    JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
+		/* Can't have checksum v1 and v2 on at the same time! */
+		printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
+		       "at the same time!\n");
+		goto out;
+	}
+
 	if (!jbd2_verify_csum_type(journal, sb)) {
 		printk(KERN_ERR "JBD2: Unknown checksum type\n");
 		goto out;
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 9b329b55ffe3..bcbef08a4d8f 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -525,6 +525,7 @@ static int do_one_pass(journal_t *journal,
 			    !jbd2_descr_block_csum_verify(journal,
 							  bh->b_data)) {
 				err = -EIO;
+				brelse(bh);
 				goto failed;
 			}
 
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index d5e95a175c92..c6cbaef2bda1 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -92,6 +92,7 @@
 #include <linux/init.h>
 #include <linux/bio.h>
 #include <linux/log2.h>
+#include <linux/hash.h>
 #endif
 
 static struct kmem_cache *jbd2_revoke_record_cache;
@@ -130,16 +131,9 @@ static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
 
 /* Utility functions to maintain the revoke table */
 
-/* Borrowed from buffer.c: this is a tried and tested block hash function */
 static inline int hash(journal_t *journal, unsigned long long block)
 {
-	struct jbd2_revoke_table_s *table = journal->j_revoke;
-	int hash_shift = table->hash_shift;
-	int hash = (int)block ^ (int)((block >> 31) >> 1);
-
-	return ((hash << (hash_shift - 6)) ^
-		(hash >> 13) ^
-		(hash << (hash_shift - 12))) & (table->hash_size - 1);
+	return hash_64(block, journal->j_revoke->hash_shift);
 }
 
 static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
diff --git a/fs/namei.c b/fs/namei.c
index 43927d14db67..db5fe86319e6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -416,6 +416,7 @@ int __inode_permission(struct inode *inode, int mask)
 
 	return security_inode_permission(inode, mask);
 }
+EXPORT_SYMBOL(__inode_permission);
 
 /**
  * sb_permission - Check superblock-level permissions
@@ -2383,22 +2384,17 @@ kern_path_mountpoint(int dfd, const char *name, struct path *path,
 }
 EXPORT_SYMBOL(kern_path_mountpoint);
 
-/*
- * It's inline, so penalty for filesystems that don't use sticky bit is
- * minimal.
- */
-static inline int check_sticky(struct inode *dir, struct inode *inode)
+int __check_sticky(struct inode *dir, struct inode *inode)
 {
 	kuid_t fsuid = current_fsuid();
 
-	if (!(dir->i_mode & S_ISVTX))
-		return 0;
 	if (uid_eq(inode->i_uid, fsuid))
 		return 0;
 	if (uid_eq(dir->i_uid, fsuid))
 		return 0;
 	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
 }
+EXPORT_SYMBOL(__check_sticky);
 
 /*
  * Check whether we can remove a link victim from directory dir, check
@@ -2501,7 +2497,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 	}
 
 	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
-	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
+	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2);
 	return NULL;
 }
 EXPORT_SYMBOL(lock_rename);
@@ -3064,9 +3060,12 @@ finish_open_created:
 	error = may_open(&nd->path, acc_mode, open_flag);
 	if (error)
 		goto out;
-	file->f_path.mnt = nd->path.mnt;
-	error = finish_open(file, nd->path.dentry, NULL, opened);
-	if (error) {
+
+	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
+	error = vfs_open(&nd->path, file, current_cred());
+	if (!error) {
+		*opened |= FILE_OPENED;
+	} else {
 		if (error == -EOPENSTALE)
 			goto stale_open;
 		goto out;
@@ -3155,7 +3154,8 @@ static int do_tmpfile(int dfd, struct filename *pathname,
 	if (error)
 		goto out2;
 	audit_inode(pathname, nd->path.dentry, 0);
-	error = may_open(&nd->path, op->acc_mode, op->open_flag);
+	/* Don't check for other permissions, the inode was just created */
+	error = may_open(&nd->path, MAY_OPEN, op->open_flag);
 	if (error)
 		goto out2;
 	file->f_path.mnt = nd->path.mnt;
@@ -4210,12 +4210,16 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
 	bool should_retry = false;
 	int error;
 
-	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 		return -EINVAL;
 
-	if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE))
+	if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
+	    (flags & RENAME_EXCHANGE))
 		return -EINVAL;
 
+	if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
+		return -EPERM;
+
 retry:
 	from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
 	if (IS_ERR(from)) {
@@ -4347,6 +4351,20 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
 	return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
 }
 
+int vfs_whiteout(struct inode *dir, struct dentry *dentry)
+{
+	int error = may_create(dir, dentry);
+	if (error)
+		return error;
+
+	if (!dir->i_op->mknod)
+		return -EPERM;
+
+	return dir->i_op->mknod(dir, dentry,
+				S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
+}
+EXPORT_SYMBOL(vfs_whiteout);
+
 int readlink_copy(char __user *buffer, int buflen, const char *link)
 {
 	int len = PTR_ERR(link);
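RENAME_WHITEOUT atomically renames the source and leaves a whiteout (a 0:0 character device, per vfs_whiteout() above) in its place; overlayfs uses it to hide lower-layer entries. A user-space sketch, assuming a 3.18+ kernel and a filesystem that implements whiteouts (glibc of this era had no wrapper, hence the raw syscall):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef RENAME_WHITEOUT
#define RENAME_WHITEOUT (1 << 2)	/* matches the uapi flag value */
#endif

int main(void)
{
	/* Rename "upper/a" to "upper/b" and leave a whiteout at
	 * "upper/a".  Requires CAP_MKNOD and filesystem support. */
	if (syscall(SYS_renameat2, AT_FDCWD, "upper/a",
		    AT_FDCWD, "upper/b", RENAME_WHITEOUT) != 0)
		perror("renameat2(RENAME_WHITEOUT)");
	return 0;
}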
diff --git a/fs/namespace.c b/fs/namespace.c
index fbba8b17330d..5b66b2b3624d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1686,6 +1686,33 @@ void drop_collected_mounts(struct vfsmount *mnt)
 	namespace_unlock();
 }
 
+/**
+ * clone_private_mount - create a private clone of a path
+ *
+ * This creates a new vfsmount, which will be the clone of @path.  The new will
+ * not be attached anywhere in the namespace and will be private (i.e. changes
+ * to the originating mount won't be propagated into this).
+ *
+ * Release with mntput().
+ */
+struct vfsmount *clone_private_mount(struct path *path)
+{
+	struct mount *old_mnt = real_mount(path->mnt);
+	struct mount *new_mnt;
+
+	if (IS_MNT_UNBINDABLE(old_mnt))
+		return ERR_PTR(-EINVAL);
+
+	down_read(&namespace_sem);
+	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
+	up_read(&namespace_sem);
+	if (IS_ERR(new_mnt))
+		return ERR_CAST(new_mnt);
+
+	return &new_mnt->mnt;
+}
+EXPORT_SYMBOL_GPL(clone_private_mount);
+
 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
 		   struct vfsmount *root)
 {
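A hedged sketch of how a caller such as overlayfs might use the new helper; the local names are illustrative and error handling is minimal:

/* Clone a private mount of an underlying layer so later changes to
 * the original mount cannot propagate into internal lookups. */
struct vfsmount *mnt;

mnt = clone_private_mount(&upperpath);
if (IS_ERR(mnt))
	return PTR_ERR(mnt);	/* e.g. -EINVAL for unbindable mounts */
/* ... use mnt for internal lookups and I/O ... */
mntput(mnt);			/* release when done */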
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 5228f201d3d5..4f46f7a05289 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -378,7 +378,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	loff_t offset = header->args.offset;
 	size_t count = header->args.count;
 	struct page **pages = header->args.pages;
-	int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+	int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
 	unsigned int pg_len;
 	struct blk_plug plug;
 	int i;
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index e966c023b1b7..acbf9ca4018c 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -65,17 +65,18 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
 
 	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
 
+	mutex_lock(&nn->bl_mutex);
 	bl_pipe_msg.bl_wq = &nn->bl_wq;
 
 	b->simple.len += 4;	/* single volume */
 	if (b->simple.len > PAGE_SIZE)
-		return -EIO;
+		goto out_unlock;
 
 	memset(msg, 0, sizeof(*msg));
 	msg->len = sizeof(*bl_msg) + b->simple.len;
 	msg->data = kzalloc(msg->len, gfp_mask);
 	if (!msg->data)
-		goto out;
+		goto out_free_data;
 
 	bl_msg = msg->data;
 	bl_msg->type = BL_DEVICE_MOUNT,
@@ -87,7 +88,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
 	rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
 	if (rc < 0) {
 		remove_wait_queue(&nn->bl_wq, &wq);
-		goto out;
+		goto out_free_data;
 	}
 
 	set_current_state(TASK_UNINTERRUPTIBLE);
@@ -97,12 +98,14 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
 	if (reply->status != BL_DEVICE_REQUEST_PROC) {
 		printk(KERN_WARNING "%s failed to decode device: %d\n",
 			__func__, reply->status);
-		goto out;
+		goto out_free_data;
 	}
 
 	dev = MKDEV(reply->major, reply->minor);
-out:
+out_free_data:
 	kfree(msg->data);
+out_unlock:
+	mutex_unlock(&nn->bl_mutex);
 	return dev;
 }
 
@@ -232,6 +235,7 @@ static int nfs4blocklayout_net_init(struct net *net)
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
 	struct dentry *dentry;
 
+	mutex_init(&nn->bl_mutex);
 	init_waitqueue_head(&nn->bl_wq);
 	nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
 	if (IS_ERR(nn->bl_device_pipe))
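The new per-net bl_mutex serializes device-resolution upcalls so concurrent mounts cannot interleave replies through the single rpc_pipefs pipe; the relabelled gotos unwind in reverse order of acquisition. A hedged sketch of the unwind shape (upcall details elided):

static int serialized_upcall_sketch(struct nfs_net *nn, size_t len,
				    gfp_t gfp_mask)
{
	void *data;
	int rc = -ENOMEM;

	mutex_lock(&nn->bl_mutex);	/* one upcall through the pipe at a time */
	data = kzalloc(len, gfp_mask);
	if (!data)
		goto out_unlock;	/* nothing allocated: skip the kfree */
	/* ... queue the rpc_pipefs message and wait for the daemon ... */
	rc = 0;
	kfree(data);			/* "out_free_data" in the real code */
out_unlock:
	mutex_unlock(&nn->bl_mutex);	/* taken first, released last */
	return rc;
}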
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5853f53db732..7f3f60641344 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -125,6 +125,8 @@ again:
 			continue;
 		if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
 			continue;
+		if (!nfs4_valid_open_stateid(state))
+			continue;
 		if (!nfs4_stateid_match(&state->stateid, stateid))
 			continue;
 		get_nfs_open_context(ctx);
@@ -193,7 +195,11 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
 {
 	int res = 0;
 
-	res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync);
+	if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+		res = nfs4_proc_delegreturn(inode,
+				delegation->cred,
+				&delegation->stateid,
+				issync);
 	nfs_free_delegation(delegation);
 	return res;
 }
@@ -380,11 +386,13 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
 {
 	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(inode);
-	int err;
+	int err = 0;
 
 	if (delegation == NULL)
 		return 0;
 	do {
+		if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+			break;
 		err = nfs_delegation_claim_opens(inode, &delegation->stateid);
 		if (!issync || err != -EAGAIN)
 			break;
@@ -605,10 +613,23 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *clp,
 	rcu_read_unlock();
 }
 
+static void nfs_revoke_delegation(struct inode *inode)
+{
+	struct nfs_delegation *delegation;
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation != NULL) {
+		set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+		nfs_mark_return_delegation(NFS_SERVER(inode), delegation);
+	}
+	rcu_read_unlock();
+}
+
 void nfs_remove_bad_delegation(struct inode *inode)
 {
 	struct nfs_delegation *delegation;
 
+	nfs_revoke_delegation(inode);
 	delegation = nfs_inode_detach_delegation(inode);
 	if (delegation) {
 		nfs_inode_find_state_and_recover(inode, &delegation->stateid);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 5c1cce39297f..e3c20a3ccc93 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -31,6 +31,7 @@ enum {
 	NFS_DELEGATION_RETURN_IF_CLOSED,
 	NFS_DELEGATION_REFERENCED,
 	NFS_DELEGATION_RETURNING,
+	NFS_DELEGATION_REVOKED,
 };
 
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 06e8cfcbb670..6e62155abf26 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1527,6 +1527,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	case -ENOENT:
 		d_drop(dentry);
 		d_add(dentry, NULL);
+		nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 		break;
 	case -EISDIR:
 	case -ENOTDIR:
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 20cffc830468..10bf07280f4a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -266,6 +266,7 @@ static void nfs_direct_req_free(struct kref *kref)
 {
 	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
 
+	nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
 	if (dreq->l_ctx != NULL)
 		nfs_put_lock_context(dreq->l_ctx);
 	if (dreq->ctx != NULL)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 46fab1cb455a..7afb52f6a25a 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -145,9 +145,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 	case -NFS4ERR_DELEG_REVOKED:
 	case -NFS4ERR_ADMIN_REVOKED:
 	case -NFS4ERR_BAD_STATEID:
-		if (state == NULL)
-			break;
-		nfs_remove_bad_delegation(state->inode);
 	case -NFS4ERR_OPENMODE:
 		if (state == NULL)
 			break;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6388a59f2add..00689a8a85e4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -626,7 +626,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
 	int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
-	int err;
+	int err = 0;
 
 	trace_nfs_getattr_enter(inode);
 	/* Flush out writes to the server in order to update c/mtime. */
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index ef221fb8a183..f0e06e4acbef 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -19,6 +19,7 @@ struct nfs_net {
 	struct rpc_pipe *bl_device_pipe;
 	struct bl_dev_msg bl_mount_reply;
 	wait_queue_head_t bl_wq;
+	struct mutex bl_mutex;
 	struct list_head nfs_client_list;
 	struct list_head nfs_volume_list;
 #if IS_ENABLED(CONFIG_NFS_V4)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 405bd95c1f58..69dc20a743f9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -370,11 +370,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
-			if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) {
-				nfs_remove_bad_delegation(inode);
-				exception->retry = 1;
-				break;
-			}
 			if (state == NULL)
 				break;
 			ret = nfs4_schedule_stateid_recovery(server, state);
@@ -1654,7 +1649,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
 			nfs_inode_find_state_and_recover(state->inode,
 					stateid);
 			nfs4_schedule_stateid_recovery(server, state);
-			return 0;
+			return -EAGAIN;
 		case -NFS4ERR_DELAY:
 		case -NFS4ERR_GRACE:
 			set_bit(NFS_DELEGATED_STATE, &state->flags);
@@ -2109,46 +2104,60 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
 	return ret;
 }
 
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
+{
+	nfs_remove_bad_delegation(state->inode);
+	write_seqlock(&state->seqlock);
+	nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+	write_sequnlock(&state->seqlock);
+	clear_bit(NFS_DELEGATED_STATE, &state->flags);
+}
+
+static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
+{
+	if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
+		nfs_finish_clear_delegation_stateid(state);
+}
+
+static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+	/* NFSv4.0 doesn't allow for delegation recovery on open expire */
+	nfs40_clear_delegation_stateid(state);
+	return nfs4_open_expired(sp, state);
+}
+
 #if defined(CONFIG_NFS_V4_1)
-static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
+static void nfs41_check_delegation_stateid(struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
-	nfs4_stateid *stateid = &state->stateid;
+	nfs4_stateid stateid;
 	struct nfs_delegation *delegation;
-	struct rpc_cred *cred = NULL;
-	int status = -NFS4ERR_BAD_STATEID;
-
-	/* If a state reset has been done, test_stateid is unneeded */
-	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
-		return;
+	struct rpc_cred *cred;
+	int status;
 
 	/* Get the delegation credential for use by test/free_stateid */
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(state->inode)->delegation);
-	if (delegation != NULL &&
-	    nfs4_stateid_match(&delegation->stateid, stateid)) {
-		cred = get_rpccred(delegation->cred);
-		rcu_read_unlock();
-		status = nfs41_test_stateid(server, stateid, cred);
-		trace_nfs4_test_delegation_stateid(state, NULL, status);
-	} else
+	if (delegation == NULL) {
 		rcu_read_unlock();
+		return;
+	}
+
+	nfs4_stateid_copy(&stateid, &delegation->stateid);
+	cred = get_rpccred(delegation->cred);
+	rcu_read_unlock();
+	status = nfs41_test_stateid(server, &stateid, cred);
+	trace_nfs4_test_delegation_stateid(state, NULL, status);
 
 	if (status != NFS_OK) {
 		/* Free the stateid unless the server explicitly
 		 * informs us the stateid is unrecognized. */
 		if (status != -NFS4ERR_BAD_STATEID)
-			nfs41_free_stateid(server, stateid, cred);
-		nfs_remove_bad_delegation(state->inode);
-
-		write_seqlock(&state->seqlock);
-		nfs4_stateid_copy(&state->stateid, &state->open_stateid);
-		write_sequnlock(&state->seqlock);
-		clear_bit(NFS_DELEGATED_STATE, &state->flags);
+			nfs41_free_stateid(server, &stateid, cred);
+		nfs_finish_clear_delegation_stateid(state);
 	}
 
-	if (cred != NULL)
-		put_rpccred(cred);
+	put_rpccred(cred);
 }
 
 /**
@@ -2192,7 +2201,7 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
 {
 	int status;
 
-	nfs41_clear_delegation_stateid(state);
+	nfs41_check_delegation_stateid(state);
 	status = nfs41_check_open_stateid(state);
 	if (status != NFS_OK)
 		status = nfs4_open_expired(sp, state);
@@ -2231,19 +2240,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 	seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
 
 	ret = _nfs4_proc_open(opendata);
-	if (ret != 0) {
-		if (ret == -ENOENT) {
-			dentry = opendata->dentry;
-			if (dentry->d_inode)
-				d_delete(dentry);
-			else if (d_unhashed(dentry))
-				d_add(dentry, NULL);
-
-			nfs_set_verifier(dentry,
-					 nfs_save_change_attribute(opendata->dir->d_inode));
-		}
+	if (ret != 0)
 		goto out;
-	}
 
 	state = nfs4_opendata_to_nfs4_state(opendata);
 	ret = PTR_ERR(state);
@@ -4841,9 +4839,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
-			if (state == NULL)
-				break;
-			nfs_remove_bad_delegation(state->inode);
 		case -NFS4ERR_OPENMODE:
 			if (state == NULL)
 				break;
@@ -8341,7 +8336,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
 static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
 	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
 	.state_flag_bit	= NFS_STATE_RECLAIM_NOGRACE,
-	.recover_open	= nfs4_open_expired,
+	.recover_open	= nfs40_open_expired,
 	.recover_lock	= nfs4_lock_expired,
 	.establish_clid = nfs4_init_clientid,
 };
@@ -8408,8 +8403,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 		| NFS_CAP_CHANGE_ATTR
 		| NFS_CAP_POSIX_LOCK
 		| NFS_CAP_STATEID_NFSV41
-		| NFS_CAP_ATOMIC_OPEN_V1
-		| NFS_CAP_SEEK,
+		| NFS_CAP_ATOMIC_OPEN_V1,
 	.init_client = nfs41_init_client,
 	.shutdown_client = nfs41_shutdown_client,
 	.match_stateid = nfs41_match_stateid,
@@ -8431,7 +8425,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 		| NFS_CAP_CHANGE_ATTR
 		| NFS_CAP_POSIX_LOCK
 		| NFS_CAP_STATEID_NFSV41
-		| NFS_CAP_ATOMIC_OPEN_V1,
+		| NFS_CAP_ATOMIC_OPEN_V1
+		| NFS_CAP_SEEK,
 	.init_client = nfs41_init_client,
 	.shutdown_client = nfs41_shutdown_client,
 	.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index c6e4bda63000..9e5bc42180e4 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -5,7 +5,7 @@
  * All rights reserved.
  *
  * Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index c89357c7a914..919efd4a1a23 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -5,7 +5,7 @@
  * All rights reserved.
 *
 * Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 3a0828d57339..2641dbad345c 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -6,7 +6,7 @@
  * All rights reserved.
  *
  * Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
index b3918f7ac34d..f093c7ec983b 100644
--- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -5,7 +5,7 @@
  * All rights reserved.
  *
  * Benny Halevy <bhalevy@panasas.com>
- * Boaz Harrosh <bharrosh@panasas.com>
+ * Boaz Harrosh <ooo@electrozaur.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 12493846a2d3..f83b02dc9166 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -715,8 +715,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 
 	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
 		nfs_release_request(req);
-	else
-		WARN_ON_ONCE(1);
 }
 
 static void
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 747f3b95bd11..33a46a8dfaf7 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -335,12 +335,15 @@ void nfsd_lockd_shutdown(void);
 	(NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
 
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
-#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
-	(NFSD4_1_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SECURITY_LABEL)
+#define NFSD4_2_SECURITY_ATTRS		FATTR4_WORD2_SECURITY_LABEL
 #else
-#define NFSD4_2_SUPPORTED_ATTRS_WORD2 0
+#define NFSD4_2_SECURITY_ATTRS		0
 #endif
 
+#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
+	(NFSD4_1_SUPPORTED_ATTRS_WORD2 | \
+	NFSD4_2_SECURITY_ATTRS)
+
 static inline u32 nfsd_suppattrs0(u32 minorversion)
 {
 	return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
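The old definition collapsed NFSD4_2_SUPPORTED_ATTRS_WORD2 to 0 when security labels were compiled out, silently dropping every NFSv4.1 attribute from the 4.2 mask; factoring the label bit into its own macro keeps the 4.1 attributes in both configurations. How the macro now expands:

/* With CONFIG_NFSD_V4_SECURITY_LABEL=y:
 *   NFSD4_2_SUPPORTED_ATTRS_WORD2
 *     == (NFSD4_1_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SECURITY_LABEL)
 *
 * Without it:
 *   NFSD4_2_SUPPORTED_ATTRS_WORD2
 *     == (NFSD4_1_SUPPORTED_ATTRS_WORD2 | 0)
 *
 * whereas the old #else branch expanded to plain 0. */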
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 9d3e9c50066a..89326acd4561 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -229,8 +229,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
229 &fsnotify_mark_srcu); 229 &fsnotify_mark_srcu);
230 } 230 }
231 231
232 /*
233 * We need to merge inode & vfsmount mark lists so that inode mark
234 * ignore masks are properly reflected for mount mark notifications.
235 * That's why this traversal is so complicated...
236 */
232 while (inode_node || vfsmount_node) { 237 while (inode_node || vfsmount_node) {
233 inode_group = vfsmount_group = NULL; 238 inode_group = NULL;
239 inode_mark = NULL;
240 vfsmount_group = NULL;
241 vfsmount_mark = NULL;
234 242
235 if (inode_node) { 243 if (inode_node) {
236 inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), 244 inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
@@ -244,21 +252,19 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 			vfsmount_group = vfsmount_mark->group;
 		}
 
-		if (inode_group > vfsmount_group) {
-			/* handle inode */
-			ret = send_to_group(to_tell, inode_mark, NULL, mask,
-					    data, data_is, cookie, file_name);
-			/* we didn't use the vfsmount_mark */
-			vfsmount_group = NULL;
-		} else if (vfsmount_group > inode_group) {
-			ret = send_to_group(to_tell, NULL, vfsmount_mark, mask,
-					    data, data_is, cookie, file_name);
-			inode_group = NULL;
-		} else {
-			ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
-					    mask, data, data_is, cookie,
-					    file_name);
-		}
+		if (inode_group && vfsmount_group) {
+			int cmp = fsnotify_compare_groups(inode_group,
+							  vfsmount_group);
+			if (cmp > 0) {
+				inode_group = NULL;
+				inode_mark = NULL;
+			} else if (cmp < 0) {
+				vfsmount_group = NULL;
+				vfsmount_mark = NULL;
+			}
+		}
+		ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask,
+				    data, data_is, cookie, file_name);
 
 		if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
 			goto out;
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 9c0898c4cfe1..3b68b0ae0a97 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -12,6 +12,10 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group);
 /* protects reads of inode and vfsmount marks list */
 extern struct srcu_struct fsnotify_mark_srcu;
 
+/* compare two groups for sorting of marks lists */
+extern int fsnotify_compare_groups(struct fsnotify_group *a,
+				   struct fsnotify_group *b);
+
 extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
 						__u32 mask);
 /* add a mark to an inode */
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 9ce062218de9..dfbf5447eea4 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -194,6 +194,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 {
 	struct fsnotify_mark *lmark, *last = NULL;
 	int ret = 0;
+	int cmp;
 
 	mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
 
@@ -219,11 +220,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 			goto out;
 		}
 
-		if (mark->group->priority < lmark->group->priority)
-			continue;
-
-		if ((mark->group->priority == lmark->group->priority) &&
-		    (mark->group < lmark->group))
+		cmp = fsnotify_compare_groups(lmark->group, mark->group);
+		if (cmp < 0)
 			continue;
 
 		hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
@@ -288,20 +286,25 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		spin_unlock(&inode->i_lock);
 
 		/* In case the dropping of a reference would nuke next_i. */
-		if ((&next_i->i_sb_list != list) &&
-		    atomic_read(&next_i->i_count)) {
+		while (&next_i->i_sb_list != list) {
 			spin_lock(&next_i->i_lock);
-			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
+			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
+			    atomic_read(&next_i->i_count)) {
 				__iget(next_i);
 				need_iput = next_i;
+				spin_unlock(&next_i->i_lock);
+				break;
 			}
 			spin_unlock(&next_i->i_lock);
+			next_i = list_entry(next_i->i_sb_list.next,
+					    struct inode, i_sb_list);
 		}
 
 		/*
-		 * We can safely drop inode_sb_list_lock here because we hold
-		 * references on both inode and next_i. Also no new inodes
-		 * will be added since the umount has begun.
+		 * We can safely drop inode_sb_list_lock here because either
+		 * we actually hold references on both inode and next_i or
+		 * end of list. Also no new inodes will be added since the
+		 * umount has begun.
 		 */
 		spin_unlock(&inode_sb_list_lock);
 
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d90deaa08e78..34c38fabf514 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -210,6 +210,42 @@ void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mas
 }
 
 /*
+ * Sorting function for lists of fsnotify marks.
+ *
+ * Fanotify supports different notification classes (reflected as priority of
+ * notification group). Events shall be passed to notification groups in
+ * decreasing priority order. To achieve this marks in notification lists for
+ * inodes and vfsmounts are sorted so that priorities of corresponding groups
+ * are descending.
+ *
+ * Furthermore correct handling of the ignore mask requires processing inode
+ * and vfsmount marks of each group together. Using the group address as
+ * further sort criterion provides a unique sorting order and thus we can
+ * merge inode and vfsmount lists of marks in linear time and find groups
+ * present in both lists.
+ *
+ * A return value of 1 signifies that b has priority over a.
+ * A return value of 0 signifies that the two marks have to be handled together.
+ * A return value of -1 signifies that a has priority over b.
+ */
+int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
+{
+	if (a == b)
+		return 0;
+	if (!a)
+		return 1;
+	if (!b)
+		return -1;
+	if (a->priority < b->priority)
+		return 1;
+	if (a->priority > b->priority)
+		return -1;
+	if (a < b)
+		return 1;
+	return -1;
+}
+
+/*
  * Attach an initialized mark to a given group and fs object.
  * These marks may be used for the fsnotify backend to determine which
  * event types should be delivered to which group.
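
The comparator above gives the mark lists a strict total order: priority first, then group address as a tie-breaker, with a NULL group sorting last. A minimal user-space sketch (illustrative only, not part of the patch; "struct group" stands in for struct fsnotify_group) of how a three-way comparator of this shape drives the linear merge that the new fsnotify() loop performs:

/* Editor's sketch: user-space model of the sorted-list merge done by
 * fsnotify(); "struct group" is a stand-in for struct fsnotify_group. */
#include <stdio.h>
#include <stddef.h>

struct group { int priority; };

/* Same contract as fsnotify_compare_groups(): 1 = b goes first,
 * -1 = a goes first, 0 = handle the two marks together. */
static int compare_groups(const struct group *a, const struct group *b)
{
	if (a == b)
		return 0;
	if (!a)
		return 1;
	if (!b)
		return -1;
	if (a->priority < b->priority)
		return 1;
	if (a->priority > b->priority)
		return -1;
	return a < b ? 1 : -1;
}

int main(void)
{
	struct group g1 = {2}, g2 = {1}, g3 = {1};
	/* Two lists already sorted by the comparator, as the mark lists are. */
	const struct group *inode_list[] = { &g1, &g2, NULL };
	const struct group *mount_list[] = { &g1, &g3, NULL };
	size_t i = 0, m = 0;

	while (inode_list[i] || mount_list[m]) {
		const struct group *ig = inode_list[i];
		const struct group *mg = mount_list[m];
		int cmp = compare_groups(ig, mg);

		if (cmp > 0)
			ig = NULL;	/* mount-side group is delivered first */
		else if (cmp < 0)
			mg = NULL;	/* inode-side group is delivered first */
		/* cmp == 0: same group on both lists, deliver together */
		printf("deliver: inode=%p mount=%p\n", (void *)ig, (void *)mg);
		if (ig)
			i++;
		if (mg)
			m++;
	}
	return 0;
}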
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index ac851e8376b1..faefa72a11eb 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -153,6 +153,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 	struct mount *m = real_mount(mnt);
 	struct fsnotify_mark *lmark, *last = NULL;
 	int ret = 0;
+	int cmp;
 
 	mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
 
@@ -178,11 +179,8 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 			goto out;
 		}
 
-		if (mark->group->priority < lmark->group->priority)
-			continue;
-
-		if ((mark->group->priority == lmark->group->priority) &&
-		    (mark->group < lmark->group))
+		cmp = fsnotify_compare_groups(lmark->group, mark->group);
+		if (cmp < 0)
 			continue;
 
 		hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 97de0fbd9f78..a96044004064 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -925,7 +925,7 @@ static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
 			       size_t veclen, size_t total)
 {
 	int ret;
-	struct msghdr msg;
+	struct msghdr msg = {.msg_flags = 0,};
 
 	if (sock == NULL) {
 		ret = -EINVAL;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8add6f1030d7..b931e04e3388 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -158,7 +158,7 @@ bail_add:
 	 * NOTE: This dentry already has ->d_op set from
 	 * ocfs2_get_parent() and ocfs2_get_dentry()
 	 */
-	if (ret)
+	if (!IS_ERR_OR_NULL(ret))
 		dentry = ret;
 
 	status = ocfs2_dentry_attach_lock(dentry, inode,
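
The ocfs2 change above matters because this lookup can hand back a valid dentry, NULL, or an ERR_PTR-encoded errno, and a bare "if (ret)" mistakes an error pointer for a usable dentry. A small user-space model of the ERR_PTR convention (illustrative only; the kernel packs errnos into the top 4095 addresses) showing why IS_ERR_OR_NULL() is the right guard:

/* Editor's sketch: user-space model of the kernel's ERR_PTR convention. */
#include <stdio.h>
#include <errno.h>
#include <stdint.h>
#include <stdbool.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline bool IS_ERR(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}
static inline bool IS_ERR_OR_NULL(const void *ptr)
{
	return !ptr || IS_ERR(ptr);
}

int main(void)
{
	void *ok = &(int){42};		/* a real object */
	void *err = ERR_PTR(-ENOMEM);	/* an encoded error */
	void *none = NULL;
	void *cases[] = { ok, err, none };

	for (int i = 0; i < 3; i++) {
		void *ret = cases[i];
		/* The buggy pattern: "if (ret)" is true for error pointers too. */
		printf("case %d: if(ret)=%d  IS_ERR_OR_NULL=%d\n",
		       i, ret != NULL, IS_ERR_OR_NULL(ret));
		if (!IS_ERR_OR_NULL(ret))
			printf("  -> safe to use as a dentry\n");
		else if (IS_ERR(ret))
			printf("  -> error %ld, must not dereference\n",
			       PTR_ERR(ret));
	}
	return 0;
}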
diff --git a/fs/open.c b/fs/open.c
index c94449b2e582..192c429f1fbc 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -824,8 +824,7 @@ struct file *dentry_open(const struct path *path, int flags,
 	f = get_empty_filp();
 	if (!IS_ERR(f)) {
 		f->f_flags = flags;
-		f->f_path = *path;
-		error = do_dentry_open(f, NULL, cred);
+		error = vfs_open(path, f, cred);
 		if (!error) {
 			/* from now on we need fput() to dispose of f */
 			error = open_check_o_direct(f);
@@ -842,6 +841,26 @@ struct file *dentry_open(const struct path *path, int flags,
 }
 EXPORT_SYMBOL(dentry_open);
 
+/**
+ * vfs_open - open the file at the given path
+ * @path: path to open
+ * @filp: newly allocated file with f_flag initialized
+ * @cred: credentials to use
+ */
+int vfs_open(const struct path *path, struct file *filp,
+	     const struct cred *cred)
+{
+	struct inode *inode = path->dentry->d_inode;
+
+	if (inode->i_op->dentry_open)
+		return inode->i_op->dentry_open(path->dentry, filp, cred);
+	else {
+		filp->f_path = *path;
+		return do_dentry_open(filp, NULL, cred);
+	}
+}
+EXPORT_SYMBOL(vfs_open);
+
 static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
 {
 	int lookup_flags = 0;
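
With vfs_open() in place, open becomes a dispatch point: a filesystem that provides i_op->dentry_open (as overlayfs does below) can substitute a different backing file for the path being opened, while everyone else falls through to do_dentry_open(). A user-space sketch of the same optional-hook-with-default pattern (illustrative only; all names here are invented for the example):

/* Editor's sketch: the optional-hook dispatch shape used by vfs_open(). */
#include <stdio.h>

struct file;
struct myinode_ops {
	/* Optional hook; NULL means "use the generic open path". */
	int (*dentry_open)(const char *name, struct file *filp);
};

static int generic_open(const char *name, struct file *filp)
{
	printf("generic open of %s\n", name);
	return 0;
}

static int layered_open(const char *name, struct file *filp)
{
	printf("redirecting open of %s to a backing file\n", name);
	return 0;
}

static int my_vfs_open(const struct myinode_ops *ops, const char *name,
		       struct file *filp)
{
	if (ops->dentry_open)
		return ops->dentry_open(name, filp);	/* filesystem hook wins */
	return generic_open(name, filp);		/* default path */
}

int main(void)
{
	struct myinode_ops plain = { .dentry_open = NULL };
	struct myinode_ops layered = { .dentry_open = layered_open };

	my_vfs_open(&plain, "/etc/hosts", NULL);
	my_vfs_open(&layered, "/merged/file", NULL);
	return 0;
}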
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
new file mode 100644
index 000000000000..e60125976873
--- /dev/null
+++ b/fs/overlayfs/Kconfig
@@ -0,0 +1,10 @@
1config OVERLAYFS_FS
2 tristate "Overlay filesystem support"
3 help
4 An overlay filesystem combines two filesystems - an 'upper' filesystem
5 and a 'lower' filesystem. When a name exists in both filesystems, the
6 object in the 'upper' filesystem is visible while the object in the
7 'lower' filesystem is either hidden or, in the case of directories,
8 merged with the 'upper' object.
9
10 For more information see Documentation/filesystems/overlayfs.txt
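
In practice an overlay mount needs a lowerdir, an upperdir and a workdir, the last two on the same filesystem (the workdir hosts the temporary-file-plus-rename tricks used by copy_up.c and dir.c below). A minimal mount(2) call, assuming the filesystem type registers as "overlay" and that the three directories already exist:

/* Editor's sketch: mounting an overlay from C (run as root). */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Paths are placeholders; workdir must be an empty directory on
	 * the same filesystem as upperdir. */
	const char *opts = "lowerdir=/lower,upperdir=/upper,workdir=/work";

	if (mount("overlay", "/merged", "overlay", 0, opts) != 0) {
		perror("mount");
		return 1;
	}
	printf("overlay mounted on /merged\n");
	return 0;
}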
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile
new file mode 100644
index 000000000000..8f91889480d0
--- /dev/null
+++ b/fs/overlayfs/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the overlay filesystem.
3#
4
5obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o
6
7overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
new file mode 100644
index 000000000000..ea10a8719107
--- /dev/null
+++ b/fs/overlayfs/copy_up.c
@@ -0,0 +1,414 @@
1/*
2 *
3 * Copyright (C) 2011 Novell Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published by
7 * the Free Software Foundation.
8 */
9
10#include <linux/fs.h>
11#include <linux/slab.h>
12#include <linux/file.h>
13#include <linux/splice.h>
14#include <linux/xattr.h>
15#include <linux/security.h>
16#include <linux/uaccess.h>
17#include <linux/sched.h>
18#include <linux/namei.h>
19#include "overlayfs.h"
20
21#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
22
23int ovl_copy_xattr(struct dentry *old, struct dentry *new)
24{
25 ssize_t list_size, size;
26 char *buf, *name, *value;
27 int error;
28
29 if (!old->d_inode->i_op->getxattr ||
30 !new->d_inode->i_op->getxattr)
31 return 0;
32
33 list_size = vfs_listxattr(old, NULL, 0);
34 if (list_size <= 0) {
35 if (list_size == -EOPNOTSUPP)
36 return 0;
37 return list_size;
38 }
39
40 buf = kzalloc(list_size, GFP_KERNEL);
41 if (!buf)
42 return -ENOMEM;
43
44 error = -ENOMEM;
45 value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
46 if (!value)
47 goto out;
48
49 list_size = vfs_listxattr(old, buf, list_size);
50 if (list_size <= 0) {
51 error = list_size;
52 goto out_free_value;
53 }
54
55 for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
56 size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
57 if (size <= 0) {
58 error = size;
59 goto out_free_value;
60 }
61 error = vfs_setxattr(new, name, value, size, 0);
62 if (error)
63 goto out_free_value;
64 }
65
66out_free_value:
67 kfree(value);
68out:
69 kfree(buf);
70 return error;
71}
72
73static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
74{
75 struct file *old_file;
76 struct file *new_file;
77 loff_t old_pos = 0;
78 loff_t new_pos = 0;
79 int error = 0;
80
81 if (len == 0)
82 return 0;
83
84 old_file = ovl_path_open(old, O_RDONLY);
85 if (IS_ERR(old_file))
86 return PTR_ERR(old_file);
87
88 new_file = ovl_path_open(new, O_WRONLY);
89 if (IS_ERR(new_file)) {
90 error = PTR_ERR(new_file);
91 goto out_fput;
92 }
93
94 /* FIXME: copy up sparse files efficiently */
95 while (len) {
96 size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
97 long bytes;
98
99 if (len < this_len)
100 this_len = len;
101
102 if (signal_pending_state(TASK_KILLABLE, current)) {
103 error = -EINTR;
104 break;
105 }
106
107 bytes = do_splice_direct(old_file, &old_pos,
108 new_file, &new_pos,
109 this_len, SPLICE_F_MOVE);
110 if (bytes <= 0) {
111 error = bytes;
112 break;
113 }
114 WARN_ON(old_pos != new_pos);
115
116 len -= bytes;
117 }
118
119 fput(new_file);
120out_fput:
121 fput(old_file);
122 return error;
123}
124
125static char *ovl_read_symlink(struct dentry *realdentry)
126{
127 int res;
128 char *buf;
129 struct inode *inode = realdentry->d_inode;
130 mm_segment_t old_fs;
131
132 res = -EINVAL;
133 if (!inode->i_op->readlink)
134 goto err;
135
136 res = -ENOMEM;
137 buf = (char *) __get_free_page(GFP_KERNEL);
138 if (!buf)
139 goto err;
140
141 old_fs = get_fs();
142 set_fs(get_ds());
143 /* The cast to a user pointer is valid due to the set_fs() */
144 res = inode->i_op->readlink(realdentry,
145 (char __user *)buf, PAGE_SIZE - 1);
146 set_fs(old_fs);
147 if (res < 0) {
148 free_page((unsigned long) buf);
149 goto err;
150 }
151 buf[res] = '\0';
152
153 return buf;
154
155err:
156 return ERR_PTR(res);
157}
158
159static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
160{
161 struct iattr attr = {
162 .ia_valid =
163 ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
164 .ia_atime = stat->atime,
165 .ia_mtime = stat->mtime,
166 };
167
168 return notify_change(upperdentry, &attr, NULL);
169}
170
171int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
172{
173 int err = 0;
174
175 if (!S_ISLNK(stat->mode)) {
176 struct iattr attr = {
177 .ia_valid = ATTR_MODE,
178 .ia_mode = stat->mode,
179 };
180 err = notify_change(upperdentry, &attr, NULL);
181 }
182 if (!err) {
183 struct iattr attr = {
184 .ia_valid = ATTR_UID | ATTR_GID,
185 .ia_uid = stat->uid,
186 .ia_gid = stat->gid,
187 };
188 err = notify_change(upperdentry, &attr, NULL);
189 }
190 if (!err)
191 ovl_set_timestamps(upperdentry, stat);
192
193 return err;
194
195}
196
197static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
198 struct dentry *dentry, struct path *lowerpath,
199 struct kstat *stat, struct iattr *attr,
200 const char *link)
201{
202 struct inode *wdir = workdir->d_inode;
203 struct inode *udir = upperdir->d_inode;
204 struct dentry *newdentry = NULL;
205 struct dentry *upper = NULL;
206 umode_t mode = stat->mode;
207 int err;
208
209 newdentry = ovl_lookup_temp(workdir, dentry);
210 err = PTR_ERR(newdentry);
211 if (IS_ERR(newdentry))
212 goto out;
213
214 upper = lookup_one_len(dentry->d_name.name, upperdir,
215 dentry->d_name.len);
216 err = PTR_ERR(upper);
217 if (IS_ERR(upper))
218 goto out1;
219
220 /* Can't properly set mode on creation because of the umask */
221 stat->mode &= S_IFMT;
222 err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
223 stat->mode = mode;
224 if (err)
225 goto out2;
226
227 if (S_ISREG(stat->mode)) {
228 struct path upperpath;
229 ovl_path_upper(dentry, &upperpath);
230 BUG_ON(upperpath.dentry != NULL);
231 upperpath.dentry = newdentry;
232
233 err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
234 if (err)
235 goto out_cleanup;
236 }
237
238 err = ovl_copy_xattr(lowerpath->dentry, newdentry);
239 if (err)
240 goto out_cleanup;
241
242 mutex_lock(&newdentry->d_inode->i_mutex);
243 err = ovl_set_attr(newdentry, stat);
244 if (!err && attr)
245 err = notify_change(newdentry, attr, NULL);
246 mutex_unlock(&newdentry->d_inode->i_mutex);
247 if (err)
248 goto out_cleanup;
249
250 err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
251 if (err)
252 goto out_cleanup;
253
254 ovl_dentry_update(dentry, newdentry);
255 newdentry = NULL;
256
257 /*
258 * Non-directories become opaque when copied up.
259 */
260 if (!S_ISDIR(stat->mode))
261 ovl_dentry_set_opaque(dentry, true);
262out2:
263 dput(upper);
264out1:
265 dput(newdentry);
266out:
267 return err;
268
269out_cleanup:
270 ovl_cleanup(wdir, newdentry);
271 goto out;
272}
273
274/*
275 * Copy up a single dentry
276 *
277 * Directory renames are only allowed on "pure upper" (already created on
278 * upper filesystem, never copied up). Directories which are on lower or
279 * are merged may not be renamed. For these -EXDEV is returned and
280 * userspace has to deal with it. This means, when copying up a
281 * directory we can rely on it and ancestors being stable.
282 *
283 * Non-directory renames start with copy up of source if necessary. The
284 * actual rename will only proceed once the copy up was successful. Copy
285 * up uses upper parent i_mutex for exclusion. Since rename can change
286 * d_parent it is possible that the copy up will lock the old parent. At
287 * that point the file will have already been copied up anyway.
288 */
289int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
290 struct path *lowerpath, struct kstat *stat,
291 struct iattr *attr)
292{
293 struct dentry *workdir = ovl_workdir(dentry);
294 int err;
295 struct kstat pstat;
296 struct path parentpath;
297 struct dentry *upperdir;
298 struct dentry *upperdentry;
299 const struct cred *old_cred;
300 struct cred *override_cred;
301 char *link = NULL;
302
303 ovl_path_upper(parent, &parentpath);
304 upperdir = parentpath.dentry;
305
306 err = vfs_getattr(&parentpath, &pstat);
307 if (err)
308 return err;
309
310 if (S_ISLNK(stat->mode)) {
311 link = ovl_read_symlink(lowerpath->dentry);
312 if (IS_ERR(link))
313 return PTR_ERR(link);
314 }
315
316 err = -ENOMEM;
317 override_cred = prepare_creds();
318 if (!override_cred)
319 goto out_free_link;
320
321 override_cred->fsuid = stat->uid;
322 override_cred->fsgid = stat->gid;
323 /*
324 * CAP_SYS_ADMIN for copying up extended attributes
325 * CAP_DAC_OVERRIDE for create
326 * CAP_FOWNER for chmod, timestamp update
327 * CAP_FSETID for chmod
328 * CAP_CHOWN for chown
329 * CAP_MKNOD for mknod
330 */
331 cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
332 cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
333 cap_raise(override_cred->cap_effective, CAP_FOWNER);
334 cap_raise(override_cred->cap_effective, CAP_FSETID);
335 cap_raise(override_cred->cap_effective, CAP_CHOWN);
336 cap_raise(override_cred->cap_effective, CAP_MKNOD);
337 old_cred = override_creds(override_cred);
338
339 err = -EIO;
340 if (lock_rename(workdir, upperdir) != NULL) {
341 pr_err("overlayfs: failed to lock workdir+upperdir\n");
342 goto out_unlock;
343 }
344 upperdentry = ovl_dentry_upper(dentry);
345 if (upperdentry) {
346 unlock_rename(workdir, upperdir);
347 err = 0;
348 /* Raced with another copy-up? Do the setattr here */
349 if (attr) {
350 mutex_lock(&upperdentry->d_inode->i_mutex);
351 err = notify_change(upperdentry, attr, NULL);
352 mutex_unlock(&upperdentry->d_inode->i_mutex);
353 }
354 goto out_put_cred;
355 }
356
357 err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
358 stat, attr, link);
359 if (!err) {
360 /* Restore timestamps on parent (best effort) */
361 ovl_set_timestamps(upperdir, &pstat);
362 }
363out_unlock:
364 unlock_rename(workdir, upperdir);
365out_put_cred:
366 revert_creds(old_cred);
367 put_cred(override_cred);
368
369out_free_link:
370 if (link)
371 free_page((unsigned long) link);
372
373 return err;
374}
375
376int ovl_copy_up(struct dentry *dentry)
377{
378 int err;
379
380 err = 0;
381 while (!err) {
382 struct dentry *next;
383 struct dentry *parent;
384 struct path lowerpath;
385 struct kstat stat;
386 enum ovl_path_type type = ovl_path_type(dentry);
387
388 if (type != OVL_PATH_LOWER)
389 break;
390
391 next = dget(dentry);
392 /* find the topmost dentry not yet copied up */
393 for (;;) {
394 parent = dget_parent(next);
395
396 type = ovl_path_type(parent);
397 if (type != OVL_PATH_LOWER)
398 break;
399
400 dput(next);
401 next = parent;
402 }
403
404 ovl_path_lower(next, &lowerpath);
405 err = vfs_getattr(&lowerpath, &stat);
406 if (!err)
407 err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
408
409 dput(parent);
410 dput(next);
411 }
412
413 return err;
414}
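
ovl_copy_up_data() above streams the lower file in 1 MiB chunks so that copying up a large file stays interruptible between chunks. A self-contained user-space analogue using read(2)/write(2) in the same chunked pattern (illustrative only; do_splice_direct() has no direct user-space twin, sendfile(2) or copy_file_range(2) being the nearest relatives):

/* Editor's sketch: chunked file copy in the style of ovl_copy_up_data(). */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>

#define COPY_CHUNK_SIZE (1 << 20)	/* 1 MiB, as in OVL_COPY_UP_CHUNK_SIZE */

static int copy_data(int in_fd, int out_fd, off_t len)
{
	char *buf = malloc(COPY_CHUNK_SIZE);

	if (!buf)
		return -1;
	while (len > 0) {
		size_t this_len = len < COPY_CHUNK_SIZE ? (size_t)len
							: COPY_CHUNK_SIZE;
		ssize_t rd = read(in_fd, buf, this_len);

		if (rd <= 0)	/* error or unexpected EOF */
			goto fail;
		for (ssize_t off = 0; off < rd; ) {
			ssize_t wr = write(out_fd, buf + off, rd - off);

			if (wr < 0)
				goto fail;
			off += wr;
		}
		len -= rd;	/* a signal check could go here, as in the kernel loop */
	}
	free(buf);
	return 0;
fail:
	free(buf);
	return -1;
}

int main(int argc, char **argv)
{
	if (argc != 3)
		return fprintf(stderr, "usage: %s src dst\n", argv[0]), 1;

	int in_fd = open(argv[1], O_RDONLY);
	int out_fd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	off_t len = in_fd >= 0 ? lseek(in_fd, 0, SEEK_END) : -1;

	if (in_fd < 0 || out_fd < 0 || len < 0 ||
	    lseek(in_fd, 0, SEEK_SET) < 0)
		return perror("setup"), 1;
	if (copy_data(in_fd, out_fd, len))
		return perror("copy"), 1;
	return 0;
}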
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
new file mode 100644
index 000000000000..15cd91ad9940
--- /dev/null
+++ b/fs/overlayfs/dir.c
@@ -0,0 +1,921 @@
1/*
2 *
3 * Copyright (C) 2011 Novell Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published by
7 * the Free Software Foundation.
8 */
9
10#include <linux/fs.h>
11#include <linux/namei.h>
12#include <linux/xattr.h>
13#include <linux/security.h>
14#include <linux/cred.h>
15#include "overlayfs.h"
16
17void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
18{
19 int err;
20
21 dget(wdentry);
22 if (S_ISDIR(wdentry->d_inode->i_mode))
23 err = ovl_do_rmdir(wdir, wdentry);
24 else
25 err = ovl_do_unlink(wdir, wdentry);
26 dput(wdentry);
27
28 if (err) {
29 pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
30 wdentry, err);
31 }
32}
33
34struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
35{
36 struct dentry *temp;
37 char name[20];
38
39 snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
40
41 temp = lookup_one_len(name, workdir, strlen(name));
42 if (!IS_ERR(temp) && temp->d_inode) {
43 pr_err("overlayfs: workdir/%s already exists\n", name);
44 dput(temp);
45 temp = ERR_PTR(-EIO);
46 }
47
48 return temp;
49}
50
51/* caller holds i_mutex on workdir */
52static struct dentry *ovl_whiteout(struct dentry *workdir,
53 struct dentry *dentry)
54{
55 int err;
56 struct dentry *whiteout;
57 struct inode *wdir = workdir->d_inode;
58
59 whiteout = ovl_lookup_temp(workdir, dentry);
60 if (IS_ERR(whiteout))
61 return whiteout;
62
63 err = ovl_do_whiteout(wdir, whiteout);
64 if (err) {
65 dput(whiteout);
66 whiteout = ERR_PTR(err);
67 }
68
69 return whiteout;
70}
71
72int ovl_create_real(struct inode *dir, struct dentry *newdentry,
73 struct kstat *stat, const char *link,
74 struct dentry *hardlink, bool debug)
75{
76 int err;
77
78 if (newdentry->d_inode)
79 return -ESTALE;
80
81 if (hardlink) {
82 err = ovl_do_link(hardlink, dir, newdentry, debug);
83 } else {
84 switch (stat->mode & S_IFMT) {
85 case S_IFREG:
86 err = ovl_do_create(dir, newdentry, stat->mode, debug);
87 break;
88
89 case S_IFDIR:
90 err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
91 break;
92
93 case S_IFCHR:
94 case S_IFBLK:
95 case S_IFIFO:
96 case S_IFSOCK:
97 err = ovl_do_mknod(dir, newdentry,
98 stat->mode, stat->rdev, debug);
99 break;
100
101 case S_IFLNK:
102 err = ovl_do_symlink(dir, newdentry, link, debug);
103 break;
104
105 default:
106 err = -EPERM;
107 }
108 }
109 if (!err && WARN_ON(!newdentry->d_inode)) {
110 /*
111 * Not quite sure if non-instantiated dentry is legal or not.
112 * VFS doesn't seem to care so check and warn here.
113 */
114 err = -ENOENT;
115 }
116 return err;
117}
118
119static int ovl_set_opaque(struct dentry *upperdentry)
120{
121 return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0);
122}
123
124static void ovl_remove_opaque(struct dentry *upperdentry)
125{
126 int err;
127
128 err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr);
129 if (err) {
130 pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
131 upperdentry->d_name.name, err);
132 }
133}
134
135static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
136 struct kstat *stat)
137{
138 int err;
139 enum ovl_path_type type;
140 struct path realpath;
141
142 type = ovl_path_real(dentry, &realpath);
143 err = vfs_getattr(&realpath, stat);
144 if (err)
145 return err;
146
147 stat->dev = dentry->d_sb->s_dev;
148 stat->ino = dentry->d_inode->i_ino;
149
150 /*
151 * It's probably not worth it to count subdirs to get the
152 * correct link count. nlink=1 seems to pacify 'find' and
153 * other utilities.
154 */
155 if (type == OVL_PATH_MERGE)
156 stat->nlink = 1;
157
158 return 0;
159}
160
161static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
162 struct kstat *stat, const char *link,
163 struct dentry *hardlink)
164{
165 struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
166 struct inode *udir = upperdir->d_inode;
167 struct dentry *newdentry;
168 int err;
169
170 mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
171 newdentry = lookup_one_len(dentry->d_name.name, upperdir,
172 dentry->d_name.len);
173 err = PTR_ERR(newdentry);
174 if (IS_ERR(newdentry))
175 goto out_unlock;
176 err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
177 if (err)
178 goto out_dput;
179
180 ovl_dentry_version_inc(dentry->d_parent);
181 ovl_dentry_update(dentry, newdentry);
182 ovl_copyattr(newdentry->d_inode, inode);
183 d_instantiate(dentry, inode);
184 newdentry = NULL;
185out_dput:
186 dput(newdentry);
187out_unlock:
188 mutex_unlock(&udir->i_mutex);
189 return err;
190}
191
192static int ovl_lock_rename_workdir(struct dentry *workdir,
193 struct dentry *upperdir)
194{
195 /* Workdir should not be the same as upperdir */
196 if (workdir == upperdir)
197 goto err;
198
199 /* Workdir should not be subdir of upperdir and vice versa */
200 if (lock_rename(workdir, upperdir) != NULL)
201 goto err_unlock;
202
203 return 0;
204
205err_unlock:
206 unlock_rename(workdir, upperdir);
207err:
208 pr_err("overlayfs: failed to lock workdir+upperdir\n");
209 return -EIO;
210}
211
212static struct dentry *ovl_clear_empty(struct dentry *dentry,
213 struct list_head *list)
214{
215 struct dentry *workdir = ovl_workdir(dentry);
216 struct inode *wdir = workdir->d_inode;
217 struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
218 struct inode *udir = upperdir->d_inode;
219 struct path upperpath;
220 struct dentry *upper;
221 struct dentry *opaquedir;
222 struct kstat stat;
223 int err;
224
225 err = ovl_lock_rename_workdir(workdir, upperdir);
226 if (err)
227 goto out;
228
229 ovl_path_upper(dentry, &upperpath);
230 err = vfs_getattr(&upperpath, &stat);
231 if (err)
232 goto out_unlock;
233
234 err = -ESTALE;
235 if (!S_ISDIR(stat.mode))
236 goto out_unlock;
237 upper = upperpath.dentry;
238 if (upper->d_parent->d_inode != udir)
239 goto out_unlock;
240
241 opaquedir = ovl_lookup_temp(workdir, dentry);
242 err = PTR_ERR(opaquedir);
243 if (IS_ERR(opaquedir))
244 goto out_unlock;
245
246 err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
247 if (err)
248 goto out_dput;
249
250 err = ovl_copy_xattr(upper, opaquedir);
251 if (err)
252 goto out_cleanup;
253
254 err = ovl_set_opaque(opaquedir);
255 if (err)
256 goto out_cleanup;
257
258 mutex_lock(&opaquedir->d_inode->i_mutex);
259 err = ovl_set_attr(opaquedir, &stat);
260 mutex_unlock(&opaquedir->d_inode->i_mutex);
261 if (err)
262 goto out_cleanup;
263
264 err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
265 if (err)
266 goto out_cleanup;
267
268 ovl_cleanup_whiteouts(upper, list);
269 ovl_cleanup(wdir, upper);
270 unlock_rename(workdir, upperdir);
271
272 /* dentry's upper doesn't match now, get rid of it */
273 d_drop(dentry);
274
275 return opaquedir;
276
277out_cleanup:
278 ovl_cleanup(wdir, opaquedir);
279out_dput:
280 dput(opaquedir);
281out_unlock:
282 unlock_rename(workdir, upperdir);
283out:
284 return ERR_PTR(err);
285}
286
287static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry,
288 enum ovl_path_type type)
289{
290 int err;
291 struct dentry *ret = NULL;
292 LIST_HEAD(list);
293
294 err = ovl_check_empty_dir(dentry, &list);
295 if (err)
296 ret = ERR_PTR(err);
297 else if (type == OVL_PATH_MERGE)
298 ret = ovl_clear_empty(dentry, &list);
299
300 ovl_cache_free(&list);
301
302 return ret;
303}
304
305static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
306 struct kstat *stat, const char *link,
307 struct dentry *hardlink)
308{
309 struct dentry *workdir = ovl_workdir(dentry);
310 struct inode *wdir = workdir->d_inode;
311 struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
312 struct inode *udir = upperdir->d_inode;
313 struct dentry *upper;
314 struct dentry *newdentry;
315 int err;
316
317 err = ovl_lock_rename_workdir(workdir, upperdir);
318 if (err)
319 goto out;
320
321 newdentry = ovl_lookup_temp(workdir, dentry);
322 err = PTR_ERR(newdentry);
323 if (IS_ERR(newdentry))
324 goto out_unlock;
325
326 upper = lookup_one_len(dentry->d_name.name, upperdir,
327 dentry->d_name.len);
328 err = PTR_ERR(upper);
329 if (IS_ERR(upper))
330 goto out_dput;
331
332 err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
333 if (err)
334 goto out_dput2;
335
336 if (S_ISDIR(stat->mode)) {
337 err = ovl_set_opaque(newdentry);
338 if (err)
339 goto out_cleanup;
340
341 err = ovl_do_rename(wdir, newdentry, udir, upper,
342 RENAME_EXCHANGE);
343 if (err)
344 goto out_cleanup;
345
346 ovl_cleanup(wdir, upper);
347 } else {
348 err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
349 if (err)
350 goto out_cleanup;
351 }
352 ovl_dentry_version_inc(dentry->d_parent);
353 ovl_dentry_update(dentry, newdentry);
354 ovl_copyattr(newdentry->d_inode, inode);
355 d_instantiate(dentry, inode);
356 newdentry = NULL;
357out_dput2:
358 dput(upper);
359out_dput:
360 dput(newdentry);
361out_unlock:
362 unlock_rename(workdir, upperdir);
363out:
364 return err;
365
366out_cleanup:
367 ovl_cleanup(wdir, newdentry);
368 goto out_dput2;
369}
370
371static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
372 const char *link, struct dentry *hardlink)
373{
374 int err;
375 struct inode *inode;
376 struct kstat stat = {
377 .mode = mode,
378 .rdev = rdev,
379 };
380
381 err = -ENOMEM;
382 inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
383 if (!inode)
384 goto out;
385
386 err = ovl_copy_up(dentry->d_parent);
387 if (err)
388 goto out_iput;
389
390 if (!ovl_dentry_is_opaque(dentry)) {
391 err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
392 } else {
393 const struct cred *old_cred;
394 struct cred *override_cred;
395
396 err = -ENOMEM;
397 override_cred = prepare_creds();
398 if (!override_cred)
399 goto out_iput;
400
401 /*
402 * CAP_SYS_ADMIN for setting opaque xattr
403 * CAP_DAC_OVERRIDE for create in workdir, rename
404 * CAP_FOWNER for removing whiteout from sticky dir
405 */
406 cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
407 cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
408 cap_raise(override_cred->cap_effective, CAP_FOWNER);
409 old_cred = override_creds(override_cred);
410
411 err = ovl_create_over_whiteout(dentry, inode, &stat, link,
412 hardlink);
413
414 revert_creds(old_cred);
415 put_cred(override_cred);
416 }
417
418 if (!err)
419 inode = NULL;
420out_iput:
421 iput(inode);
422out:
423 return err;
424}
425
426static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
427 const char *link)
428{
429 int err;
430
431 err = ovl_want_write(dentry);
432 if (!err) {
433 err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
434 ovl_drop_write(dentry);
435 }
436
437 return err;
438}
439
440static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
441 bool excl)
442{
443 return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
444}
445
446static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
447{
448 return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
449}
450
451static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
452 dev_t rdev)
453{
454 /* Don't allow creation of "whiteout" on overlay */
455 if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
456 return -EPERM;
457
458 return ovl_create_object(dentry, mode, rdev, NULL);
459}
460
461static int ovl_symlink(struct inode *dir, struct dentry *dentry,
462 const char *link)
463{
464 return ovl_create_object(dentry, S_IFLNK, 0, link);
465}
466
467static int ovl_link(struct dentry *old, struct inode *newdir,
468 struct dentry *new)
469{
470 int err;
471 struct dentry *upper;
472
473 err = ovl_want_write(old);
474 if (err)
475 goto out;
476
477 err = ovl_copy_up(old);
478 if (err)
479 goto out_drop_write;
480
481 upper = ovl_dentry_upper(old);
482 err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
483
484out_drop_write:
485 ovl_drop_write(old);
486out:
487 return err;
488}
489
490static int ovl_remove_and_whiteout(struct dentry *dentry,
491 enum ovl_path_type type, bool is_dir)
492{
493 struct dentry *workdir = ovl_workdir(dentry);
494 struct inode *wdir = workdir->d_inode;
495 struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
496 struct inode *udir = upperdir->d_inode;
497 struct dentry *whiteout;
498 struct dentry *upper;
499 struct dentry *opaquedir = NULL;
500 int err;
501
502 if (is_dir) {
503 opaquedir = ovl_check_empty_and_clear(dentry, type);
504 err = PTR_ERR(opaquedir);
505 if (IS_ERR(opaquedir))
506 goto out;
507 }
508
509 err = ovl_lock_rename_workdir(workdir, upperdir);
510 if (err)
511 goto out_dput;
512
513 whiteout = ovl_whiteout(workdir, dentry);
514 err = PTR_ERR(whiteout);
515 if (IS_ERR(whiteout))
516 goto out_unlock;
517
518 if (type == OVL_PATH_LOWER) {
519 upper = lookup_one_len(dentry->d_name.name, upperdir,
520 dentry->d_name.len);
521 err = PTR_ERR(upper);
522 if (IS_ERR(upper))
523 goto kill_whiteout;
524
525 err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
526 dput(upper);
527 if (err)
528 goto kill_whiteout;
529 } else {
530 int flags = 0;
531
532 upper = ovl_dentry_upper(dentry);
533 if (opaquedir)
534 upper = opaquedir;
535 err = -ESTALE;
536 if (upper->d_parent != upperdir)
537 goto kill_whiteout;
538
539 if (is_dir)
540 flags |= RENAME_EXCHANGE;
541
542 err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
543 if (err)
544 goto kill_whiteout;
545
546 if (is_dir)
547 ovl_cleanup(wdir, upper);
548 }
549 ovl_dentry_version_inc(dentry->d_parent);
550out_d_drop:
551 d_drop(dentry);
552 dput(whiteout);
553out_unlock:
554 unlock_rename(workdir, upperdir);
555out_dput:
556 dput(opaquedir);
557out:
558 return err;
559
560kill_whiteout:
561 ovl_cleanup(wdir, whiteout);
562 goto out_d_drop;
563}
564
565static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
566{
567 struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
568 struct inode *dir = upperdir->d_inode;
569 struct dentry *upper = ovl_dentry_upper(dentry);
570 int err;
571
572 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
573 err = -ESTALE;
574 if (upper->d_parent == upperdir) {
575 /* Don't let d_delete() think it can reset d_inode */
576 dget(upper);
577 if (is_dir)
578 err = vfs_rmdir(dir, upper);
579 else
580 err = vfs_unlink(dir, upper, NULL);
581 dput(upper);
582 ovl_dentry_version_inc(dentry->d_parent);
583 }
584
585 /*
586 * Keeping this dentry hashed would mean having to release
587 * upperpath/lowerpath, which could only be done if we are the
588 * sole user of this dentry. Too tricky... Just unhash for
589 * now.
590 */
591 d_drop(dentry);
592 mutex_unlock(&dir->i_mutex);
593
594 return err;
595}
596
597static inline int ovl_check_sticky(struct dentry *dentry)
598{
599 struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
600 struct inode *inode = ovl_dentry_real(dentry)->d_inode;
601
602 if (check_sticky(dir, inode))
603 return -EPERM;
604
605 return 0;
606}
607
608static int ovl_do_remove(struct dentry *dentry, bool is_dir)
609{
610 enum ovl_path_type type;
611 int err;
612
613 err = ovl_check_sticky(dentry);
614 if (err)
615 goto out;
616
617 err = ovl_want_write(dentry);
618 if (err)
619 goto out;
620
621 err = ovl_copy_up(dentry->d_parent);
622 if (err)
623 goto out_drop_write;
624
625 type = ovl_path_type(dentry);
626 if (type == OVL_PATH_PURE_UPPER) {
627 err = ovl_remove_upper(dentry, is_dir);
628 } else {
629 const struct cred *old_cred;
630 struct cred *override_cred;
631
632 err = -ENOMEM;
633 override_cred = prepare_creds();
634 if (!override_cred)
635 goto out_drop_write;
636
637 /*
638 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
639 * CAP_DAC_OVERRIDE for create in workdir, rename
640 * CAP_FOWNER for removing whiteout from sticky dir
641 * CAP_FSETID for chmod of opaque dir
642 * CAP_CHOWN for chown of opaque dir
643 */
644 cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
645 cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
646 cap_raise(override_cred->cap_effective, CAP_FOWNER);
647 cap_raise(override_cred->cap_effective, CAP_FSETID);
648 cap_raise(override_cred->cap_effective, CAP_CHOWN);
649 old_cred = override_creds(override_cred);
650
651 err = ovl_remove_and_whiteout(dentry, type, is_dir);
652
653 revert_creds(old_cred);
654 put_cred(override_cred);
655 }
656out_drop_write:
657 ovl_drop_write(dentry);
658out:
659 return err;
660}
661
662static int ovl_unlink(struct inode *dir, struct dentry *dentry)
663{
664 return ovl_do_remove(dentry, false);
665}
666
667static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
668{
669 return ovl_do_remove(dentry, true);
670}
671
672static int ovl_rename2(struct inode *olddir, struct dentry *old,
673 struct inode *newdir, struct dentry *new,
674 unsigned int flags)
675{
676 int err;
677 enum ovl_path_type old_type;
678 enum ovl_path_type new_type;
679 struct dentry *old_upperdir;
680 struct dentry *new_upperdir;
681 struct dentry *olddentry;
682 struct dentry *newdentry;
683 struct dentry *trap;
684 bool old_opaque;
685 bool new_opaque;
686 bool new_create = false;
687 bool cleanup_whiteout = false;
688 bool overwrite = !(flags & RENAME_EXCHANGE);
689 bool is_dir = S_ISDIR(old->d_inode->i_mode);
690 bool new_is_dir = false;
691 struct dentry *opaquedir = NULL;
692 const struct cred *old_cred = NULL;
693 struct cred *override_cred = NULL;
694
695 err = -EINVAL;
696 if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
697 goto out;
698
699 flags &= ~RENAME_NOREPLACE;
700
701 err = ovl_check_sticky(old);
702 if (err)
703 goto out;
704
705 /* Don't copy up directory trees */
706 old_type = ovl_path_type(old);
707 err = -EXDEV;
708 if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir)
709 goto out;
710
711 if (new->d_inode) {
712 err = ovl_check_sticky(new);
713 if (err)
714 goto out;
715
716 if (S_ISDIR(new->d_inode->i_mode))
717 new_is_dir = true;
718
719 new_type = ovl_path_type(new);
720 err = -EXDEV;
721 if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir)
722 goto out;
723
724 err = 0;
725 if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) {
726 if (ovl_dentry_lower(old)->d_inode ==
727 ovl_dentry_lower(new)->d_inode)
728 goto out;
729 }
730 if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) {
731 if (ovl_dentry_upper(old)->d_inode ==
732 ovl_dentry_upper(new)->d_inode)
733 goto out;
734 }
735 } else {
736 if (ovl_dentry_is_opaque(new))
737 new_type = OVL_PATH_UPPER;
738 else
739 new_type = OVL_PATH_PURE_UPPER;
740 }
741
742 err = ovl_want_write(old);
743 if (err)
744 goto out;
745
746 err = ovl_copy_up(old);
747 if (err)
748 goto out_drop_write;
749
750 err = ovl_copy_up(new->d_parent);
751 if (err)
752 goto out_drop_write;
753 if (!overwrite) {
754 err = ovl_copy_up(new);
755 if (err)
756 goto out_drop_write;
757 }
758
759 old_opaque = old_type != OVL_PATH_PURE_UPPER;
760 new_opaque = new_type != OVL_PATH_PURE_UPPER;
761
762 if (old_opaque || new_opaque) {
763 err = -ENOMEM;
764 override_cred = prepare_creds();
765 if (!override_cred)
766 goto out_drop_write;
767
768 /*
769 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
770 * CAP_DAC_OVERRIDE for create in workdir
771 * CAP_FOWNER for removing whiteout from sticky dir
772 * CAP_FSETID for chmod of opaque dir
773 * CAP_CHOWN for chown of opaque dir
774 */
775 cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
776 cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
777 cap_raise(override_cred->cap_effective, CAP_FOWNER);
778 cap_raise(override_cred->cap_effective, CAP_FSETID);
779 cap_raise(override_cred->cap_effective, CAP_CHOWN);
780 old_cred = override_creds(override_cred);
781 }
782
783 if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) {
784 opaquedir = ovl_check_empty_and_clear(new, new_type);
785 err = PTR_ERR(opaquedir);
786 if (IS_ERR(opaquedir)) {
787 opaquedir = NULL;
788 goto out_revert_creds;
789 }
790 }
791
792 if (overwrite) {
793 if (old_opaque) {
794 if (new->d_inode || !new_opaque) {
795 /* Whiteout source */
796 flags |= RENAME_WHITEOUT;
797 } else {
798 /* Switch whiteouts */
799 flags |= RENAME_EXCHANGE;
800 }
801 } else if (is_dir && !new->d_inode && new_opaque) {
802 flags |= RENAME_EXCHANGE;
803 cleanup_whiteout = true;
804 }
805 }
806
807 old_upperdir = ovl_dentry_upper(old->d_parent);
808 new_upperdir = ovl_dentry_upper(new->d_parent);
809
810 trap = lock_rename(new_upperdir, old_upperdir);
811
812 olddentry = ovl_dentry_upper(old);
813 newdentry = ovl_dentry_upper(new);
814 if (newdentry) {
815 if (opaquedir) {
816 newdentry = opaquedir;
817 opaquedir = NULL;
818 } else {
819 dget(newdentry);
820 }
821 } else {
822 new_create = true;
823 newdentry = lookup_one_len(new->d_name.name, new_upperdir,
824 new->d_name.len);
825 err = PTR_ERR(newdentry);
826 if (IS_ERR(newdentry))
827 goto out_unlock;
828 }
829
830 err = -ESTALE;
831 if (olddentry->d_parent != old_upperdir)
832 goto out_dput;
833 if (newdentry->d_parent != new_upperdir)
834 goto out_dput;
835 if (olddentry == trap)
836 goto out_dput;
837 if (newdentry == trap)
838 goto out_dput;
839
840 if (is_dir && !old_opaque && new_opaque) {
841 err = ovl_set_opaque(olddentry);
842 if (err)
843 goto out_dput;
844 }
845 if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
846 err = ovl_set_opaque(newdentry);
847 if (err)
848 goto out_dput;
849 }
850
851 if (old_opaque || new_opaque) {
852 err = ovl_do_rename(old_upperdir->d_inode, olddentry,
853 new_upperdir->d_inode, newdentry,
854 flags);
855 } else {
856 /* No debug for the plain case */
857 BUG_ON(flags & ~RENAME_EXCHANGE);
858 err = vfs_rename(old_upperdir->d_inode, olddentry,
859 new_upperdir->d_inode, newdentry,
860 NULL, flags);
861 }
862
863 if (err) {
864 if (is_dir && !old_opaque && new_opaque)
865 ovl_remove_opaque(olddentry);
866 if (!overwrite && new_is_dir && old_opaque && !new_opaque)
867 ovl_remove_opaque(newdentry);
868 goto out_dput;
869 }
870
871 if (is_dir && old_opaque && !new_opaque)
872 ovl_remove_opaque(olddentry);
873 if (!overwrite && new_is_dir && !old_opaque && new_opaque)
874 ovl_remove_opaque(newdentry);
875
876 if (old_opaque != new_opaque) {
877 ovl_dentry_set_opaque(old, new_opaque);
878 if (!overwrite)
879 ovl_dentry_set_opaque(new, old_opaque);
880 }
881
882 if (cleanup_whiteout)
883 ovl_cleanup(old_upperdir->d_inode, newdentry);
884
885 ovl_dentry_version_inc(old->d_parent);
886 ovl_dentry_version_inc(new->d_parent);
887
888out_dput:
889 dput(newdentry);
890out_unlock:
891 unlock_rename(new_upperdir, old_upperdir);
892out_revert_creds:
893 if (old_opaque || new_opaque) {
894 revert_creds(old_cred);
895 put_cred(override_cred);
896 }
897out_drop_write:
898 ovl_drop_write(old);
899out:
900 dput(opaquedir);
901 return err;
902}
903
904const struct inode_operations ovl_dir_inode_operations = {
905 .lookup = ovl_lookup,
906 .mkdir = ovl_mkdir,
907 .symlink = ovl_symlink,
908 .unlink = ovl_unlink,
909 .rmdir = ovl_rmdir,
910 .rename2 = ovl_rename2,
911 .link = ovl_link,
912 .setattr = ovl_setattr,
913 .create = ovl_create,
914 .mknod = ovl_mknod,
915 .permission = ovl_permission,
916 .getattr = ovl_dir_getattr,
917 .setxattr = ovl_setxattr,
918 .getxattr = ovl_getxattr,
919 .listxattr = ovl_listxattr,
920 .removexattr = ovl_removexattr,
921};
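
A recurring device in dir.c above is the whiteout: deleting a name that exists in the lower layer is implemented by renaming a whiteout node over it in the upper layer (ovl_do_whiteout(), and the WHITEOUT_DEV guard in ovl_mknod()). A user-space check for such an entry, assuming the usual convention that WHITEOUT_DEV is the character device 0/0:

/* Editor's sketch: spotting an overlayfs whiteout in the upper layer. */
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

static int is_whiteout(const char *upper_path)
{
	struct stat st;

	if (lstat(upper_path, &st) != 0)
		return 0;
	/* Whiteouts are char devices 0/0 (WHITEOUT_DEV in the kernel). */
	return S_ISCHR(st.st_mode) && st.st_rdev == makedev(0, 0);
}

int main(int argc, char **argv)
{
	if (argc != 2)
		return fprintf(stderr, "usage: %s upperdir-entry\n", argv[0]), 1;
	printf("%s: %s\n", argv[1],
	       is_whiteout(argv[1]) ? "whiteout" : "regular entry");
	return 0;
}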
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
new file mode 100644
index 000000000000..af2d18c9fcee
--- /dev/null
+++ b/fs/overlayfs/inode.c
@@ -0,0 +1,425 @@
1/*
2 *
3 * Copyright (C) 2011 Novell Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published by
7 * the Free Software Foundation.
8 */
9
10#include <linux/fs.h>
11#include <linux/slab.h>
12#include <linux/xattr.h>
13#include "overlayfs.h"
14
15static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
16 bool no_data)
17{
18 int err;
19 struct dentry *parent;
20 struct kstat stat;
21 struct path lowerpath;
22
23 parent = dget_parent(dentry);
24 err = ovl_copy_up(parent);
25 if (err)
26 goto out_dput_parent;
27
28 ovl_path_lower(dentry, &lowerpath);
29 err = vfs_getattr(&lowerpath, &stat);
30 if (err)
31 goto out_dput_parent;
32
33 if (no_data)
34 stat.size = 0;
35
36 err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
37
38out_dput_parent:
39 dput(parent);
40 return err;
41}
42
43int ovl_setattr(struct dentry *dentry, struct iattr *attr)
44{
45 int err;
46 struct dentry *upperdentry;
47
48 err = ovl_want_write(dentry);
49 if (err)
50 goto out;
51
52 upperdentry = ovl_dentry_upper(dentry);
53 if (upperdentry) {
54 mutex_lock(&upperdentry->d_inode->i_mutex);
55 err = notify_change(upperdentry, attr, NULL);
56 mutex_unlock(&upperdentry->d_inode->i_mutex);
57 } else {
58 err = ovl_copy_up_last(dentry, attr, false);
59 }
60 ovl_drop_write(dentry);
61out:
62 return err;
63}
64
65static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
66 struct kstat *stat)
67{
68 struct path realpath;
69
70 ovl_path_real(dentry, &realpath);
71 return vfs_getattr(&realpath, stat);
72}
73
74int ovl_permission(struct inode *inode, int mask)
75{
76 struct ovl_entry *oe;
77 struct dentry *alias = NULL;
78 struct inode *realinode;
79 struct dentry *realdentry;
80 bool is_upper;
81 int err;
82
83 if (S_ISDIR(inode->i_mode)) {
84 oe = inode->i_private;
85 } else if (mask & MAY_NOT_BLOCK) {
86 return -ECHILD;
87 } else {
88 /*
89 * For non-directories find an alias and get the info
90 * from there.
91 */
92 alias = d_find_any_alias(inode);
93 if (WARN_ON(!alias))
94 return -ENOENT;
95
96 oe = alias->d_fsdata;
97 }
98
99 realdentry = ovl_entry_real(oe, &is_upper);
100
101 /* Careful in RCU walk mode */
102 realinode = ACCESS_ONCE(realdentry->d_inode);
103 if (!realinode) {
104 WARN_ON(!(mask & MAY_NOT_BLOCK));
105 err = -ENOENT;
106 goto out_dput;
107 }
108
109 if (mask & MAY_WRITE) {
110 umode_t mode = realinode->i_mode;
111
112 /*
113 * Writes will always be redirected to upper layer, so
114 * ignore lower layer being read-only.
115 *
116 * If the overlay itself is read-only then proceed
117 * with the permission check, don't return EROFS.
118 * This will only happen if this is the lower layer of
119 * another overlayfs.
120 *
121 * If upper fs becomes read-only after the overlay was
122 * constructed return EROFS to prevent modification of
123 * upper layer.
124 */
125 err = -EROFS;
126 if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
127 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
128 goto out_dput;
129 }
130
131 err = __inode_permission(realinode, mask);
132out_dput:
133 dput(alias);
134 return err;
135}
136
137
138struct ovl_link_data {
139 struct dentry *realdentry;
140 void *cookie;
141};
142
143static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
144{
145 void *ret;
146 struct dentry *realdentry;
147 struct inode *realinode;
148
149 realdentry = ovl_dentry_real(dentry);
150 realinode = realdentry->d_inode;
151
152 if (WARN_ON(!realinode->i_op->follow_link))
153 return ERR_PTR(-EPERM);
154
155 ret = realinode->i_op->follow_link(realdentry, nd);
156 if (IS_ERR(ret))
157 return ret;
158
159 if (realinode->i_op->put_link) {
160 struct ovl_link_data *data;
161
162 data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
163 if (!data) {
164 realinode->i_op->put_link(realdentry, nd, ret);
165 return ERR_PTR(-ENOMEM);
166 }
167 data->realdentry = realdentry;
168 data->cookie = ret;
169
170 return data;
171 } else {
172 return NULL;
173 }
174}
175
176static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
177{
178 struct inode *realinode;
179 struct ovl_link_data *data = c;
180
181 if (!data)
182 return;
183
184 realinode = data->realdentry->d_inode;
185 realinode->i_op->put_link(data->realdentry, nd, data->cookie);
186 kfree(data);
187}
188
189static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
190{
191 struct path realpath;
192 struct inode *realinode;
193
194 ovl_path_real(dentry, &realpath);
195 realinode = realpath.dentry->d_inode;
196
197 if (!realinode->i_op->readlink)
198 return -EINVAL;
199
200 touch_atime(&realpath);
201
202 return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
203}
204
205
206static bool ovl_is_private_xattr(const char *name)
207{
208 return strncmp(name, "trusted.overlay.", 16) == 0;
209}
210
211int ovl_setxattr(struct dentry *dentry, const char *name,
212 const void *value, size_t size, int flags)
213{
214 int err;
215 struct dentry *upperdentry;
216
217 err = ovl_want_write(dentry);
218 if (err)
219 goto out;
220
221 err = -EPERM;
222 if (ovl_is_private_xattr(name))
223 goto out_drop_write;
224
225 err = ovl_copy_up(dentry);
226 if (err)
227 goto out_drop_write;
228
229 upperdentry = ovl_dentry_upper(dentry);
230 err = vfs_setxattr(upperdentry, name, value, size, flags);
231
232out_drop_write:
233 ovl_drop_write(dentry);
234out:
235 return err;
236}
237
238ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
239 void *value, size_t size)
240{
241 if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
242 ovl_is_private_xattr(name))
243 return -ENODATA;
244
245 return vfs_getxattr(ovl_dentry_real(dentry), name, value, size);
246}
247
248ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
249{
250 ssize_t res;
251 int off;
252
253 res = vfs_listxattr(ovl_dentry_real(dentry), list, size);
254 if (res <= 0 || size == 0)
255 return res;
256
257 if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE)
258 return res;
259
260 /* filter out private xattrs */
261 for (off = 0; off < res;) {
262 char *s = list + off;
263 size_t slen = strlen(s) + 1;
264
265 BUG_ON(off + slen > res);
266
267 if (ovl_is_private_xattr(s)) {
268 res -= slen;
269 memmove(s, s + slen, res - off);
270 } else {
271 off += slen;
272 }
273 }
274
275 return res;
276}
277
278int ovl_removexattr(struct dentry *dentry, const char *name)
279{
280 int err;
281 struct path realpath;
282 enum ovl_path_type type;
283
284 err = ovl_want_write(dentry);
285 if (err)
286 goto out;
287
288 if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
289 ovl_is_private_xattr(name))
290 goto out_drop_write;
291
292 type = ovl_path_real(dentry, &realpath);
293 if (type == OVL_PATH_LOWER) {
294 err = vfs_getxattr(realpath.dentry, name, NULL, 0);
295 if (err < 0)
296 goto out_drop_write;
297
298 err = ovl_copy_up(dentry);
299 if (err)
300 goto out_drop_write;
301
302 ovl_path_upper(dentry, &realpath);
303 }
304
305 err = vfs_removexattr(realpath.dentry, name);
306out_drop_write:
307 ovl_drop_write(dentry);
308out:
309 return err;
310}
311
312static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
313 struct dentry *realdentry)
314{
315 if (type != OVL_PATH_LOWER)
316 return false;
317
318 if (special_file(realdentry->d_inode->i_mode))
319 return false;
320
321 if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
322 return false;
323
324 return true;
325}
326
327static int ovl_dentry_open(struct dentry *dentry, struct file *file,
328 const struct cred *cred)
329{
330 int err;
331 struct path realpath;
332 enum ovl_path_type type;
333 bool want_write = false;
334
335 type = ovl_path_real(dentry, &realpath);
336 if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
337 want_write = true;
338 err = ovl_want_write(dentry);
339 if (err)
340 goto out;
341
342 if (file->f_flags & O_TRUNC)
343 err = ovl_copy_up_last(dentry, NULL, true);
344 else
345 err = ovl_copy_up(dentry);
346 if (err)
347 goto out_drop_write;
348
349 ovl_path_upper(dentry, &realpath);
350 }
351
352 err = vfs_open(&realpath, file, cred);
353out_drop_write:
354 if (want_write)
355 ovl_drop_write(dentry);
356out:
357 return err;
358}
359
360static const struct inode_operations ovl_file_inode_operations = {
361 .setattr = ovl_setattr,
362 .permission = ovl_permission,
363 .getattr = ovl_getattr,
364 .setxattr = ovl_setxattr,
365 .getxattr = ovl_getxattr,
366 .listxattr = ovl_listxattr,
367 .removexattr = ovl_removexattr,
368 .dentry_open = ovl_dentry_open,
369};
370
371static const struct inode_operations ovl_symlink_inode_operations = {
372 .setattr = ovl_setattr,
373 .follow_link = ovl_follow_link,
374 .put_link = ovl_put_link,
375 .readlink = ovl_readlink,
376 .getattr = ovl_getattr,
377 .setxattr = ovl_setxattr,
378 .getxattr = ovl_getxattr,
379 .listxattr = ovl_listxattr,
380 .removexattr = ovl_removexattr,
381};
382
383struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
384 struct ovl_entry *oe)
385{
386 struct inode *inode;
387
388 inode = new_inode(sb);
389 if (!inode)
390 return NULL;
391
392 mode &= S_IFMT;
393
394 inode->i_ino = get_next_ino();
395 inode->i_mode = mode;
396 inode->i_flags |= S_NOATIME | S_NOCMTIME;
397
398 switch (mode) {
399 case S_IFDIR:
400 inode->i_private = oe;
401 inode->i_op = &ovl_dir_inode_operations;
402 inode->i_fop = &ovl_dir_operations;
403 break;
404
405 case S_IFLNK:
406 inode->i_op = &ovl_symlink_inode_operations;
407 break;
408
409 case S_IFREG:
410 case S_IFSOCK:
411 case S_IFBLK:
412 case S_IFCHR:
413 case S_IFIFO:
414 inode->i_op = &ovl_file_inode_operations;
415 break;
416
417 default:
418 WARN(1, "illegal file type: %i\n", mode);
419 iput(inode);
420 inode = NULL;
421 }
422
423 return inode;
424
425}
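
ovl_listxattr() above compacts the name buffer in place so that "trusted.overlay.*" entries never escape a merged directory. The same in-place filtering over a plain buffer of NUL-separated names, as a user-space C sketch (illustrative only):

/* Editor's sketch: filtering private xattr names out of a listxattr buffer. */
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

#define OVL_PREFIX "trusted.overlay."

static ssize_t filter_private(char *list, ssize_t res)
{
	ssize_t off = 0;

	/* list holds NUL-terminated names back to back, res bytes total. */
	while (off < res) {
		char *s = list + off;
		size_t slen = strlen(s) + 1;

		if (strncmp(s, OVL_PREFIX, strlen(OVL_PREFIX)) == 0) {
			res -= slen;
			memmove(s, s + slen, res - off);	/* close the gap */
		} else {
			off += slen;
		}
	}
	return res;
}

int main(void)
{
	char buf[] = "user.a\0trusted.overlay.opaque\0user.b\0";
	ssize_t res = filter_private(buf, sizeof(buf) - 1);

	for (ssize_t off = 0; off < res; off += strlen(buf + off) + 1)
		printf("kept: %s\n", buf + off);
	return 0;
}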
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
new file mode 100644
index 000000000000..814bed33dd07
--- /dev/null
+++ b/fs/overlayfs/overlayfs.h
@@ -0,0 +1,191 @@
1/*
2 *
3 * Copyright (C) 2011 Novell Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published by
7 * the Free Software Foundation.
8 */
9
10#include <linux/kernel.h>
11
12struct ovl_entry;
13
14enum ovl_path_type {
15 OVL_PATH_PURE_UPPER,
16 OVL_PATH_UPPER,
17 OVL_PATH_MERGE,
18 OVL_PATH_LOWER,
19};
20
21extern const char *ovl_opaque_xattr;
22
23static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
24{
25 int err = vfs_rmdir(dir, dentry);
26 pr_debug("rmdir(%pd2) = %i\n", dentry, err);
27 return err;
28}
29
30static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
31{
32 int err = vfs_unlink(dir, dentry, NULL);
33 pr_debug("unlink(%pd2) = %i\n", dentry, err);
34 return err;
35}
36
37static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
38 struct dentry *new_dentry, bool debug)
39{
40 int err = vfs_link(old_dentry, dir, new_dentry, NULL);
41 if (debug) {
42 pr_debug("link(%pd2, %pd2) = %i\n",
43 old_dentry, new_dentry, err);
44 }
45 return err;
46}
47
48static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
49 umode_t mode, bool debug)
50{
51 int err = vfs_create(dir, dentry, mode, true);
52 if (debug)
53 pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
54 return err;
55}
56
57static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
58 umode_t mode, bool debug)
59{
60 int err = vfs_mkdir(dir, dentry, mode);
61 if (debug)
62 pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
63 return err;
64}
65
66static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
67 umode_t mode, dev_t dev, bool debug)
68{
69 int err = vfs_mknod(dir, dentry, mode, dev);
70 if (debug) {
71 pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
72 dentry, mode, dev, err);
73 }
74 return err;
75}
76
77static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
78 const char *oldname, bool debug)
79{
80 int err = vfs_symlink(dir, dentry, oldname);
81 if (debug)
82 pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
83 return err;
84}
85
86static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
87 const void *value, size_t size, int flags)
88{
89 int err = vfs_setxattr(dentry, name, value, size, flags);
90 pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
91 dentry, name, (int) size, (char *) value, flags, err);
92 return err;
93}
94
95static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
96{
97 int err = vfs_removexattr(dentry, name);
98 pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
99 return err;
100}
101
102static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
103 struct inode *newdir, struct dentry *newdentry,
104 unsigned int flags)
105{
106 int err;
107
108 pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
109 olddentry, newdentry, flags);
110
111 err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
112
113 if (err) {
114 pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
115 olddentry, newdentry, err);
116 }
117 return err;
118}
119
120static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
121{
122 int err = vfs_whiteout(dir, dentry);
123 pr_debug("whiteout(%pd2) = %i\n", dentry, err);
124 return err;
125}
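
Each helper above follows the same shape: call the VFS primitive, pr_debug() the arguments and result, return the error unchanged. A hypothetical further wrapper in the same style (ovl_do_notify_change is an invented name and not part of the patch; notify_change() is the real VFS call):

	static inline int ovl_do_notify_change(struct dentry *dentry,
					       struct iattr *attr)
	{
		int err = notify_change(dentry, attr, NULL);
		pr_debug("notify_change(%pd2) = %i\n", dentry, err);
		return err;
	}
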
126
127enum ovl_path_type ovl_path_type(struct dentry *dentry);
128u64 ovl_dentry_version_get(struct dentry *dentry);
129void ovl_dentry_version_inc(struct dentry *dentry);
130void ovl_path_upper(struct dentry *dentry, struct path *path);
131void ovl_path_lower(struct dentry *dentry, struct path *path);
132enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
133struct dentry *ovl_dentry_upper(struct dentry *dentry);
134struct dentry *ovl_dentry_lower(struct dentry *dentry);
135struct dentry *ovl_dentry_real(struct dentry *dentry);
136struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
137struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
138void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
139struct dentry *ovl_workdir(struct dentry *dentry);
140int ovl_want_write(struct dentry *dentry);
141void ovl_drop_write(struct dentry *dentry);
142bool ovl_dentry_is_opaque(struct dentry *dentry);
143void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
144bool ovl_is_whiteout(struct dentry *dentry);
145void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
146struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
147 unsigned int flags);
148struct file *ovl_path_open(struct path *path, int flags);
149
150struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
151 struct kstat *stat, const char *link);
152
153/* readdir.c */
154extern const struct file_operations ovl_dir_operations;
155int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
156void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
157void ovl_cache_free(struct list_head *list);
158
159/* inode.c */
160int ovl_setattr(struct dentry *dentry, struct iattr *attr);
161int ovl_permission(struct inode *inode, int mask);
162int ovl_setxattr(struct dentry *dentry, const char *name,
163 const void *value, size_t size, int flags);
164ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
165 void *value, size_t size);
166ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
167int ovl_removexattr(struct dentry *dentry, const char *name);
168
169struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
170 struct ovl_entry *oe);
171static inline void ovl_copyattr(struct inode *from, struct inode *to)
172{
173 to->i_uid = from->i_uid;
174 to->i_gid = from->i_gid;
175}
176
177/* dir.c */
178extern const struct inode_operations ovl_dir_inode_operations;
179struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
180int ovl_create_real(struct inode *dir, struct dentry *newdentry,
181 struct kstat *stat, const char *link,
182 struct dentry *hardlink, bool debug);
183void ovl_cleanup(struct inode *dir, struct dentry *dentry);
184
185/* copy_up.c */
186int ovl_copy_up(struct dentry *dentry);
187int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
188 struct path *lowerpath, struct kstat *stat,
189 struct iattr *attr);
190int ovl_copy_xattr(struct dentry *old, struct dentry *new);
191int ovl_set_attr(struct dentry *upper, struct kstat *stat);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
new file mode 100644
index 000000000000..2a7ef4f8e2a6
--- /dev/null
+++ b/fs/overlayfs/readdir.c
@@ -0,0 +1,593 @@
1/*
2 *
3 * Copyright (C) 2011 Novell Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published by
7 * the Free Software Foundation.
8 */
9
10#include <linux/fs.h>
11#include <linux/slab.h>
12#include <linux/namei.h>
13#include <linux/file.h>
14#include <linux/xattr.h>
15#include <linux/rbtree.h>
16#include <linux/security.h>
17#include <linux/cred.h>
18#include "overlayfs.h"
19
20struct ovl_cache_entry {
21 unsigned int len;
22 unsigned int type;
23 u64 ino;
24 struct list_head l_node;
25 struct rb_node node;
26 bool is_whiteout;
27 bool is_cursor;
28 char name[];
29};
30
31struct ovl_dir_cache {
32 long refcount;
33 u64 version;
34 struct list_head entries;
35};
36
37struct ovl_readdir_data {
38 struct dir_context ctx;
39 bool is_merge;
40 struct rb_root root;
41 struct list_head *list;
42 struct list_head middle;
43 int count;
44 int err;
45};
46
47struct ovl_dir_file {
48 bool is_real;
49 bool is_upper;
50 struct ovl_dir_cache *cache;
51 struct ovl_cache_entry cursor;
52 struct file *realfile;
53 struct file *upperfile;
54};
55
56static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
57{
58 return container_of(n, struct ovl_cache_entry, node);
59}
60
61static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
62 const char *name, int len)
63{
64 struct rb_node *node = root->rb_node;
65 int cmp;
66
67 while (node) {
68 struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
69
70 cmp = strncmp(name, p->name, len);
71 if (cmp > 0)
72 node = p->node.rb_right;
73 else if (cmp < 0 || len < p->len)
74 node = p->node.rb_left;
75 else
76 return p;
77 }
78
79 return NULL;
80}
81
82static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len,
83 u64 ino, unsigned int d_type)
84{
85 struct ovl_cache_entry *p;
86 size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
87
88 p = kmalloc(size, GFP_KERNEL);
89 if (p) {
90 memcpy(p->name, name, len);
91 p->name[len] = '\0';
92 p->len = len;
93 p->type = d_type;
94 p->ino = ino;
95 p->is_whiteout = false;
96 p->is_cursor = false;
97 }
98
99 return p;
100}
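
ovl_cache_entry ends in a flexible array member, so a single allocation sized via offsetof() covers the header plus the NUL-terminated name. A self-contained userspace sketch of the same idiom; the struct and function names are invented, and the offsetof-on-an-array-element form is a GNU extension, as used in the kernel:

	#include <stddef.h>
	#include <stdlib.h>
	#include <string.h>

	struct entry {
		size_t len;
		char name[];	/* flexible array member, sized at allocation */
	};

	static struct entry *entry_new(const char *name, size_t len)
	{
		/* offsetof(..., name[len + 1]) == header + name + NUL */
		struct entry *p = malloc(offsetof(struct entry, name[len + 1]));

		if (p) {
			memcpy(p->name, name, len);
			p->name[len] = '\0';
			p->len = len;
		}
		return p;
	}
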
101
102static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
103 const char *name, int len, u64 ino,
104 unsigned int d_type)
105{
106 struct rb_node **newp = &rdd->root.rb_node;
107 struct rb_node *parent = NULL;
108 struct ovl_cache_entry *p;
109
110 while (*newp) {
111 int cmp;
112 struct ovl_cache_entry *tmp;
113
114 parent = *newp;
115 tmp = ovl_cache_entry_from_node(*newp);
116 cmp = strncmp(name, tmp->name, len);
117 if (cmp > 0)
118 newp = &tmp->node.rb_right;
119 else if (cmp < 0 || len < tmp->len)
120 newp = &tmp->node.rb_left;
121 else
122 return 0;
123 }
124
125 p = ovl_cache_entry_new(name, len, ino, d_type);
126 if (p == NULL)
127 return -ENOMEM;
128
129 list_add_tail(&p->l_node, rdd->list);
130 rb_link_node(&p->node, parent, newp);
131 rb_insert_color(&p->node, &rdd->root);
132
133 return 0;
134}
135
136static int ovl_fill_lower(struct ovl_readdir_data *rdd,
137 const char *name, int namelen,
138 loff_t offset, u64 ino, unsigned int d_type)
139{
140 struct ovl_cache_entry *p;
141
142 p = ovl_cache_entry_find(&rdd->root, name, namelen);
143 if (p) {
144 list_move_tail(&p->l_node, &rdd->middle);
145 } else {
146 p = ovl_cache_entry_new(name, namelen, ino, d_type);
147 if (p == NULL)
148 rdd->err = -ENOMEM;
149 else
150 list_add_tail(&p->l_node, &rdd->middle);
151 }
152
153 return rdd->err;
154}
155
156void ovl_cache_free(struct list_head *list)
157{
158 struct ovl_cache_entry *p;
159 struct ovl_cache_entry *n;
160
161 list_for_each_entry_safe(p, n, list, l_node)
162 kfree(p);
163
164 INIT_LIST_HEAD(list);
165}
166
167static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
168{
169 struct ovl_dir_cache *cache = od->cache;
170
171 list_del_init(&od->cursor.l_node);
172 WARN_ON(cache->refcount <= 0);
173 cache->refcount--;
174 if (!cache->refcount) {
175 if (ovl_dir_cache(dentry) == cache)
176 ovl_set_dir_cache(dentry, NULL);
177
178 ovl_cache_free(&cache->entries);
179 kfree(cache);
180 }
181}
182
183static int ovl_fill_merge(void *buf, const char *name, int namelen,
184 loff_t offset, u64 ino, unsigned int d_type)
185{
186 struct ovl_readdir_data *rdd = buf;
187
188 rdd->count++;
189 if (!rdd->is_merge)
190 return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
191 else
192 return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
193}
194
195static inline int ovl_dir_read(struct path *realpath,
196 struct ovl_readdir_data *rdd)
197{
198 struct file *realfile;
199 int err;
200
201 realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
202 if (IS_ERR(realfile))
203 return PTR_ERR(realfile);
204
205 rdd->ctx.pos = 0;
206 do {
207 rdd->count = 0;
208 rdd->err = 0;
209 err = iterate_dir(realfile, &rdd->ctx);
210 if (err >= 0)
211 err = rdd->err;
212 } while (!err && rdd->count);
213 fput(realfile);
214
215 return err;
216}
217
218static void ovl_dir_reset(struct file *file)
219{
220 struct ovl_dir_file *od = file->private_data;
221 struct ovl_dir_cache *cache = od->cache;
222 struct dentry *dentry = file->f_path.dentry;
223 enum ovl_path_type type = ovl_path_type(dentry);
224
225 if (cache && ovl_dentry_version_get(dentry) != cache->version) {
226 ovl_cache_put(od, dentry);
227 od->cache = NULL;
228 }
229 WARN_ON(!od->is_real && type != OVL_PATH_MERGE);
230 if (od->is_real && type == OVL_PATH_MERGE)
231 od->is_real = false;
232}
233
234static int ovl_dir_mark_whiteouts(struct dentry *dir,
235 struct ovl_readdir_data *rdd)
236{
237 struct ovl_cache_entry *p;
238 struct dentry *dentry;
239 const struct cred *old_cred;
240 struct cred *override_cred;
241
242 override_cred = prepare_creds();
243 if (!override_cred) {
244 ovl_cache_free(rdd->list);
245 return -ENOMEM;
246 }
247
248 /*
249 * CAP_DAC_OVERRIDE for lookup
250 */
251 cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
252 old_cred = override_creds(override_cred);
253
254 mutex_lock(&dir->d_inode->i_mutex);
255 list_for_each_entry(p, rdd->list, l_node) {
256 if (p->is_cursor)
257 continue;
258
259 if (p->type != DT_CHR)
260 continue;
261
262 dentry = lookup_one_len(p->name, dir, p->len);
263 if (IS_ERR(dentry))
264 continue;
265
266 p->is_whiteout = ovl_is_whiteout(dentry);
267 dput(dentry);
268 }
269 mutex_unlock(&dir->d_inode->i_mutex);
270
271 revert_creds(old_cred);
272 put_cred(override_cred);
273
274 return 0;
275}
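
The prepare_creds()/override_creds()/revert_creds() dance above is the standard pattern for running a few operations with a raised capability. Factored into a hypothetical helper (the name and callback shape are invented; the API calls are the ones used above):

	static int with_cap_dac_override(int (*fn)(void *), void *arg)
	{
		const struct cred *old_cred;
		struct cred *override_cred;
		int err;

		override_cred = prepare_creds();
		if (!override_cred)
			return -ENOMEM;

		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		old_cred = override_creds(override_cred);

		err = fn(arg);		/* runs with CAP_DAC_OVERRIDE raised */

		revert_creds(old_cred);
		put_cred(override_cred);
		return err;
	}
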
276
277static inline int ovl_dir_read_merged(struct path *upperpath,
278 struct path *lowerpath,
279 struct list_head *list)
280{
281	int err = 0;
282 struct ovl_readdir_data rdd = {
283 .ctx.actor = ovl_fill_merge,
284 .list = list,
285 .root = RB_ROOT,
286 .is_merge = false,
287 };
288
289 if (upperpath->dentry) {
290 err = ovl_dir_read(upperpath, &rdd);
291 if (err)
292 goto out;
293
294 if (lowerpath->dentry) {
295 err = ovl_dir_mark_whiteouts(upperpath->dentry, &rdd);
296 if (err)
297 goto out;
298 }
299 }
300 if (lowerpath->dentry) {
301 /*
302	 * Insert lowerpath entries before upperpath ones; this allows
303	 * offsets to be reasonably constant
304 */
305 list_add(&rdd.middle, rdd.list);
306 rdd.is_merge = true;
307 err = ovl_dir_read(lowerpath, &rdd);
308 list_del(&rdd.middle);
309 }
310out:
311 return err;
312}
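
The "middle" cursor trick is easiest to see in a worked trace. Suppose the upper pass left [U1, U2] on the list and the lower pass then finds L1 and L2 (entry names invented):

	LIST_HEAD(list);		/* [U1, U2] after the upper pass     */
	struct list_head middle;

	list_add(&middle, &list);	/* [middle, U1, U2]                  */
	/* lower pass, per new name:
	 *   list_add_tail(&L1->l_node, &middle);  [L1, middle, U1, U2]
	 *   list_add_tail(&L2->l_node, &middle);  [L1, L2, middle, U1, U2]
	 */
	list_del(&middle);		/* [L1, L2, U1, U2]                  */

Lower entries end up in front of upper ones, so their offsets do not shift when entries are later copied up.
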
313
314static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
315{
316 struct ovl_cache_entry *p;
317 loff_t off = 0;
318
319 list_for_each_entry(p, &od->cache->entries, l_node) {
320 if (p->is_cursor)
321 continue;
322 if (off >= pos)
323 break;
324 off++;
325 }
326 list_move_tail(&od->cursor.l_node, &p->l_node);
327}
328
329static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
330{
331 int res;
332 struct path lowerpath;
333 struct path upperpath;
334 struct ovl_dir_cache *cache;
335
336 cache = ovl_dir_cache(dentry);
337 if (cache && ovl_dentry_version_get(dentry) == cache->version) {
338 cache->refcount++;
339 return cache;
340 }
341 ovl_set_dir_cache(dentry, NULL);
342
343 cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
344 if (!cache)
345 return ERR_PTR(-ENOMEM);
346
347 cache->refcount = 1;
348 INIT_LIST_HEAD(&cache->entries);
349
350 ovl_path_lower(dentry, &lowerpath);
351 ovl_path_upper(dentry, &upperpath);
352
353 res = ovl_dir_read_merged(&upperpath, &lowerpath, &cache->entries);
354 if (res) {
355 ovl_cache_free(&cache->entries);
356 kfree(cache);
357 return ERR_PTR(res);
358 }
359
360 cache->version = ovl_dentry_version_get(dentry);
361 ovl_set_dir_cache(dentry, cache);
362
363 return cache;
364}
365
366static int ovl_iterate(struct file *file, struct dir_context *ctx)
367{
368 struct ovl_dir_file *od = file->private_data;
369 struct dentry *dentry = file->f_path.dentry;
370
371 if (!ctx->pos)
372 ovl_dir_reset(file);
373
374 if (od->is_real)
375 return iterate_dir(od->realfile, ctx);
376
377 if (!od->cache) {
378 struct ovl_dir_cache *cache;
379
380 cache = ovl_cache_get(dentry);
381 if (IS_ERR(cache))
382 return PTR_ERR(cache);
383
384 od->cache = cache;
385 ovl_seek_cursor(od, ctx->pos);
386 }
387
388 while (od->cursor.l_node.next != &od->cache->entries) {
389 struct ovl_cache_entry *p;
390
391 p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node);
392 /* Skip cursors */
393 if (!p->is_cursor) {
394 if (!p->is_whiteout) {
395 if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
396 break;
397 }
398 ctx->pos++;
399 }
400 list_move(&od->cursor.l_node, &p->l_node);
401 }
402 return 0;
403}
404
405static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
406{
407 loff_t res;
408 struct ovl_dir_file *od = file->private_data;
409
410 mutex_lock(&file_inode(file)->i_mutex);
411 if (!file->f_pos)
412 ovl_dir_reset(file);
413
414 if (od->is_real) {
415 res = vfs_llseek(od->realfile, offset, origin);
416 file->f_pos = od->realfile->f_pos;
417 } else {
418 res = -EINVAL;
419
420 switch (origin) {
421 case SEEK_CUR:
422 offset += file->f_pos;
423 break;
424 case SEEK_SET:
425 break;
426 default:
427 goto out_unlock;
428 }
429 if (offset < 0)
430 goto out_unlock;
431
432 if (offset != file->f_pos) {
433 file->f_pos = offset;
434 if (od->cache)
435 ovl_seek_cursor(od, offset);
436 }
437 res = offset;
438 }
439out_unlock:
440 mutex_unlock(&file_inode(file)->i_mutex);
441
442 return res;
443}
444
445static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
446 int datasync)
447{
448 struct ovl_dir_file *od = file->private_data;
449 struct dentry *dentry = file->f_path.dentry;
450 struct file *realfile = od->realfile;
451
452 /*
453 * Need to check if we started out being a lower dir, but got copied up
454 */
455 if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) {
456 struct inode *inode = file_inode(file);
457
458	realfile = lockless_dereference(od->upperfile);
459 if (!realfile) {
460 struct path upperpath;
461
462 ovl_path_upper(dentry, &upperpath);
463 realfile = ovl_path_open(&upperpath, O_RDONLY);
464 smp_mb__before_spinlock();
465 mutex_lock(&inode->i_mutex);
466 if (!od->upperfile) {
467 if (IS_ERR(realfile)) {
468 mutex_unlock(&inode->i_mutex);
469 return PTR_ERR(realfile);
470 }
471 od->upperfile = realfile;
472 } else {
473 /* somebody has beaten us to it */
474 if (!IS_ERR(realfile))
475 fput(realfile);
476 realfile = od->upperfile;
477 }
478 mutex_unlock(&inode->i_mutex);
479 }
480 }
481
482 return vfs_fsync_range(realfile, start, end, datasync);
483}
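
The upperfile handling above is a double-checked lazy publication: read the pointer locklessly, do the expensive open outside the lock, then settle any race under i_mutex. Condensed into a hypothetical helper (the name is invented; the barrier from the original is elided for brevity):

	static struct file *ovl_dir_upperfile(struct ovl_dir_file *od,
					      struct inode *inode,
					      struct path *upperpath)
	{
		struct file *f = lockless_dereference(od->upperfile);

		if (f)
			return f;

		f = ovl_path_open(upperpath, O_RDONLY);	/* may be ERR_PTR */
		mutex_lock(&inode->i_mutex);
		if (od->upperfile) {
			/* somebody beat us to it */
			if (!IS_ERR(f))
				fput(f);
			f = od->upperfile;
		} else if (!IS_ERR(f)) {
			od->upperfile = f;
		}
		mutex_unlock(&inode->i_mutex);
		return f;		/* ERR_PTR if the open failed */
	}
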
484
485static int ovl_dir_release(struct inode *inode, struct file *file)
486{
487 struct ovl_dir_file *od = file->private_data;
488
489 if (od->cache) {
490 mutex_lock(&inode->i_mutex);
491 ovl_cache_put(od, file->f_path.dentry);
492 mutex_unlock(&inode->i_mutex);
493 }
494 fput(od->realfile);
495 if (od->upperfile)
496 fput(od->upperfile);
497 kfree(od);
498
499 return 0;
500}
501
502static int ovl_dir_open(struct inode *inode, struct file *file)
503{
504 struct path realpath;
505 struct file *realfile;
506 struct ovl_dir_file *od;
507 enum ovl_path_type type;
508
509 od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
510 if (!od)
511 return -ENOMEM;
512
513 type = ovl_path_real(file->f_path.dentry, &realpath);
514 realfile = ovl_path_open(&realpath, file->f_flags);
515 if (IS_ERR(realfile)) {
516 kfree(od);
517 return PTR_ERR(realfile);
518 }
519 INIT_LIST_HEAD(&od->cursor.l_node);
520 od->realfile = realfile;
521 od->is_real = (type != OVL_PATH_MERGE);
522 od->is_upper = (type != OVL_PATH_LOWER);
523 od->cursor.is_cursor = true;
524 file->private_data = od;
525
526 return 0;
527}
528
529const struct file_operations ovl_dir_operations = {
530 .read = generic_read_dir,
531 .open = ovl_dir_open,
532 .iterate = ovl_iterate,
533 .llseek = ovl_dir_llseek,
534 .fsync = ovl_dir_fsync,
535 .release = ovl_dir_release,
536};
537
538int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
539{
540 int err;
541 struct path lowerpath;
542 struct path upperpath;
543 struct ovl_cache_entry *p;
544
545 ovl_path_upper(dentry, &upperpath);
546 ovl_path_lower(dentry, &lowerpath);
547
548 err = ovl_dir_read_merged(&upperpath, &lowerpath, list);
549 if (err)
550 return err;
551
552 err = 0;
553
554 list_for_each_entry(p, list, l_node) {
555 if (p->is_whiteout)
556 continue;
557
558 if (p->name[0] == '.') {
559 if (p->len == 1)
560 continue;
561 if (p->len == 2 && p->name[1] == '.')
562 continue;
563 }
564 err = -ENOTEMPTY;
565 break;
566 }
567
568 return err;
569}
570
571void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
572{
573 struct ovl_cache_entry *p;
574
575 mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
576 list_for_each_entry(p, list, l_node) {
577 struct dentry *dentry;
578
579 if (!p->is_whiteout)
580 continue;
581
582 dentry = lookup_one_len(p->name, upper, p->len);
583 if (IS_ERR(dentry)) {
584 pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
585 upper->d_name.name, p->len, p->name,
586 (int) PTR_ERR(dentry));
587 continue;
588 }
589 ovl_cleanup(upper->d_inode, dentry);
590 dput(dentry);
591 }
592 mutex_unlock(&upper->d_inode->i_mutex);
593}
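
For reference, the whiteouts filtered via DT_CHR in ovl_dir_mark_whiteouts() appear on the upper layer as character devices with device number 0/0 (WHITEOUT_DEV is 0 in the kernel). A userspace check might look like this; the helper name is invented:

	#include <sys/stat.h>
	#include <sys/sysmacros.h>

	static int is_whiteout(const char *path)
	{
		struct stat st;

		if (lstat(path, &st) != 0)
			return 0;

		/* whiteout == char device, major 0, minor 0 */
		return S_ISCHR(st.st_mode) &&
		       major(st.st_rdev) == 0 && minor(st.st_rdev) == 0;
	}
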
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
new file mode 100644
index 000000000000..08b704cebfc4
--- /dev/null
+++ b/fs/overlayfs/super.c
@@ -0,0 +1,796 @@
1/*
2 *
3 * Copyright (C) 2011 Novell Inc.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published by
7 * the Free Software Foundation.
8 */
9
10#include <linux/fs.h>
11#include <linux/namei.h>
12#include <linux/xattr.h>
13#include <linux/security.h>
14#include <linux/mount.h>
15#include <linux/slab.h>
16#include <linux/parser.h>
17#include <linux/module.h>
18#include <linux/sched.h>
19#include <linux/statfs.h>
20#include <linux/seq_file.h>
21#include "overlayfs.h"
22
23MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
24MODULE_DESCRIPTION("Overlay filesystem");
25MODULE_LICENSE("GPL");
26
27#define OVERLAYFS_SUPER_MAGIC 0x794c764f
28
29struct ovl_config {
30 char *lowerdir;
31 char *upperdir;
32 char *workdir;
33};
34
35/* private information held for overlayfs's superblock */
36struct ovl_fs {
37 struct vfsmount *upper_mnt;
38 struct vfsmount *lower_mnt;
39 struct dentry *workdir;
40 long lower_namelen;
41 /* pathnames of lower and upper dirs, for show_options */
42 struct ovl_config config;
43};
44
45struct ovl_dir_cache;
46
47/* private information held for every overlayfs dentry */
48struct ovl_entry {
49 struct dentry *__upperdentry;
50 struct dentry *lowerdentry;
51 struct ovl_dir_cache *cache;
52 union {
53 struct {
54 u64 version;
55 bool opaque;
56 };
57 struct rcu_head rcu;
58 };
59};
60
61const char *ovl_opaque_xattr = "trusted.overlay.opaque";
62
63
64enum ovl_path_type ovl_path_type(struct dentry *dentry)
65{
66 struct ovl_entry *oe = dentry->d_fsdata;
67
68 if (oe->__upperdentry) {
69 if (oe->lowerdentry) {
70 if (S_ISDIR(dentry->d_inode->i_mode))
71 return OVL_PATH_MERGE;
72 else
73 return OVL_PATH_UPPER;
74 } else {
75 if (oe->opaque)
76 return OVL_PATH_UPPER;
77 else
78 return OVL_PATH_PURE_UPPER;
79 }
80 } else {
81 return OVL_PATH_LOWER;
82 }
83}
84
85static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
86{
87 struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry);
88 /*
89 * Make sure to order reads to upperdentry wrt ovl_dentry_update()
90 */
91 smp_read_barrier_depends();
92 return upperdentry;
93}
94
95void ovl_path_upper(struct dentry *dentry, struct path *path)
96{
97 struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
98 struct ovl_entry *oe = dentry->d_fsdata;
99
100 path->mnt = ofs->upper_mnt;
101 path->dentry = ovl_upperdentry_dereference(oe);
102}
103
104enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path)
105{
106
107 enum ovl_path_type type = ovl_path_type(dentry);
108
109 if (type == OVL_PATH_LOWER)
110 ovl_path_lower(dentry, path);
111 else
112 ovl_path_upper(dentry, path);
113
114 return type;
115}
116
117struct dentry *ovl_dentry_upper(struct dentry *dentry)
118{
119 struct ovl_entry *oe = dentry->d_fsdata;
120
121 return ovl_upperdentry_dereference(oe);
122}
123
124struct dentry *ovl_dentry_lower(struct dentry *dentry)
125{
126 struct ovl_entry *oe = dentry->d_fsdata;
127
128 return oe->lowerdentry;
129}
130
131struct dentry *ovl_dentry_real(struct dentry *dentry)
132{
133 struct ovl_entry *oe = dentry->d_fsdata;
134 struct dentry *realdentry;
135
136 realdentry = ovl_upperdentry_dereference(oe);
137 if (!realdentry)
138 realdentry = oe->lowerdentry;
139
140 return realdentry;
141}
142
143struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
144{
145 struct dentry *realdentry;
146
147 realdentry = ovl_upperdentry_dereference(oe);
148 if (realdentry) {
149 *is_upper = true;
150 } else {
151 realdentry = oe->lowerdentry;
152 *is_upper = false;
153 }
154 return realdentry;
155}
156
157struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
158{
159 struct ovl_entry *oe = dentry->d_fsdata;
160
161 return oe->cache;
162}
163
164void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
165{
166 struct ovl_entry *oe = dentry->d_fsdata;
167
168 oe->cache = cache;
169}
170
171void ovl_path_lower(struct dentry *dentry, struct path *path)
172{
173 struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
174 struct ovl_entry *oe = dentry->d_fsdata;
175
176 path->mnt = ofs->lower_mnt;
177 path->dentry = oe->lowerdentry;
178}
179
180int ovl_want_write(struct dentry *dentry)
181{
182 struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
183 return mnt_want_write(ofs->upper_mnt);
184}
185
186void ovl_drop_write(struct dentry *dentry)
187{
188 struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
189 mnt_drop_write(ofs->upper_mnt);
190}
191
192struct dentry *ovl_workdir(struct dentry *dentry)
193{
194 struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
195 return ofs->workdir;
196}
197
198bool ovl_dentry_is_opaque(struct dentry *dentry)
199{
200 struct ovl_entry *oe = dentry->d_fsdata;
201 return oe->opaque;
202}
203
204void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque)
205{
206 struct ovl_entry *oe = dentry->d_fsdata;
207 oe->opaque = opaque;
208}
209
210void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
211{
212 struct ovl_entry *oe = dentry->d_fsdata;
213
214 WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
215 WARN_ON(oe->__upperdentry);
216 BUG_ON(!upperdentry->d_inode);
217 /*
218 * Make sure upperdentry is consistent before making it visible to
219 * ovl_upperdentry_dereference().
220 */
221 smp_wmb();
222 oe->__upperdentry = upperdentry;
223}
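
The smp_wmb() here pairs with the smp_read_barrier_depends() in ovl_upperdentry_dereference() above. The publish/consume protocol in miniature, with an invented struct standing in for the dentry:

	struct box {
		int payload;
	};
	static struct box *published;

	static void publish(struct box *b)
	{
		b->payload = 42;	/* initialise the object fully ...    */
		smp_wmb();		/* ... order its stores before ...    */
		published = b;		/* ... the store that publishes it    */
	}

	static struct box *consume(void)
	{
		struct box *b = ACCESS_ONCE(published);

		smp_read_barrier_depends();	/* pairs with smp_wmb() above */
		return b;			/* payload reads now safe     */
	}
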
224
225void ovl_dentry_version_inc(struct dentry *dentry)
226{
227 struct ovl_entry *oe = dentry->d_fsdata;
228
229 WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
230 oe->version++;
231}
232
233u64 ovl_dentry_version_get(struct dentry *dentry)
234{
235 struct ovl_entry *oe = dentry->d_fsdata;
236
237 WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
238 return oe->version;
239}
240
241bool ovl_is_whiteout(struct dentry *dentry)
242{
243 struct inode *inode = dentry->d_inode;
244
245 return inode && IS_WHITEOUT(inode);
246}
247
248static bool ovl_is_opaquedir(struct dentry *dentry)
249{
250 int res;
251 char val;
252 struct inode *inode = dentry->d_inode;
253
254 if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr)
255 return false;
256
257 res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1);
258 if (res == 1 && val == 'y')
259 return true;
260
261 return false;
262}
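
The opaque marker read back here is an ordinary trusted.* xattr holding the single byte 'y'. From userspace it could be set as below (helper name invented; writing trusted.* xattrs requires CAP_SYS_ADMIN):

	#include <sys/xattr.h>

	static int mark_opaque(const char *dir)
	{
		return setxattr(dir, "trusted.overlay.opaque", "y", 1, 0);
	}
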
263
264static void ovl_dentry_release(struct dentry *dentry)
265{
266 struct ovl_entry *oe = dentry->d_fsdata;
267
268 if (oe) {
269 dput(oe->__upperdentry);
270 dput(oe->lowerdentry);
271 kfree_rcu(oe, rcu);
272 }
273}
274
275static const struct dentry_operations ovl_dentry_operations = {
276 .d_release = ovl_dentry_release,
277};
278
279static struct ovl_entry *ovl_alloc_entry(void)
280{
281 return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL);
282}
283
284static inline struct dentry *ovl_lookup_real(struct dentry *dir,
285 struct qstr *name)
286{
287 struct dentry *dentry;
288
289 mutex_lock(&dir->d_inode->i_mutex);
290 dentry = lookup_one_len(name->name, dir, name->len);
291 mutex_unlock(&dir->d_inode->i_mutex);
292
293 if (IS_ERR(dentry)) {
294 if (PTR_ERR(dentry) == -ENOENT)
295 dentry = NULL;
296 } else if (!dentry->d_inode) {
297 dput(dentry);
298 dentry = NULL;
299 }
300 return dentry;
301}
302
303struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
304 unsigned int flags)
305{
306 struct ovl_entry *oe;
307 struct dentry *upperdir;
308 struct dentry *lowerdir;
309 struct dentry *upperdentry = NULL;
310 struct dentry *lowerdentry = NULL;
311 struct inode *inode = NULL;
312 int err;
313
314 err = -ENOMEM;
315 oe = ovl_alloc_entry();
316 if (!oe)
317 goto out;
318
319 upperdir = ovl_dentry_upper(dentry->d_parent);
320 lowerdir = ovl_dentry_lower(dentry->d_parent);
321
322 if (upperdir) {
323 upperdentry = ovl_lookup_real(upperdir, &dentry->d_name);
324 err = PTR_ERR(upperdentry);
325 if (IS_ERR(upperdentry))
326 goto out_put_dir;
327
328 if (lowerdir && upperdentry) {
329 if (ovl_is_whiteout(upperdentry)) {
330 dput(upperdentry);
331 upperdentry = NULL;
332 oe->opaque = true;
333 } else if (ovl_is_opaquedir(upperdentry)) {
334 oe->opaque = true;
335 }
336 }
337 }
338 if (lowerdir && !oe->opaque) {
339 lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name);
340 err = PTR_ERR(lowerdentry);
341 if (IS_ERR(lowerdentry))
342 goto out_dput_upper;
343 }
344
345 if (lowerdentry && upperdentry &&
346 (!S_ISDIR(upperdentry->d_inode->i_mode) ||
347 !S_ISDIR(lowerdentry->d_inode->i_mode))) {
348 dput(lowerdentry);
349 lowerdentry = NULL;
350 oe->opaque = true;
351 }
352
353 if (lowerdentry || upperdentry) {
354 struct dentry *realdentry;
355
356 realdentry = upperdentry ? upperdentry : lowerdentry;
357 err = -ENOMEM;
358 inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
359 oe);
360 if (!inode)
361 goto out_dput;
362 ovl_copyattr(realdentry->d_inode, inode);
363 }
364
365 oe->__upperdentry = upperdentry;
366 oe->lowerdentry = lowerdentry;
367
368 dentry->d_fsdata = oe;
369 d_add(dentry, inode);
370
371 return NULL;
372
373out_dput:
374 dput(lowerdentry);
375out_dput_upper:
376 dput(upperdentry);
377out_put_dir:
378 kfree(oe);
379out:
380 return ERR_PTR(err);
381}
382
383struct file *ovl_path_open(struct path *path, int flags)
384{
385 return dentry_open(path, flags, current_cred());
386}
387
388static void ovl_put_super(struct super_block *sb)
389{
390 struct ovl_fs *ufs = sb->s_fs_info;
391
392 dput(ufs->workdir);
393 mntput(ufs->upper_mnt);
394 mntput(ufs->lower_mnt);
395
396 kfree(ufs->config.lowerdir);
397 kfree(ufs->config.upperdir);
398 kfree(ufs->config.workdir);
399 kfree(ufs);
400}
401
402/**
403 * ovl_statfs
404 * @dentry: The dentry whose filesystem is being queried
405 * @buf: The struct kstatfs to fill in with stats
406 *
407 * Get the filesystem statistics. As writes always target the upper layer
408 * filesystem, pass the statfs to the same filesystem.
409 */
410static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
411{
412 struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
413 struct dentry *root_dentry = dentry->d_sb->s_root;
414 struct path path;
415 int err;
416
417 ovl_path_upper(root_dentry, &path);
418
419 err = vfs_statfs(&path, buf);
420 if (!err) {
421 buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen);
422 buf->f_type = OVERLAYFS_SUPER_MAGIC;
423 }
424
425 return err;
426}
427
428/**
429 * ovl_show_options
430 *
431 * Prints the mount options for a given superblock.
432 * Returns zero; does not fail.
433 */
434static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
435{
436 struct super_block *sb = dentry->d_sb;
437 struct ovl_fs *ufs = sb->s_fs_info;
438
439 seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir);
440 seq_printf(m, ",upperdir=%s", ufs->config.upperdir);
441 seq_printf(m, ",workdir=%s", ufs->config.workdir);
442 return 0;
443}
444
445static const struct super_operations ovl_super_operations = {
446 .put_super = ovl_put_super,
447 .statfs = ovl_statfs,
448 .show_options = ovl_show_options,
449};
450
451enum {
452 OPT_LOWERDIR,
453 OPT_UPPERDIR,
454 OPT_WORKDIR,
455 OPT_ERR,
456};
457
458static const match_table_t ovl_tokens = {
459 {OPT_LOWERDIR, "lowerdir=%s"},
460 {OPT_UPPERDIR, "upperdir=%s"},
461 {OPT_WORKDIR, "workdir=%s"},
462 {OPT_ERR, NULL}
463};
464
465static int ovl_parse_opt(char *opt, struct ovl_config *config)
466{
467 char *p;
468
469 while ((p = strsep(&opt, ",")) != NULL) {
470 int token;
471 substring_t args[MAX_OPT_ARGS];
472
473 if (!*p)
474 continue;
475
476 token = match_token(p, ovl_tokens, args);
477 switch (token) {
478 case OPT_UPPERDIR:
479 kfree(config->upperdir);
480 config->upperdir = match_strdup(&args[0]);
481 if (!config->upperdir)
482 return -ENOMEM;
483 break;
484
485 case OPT_LOWERDIR:
486 kfree(config->lowerdir);
487 config->lowerdir = match_strdup(&args[0]);
488 if (!config->lowerdir)
489 return -ENOMEM;
490 break;
491
492 case OPT_WORKDIR:
493 kfree(config->workdir);
494 config->workdir = match_strdup(&args[0]);
495 if (!config->workdir)
496 return -ENOMEM;
497 break;
498
499 default:
500 return -EINVAL;
501 }
502 }
503 return 0;
504}
505
506#define OVL_WORKDIR_NAME "work"
507
508static struct dentry *ovl_workdir_create(struct vfsmount *mnt,
509 struct dentry *dentry)
510{
511 struct inode *dir = dentry->d_inode;
512 struct dentry *work;
513 int err;
514 bool retried = false;
515
516 err = mnt_want_write(mnt);
517 if (err)
518 return ERR_PTR(err);
519
520 mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
521retry:
522 work = lookup_one_len(OVL_WORKDIR_NAME, dentry,
523 strlen(OVL_WORKDIR_NAME));
524
525 if (!IS_ERR(work)) {
526 struct kstat stat = {
527 .mode = S_IFDIR | 0,
528 };
529
530 if (work->d_inode) {
531 err = -EEXIST;
532 if (retried)
533 goto out_dput;
534
535 retried = true;
536 ovl_cleanup(dir, work);
537 dput(work);
538 goto retry;
539 }
540
541 err = ovl_create_real(dir, work, &stat, NULL, NULL, true);
542 if (err)
543 goto out_dput;
544 }
545out_unlock:
546 mutex_unlock(&dir->i_mutex);
547 mnt_drop_write(mnt);
548
549 return work;
550
551out_dput:
552 dput(work);
553 work = ERR_PTR(err);
554 goto out_unlock;
555}
556
557static int ovl_mount_dir(const char *name, struct path *path)
558{
559 int err;
560
561 err = kern_path(name, LOOKUP_FOLLOW, path);
562 if (err) {
563 pr_err("overlayfs: failed to resolve '%s': %i\n", name, err);
564 err = -EINVAL;
565 }
566 return err;
567}
568
569static bool ovl_is_allowed_fs_type(struct dentry *root)
570{
571 const struct dentry_operations *dop = root->d_op;
572
573 /*
574 * We don't support:
575 * - automount filesystems
576 * - filesystems with revalidate (FIXME for lower layer)
577 * - filesystems with case insensitive names
578 */
579 if (dop &&
580 (dop->d_manage || dop->d_automount ||
581 dop->d_revalidate || dop->d_weak_revalidate ||
582 dop->d_compare || dop->d_hash)) {
583 return false;
584 }
585 return true;
586}
587
588/* Workdir should not be subdir of upperdir and vice versa */
589static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
590{
591 bool ok = false;
592
593 if (workdir != upperdir) {
594 ok = (lock_rename(workdir, upperdir) == NULL);
595 unlock_rename(workdir, upperdir);
596 }
597 return ok;
598}
599
600static int ovl_fill_super(struct super_block *sb, void *data, int silent)
601{
602 struct path lowerpath;
603 struct path upperpath;
604 struct path workpath;
605 struct inode *root_inode;
606 struct dentry *root_dentry;
607 struct ovl_entry *oe;
608 struct ovl_fs *ufs;
609 struct kstatfs statfs;
610 int err;
611
612 err = -ENOMEM;
613 ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL);
614 if (!ufs)
615 goto out;
616
617 err = ovl_parse_opt((char *) data, &ufs->config);
618 if (err)
619 goto out_free_config;
620
621	/* FIXME: workdir is not needed for an R/O mount */
622 err = -EINVAL;
623 if (!ufs->config.upperdir || !ufs->config.lowerdir ||
624 !ufs->config.workdir) {
625 pr_err("overlayfs: missing upperdir or lowerdir or workdir\n");
626 goto out_free_config;
627 }
628
629 err = -ENOMEM;
630 oe = ovl_alloc_entry();
631 if (oe == NULL)
632 goto out_free_config;
633
634 err = ovl_mount_dir(ufs->config.upperdir, &upperpath);
635 if (err)
636 goto out_free_oe;
637
638 err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath);
639 if (err)
640 goto out_put_upperpath;
641
642 err = ovl_mount_dir(ufs->config.workdir, &workpath);
643 if (err)
644 goto out_put_lowerpath;
645
646 err = -EINVAL;
647 if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) ||
648 !S_ISDIR(lowerpath.dentry->d_inode->i_mode) ||
649 !S_ISDIR(workpath.dentry->d_inode->i_mode)) {
650 pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n");
651 goto out_put_workpath;
652 }
653
654 if (upperpath.mnt != workpath.mnt) {
655 pr_err("overlayfs: workdir and upperdir must reside under the same mount\n");
656 goto out_put_workpath;
657 }
658 if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) {
659 pr_err("overlayfs: workdir and upperdir must be separate subtrees\n");
660 goto out_put_workpath;
661 }
662
663 if (!ovl_is_allowed_fs_type(upperpath.dentry)) {
664 pr_err("overlayfs: filesystem of upperdir is not supported\n");
665 goto out_put_workpath;
666 }
667
668 if (!ovl_is_allowed_fs_type(lowerpath.dentry)) {
669 pr_err("overlayfs: filesystem of lowerdir is not supported\n");
670 goto out_put_workpath;
671 }
672
673 err = vfs_statfs(&lowerpath, &statfs);
674 if (err) {
675 pr_err("overlayfs: statfs failed on lowerpath\n");
676 goto out_put_workpath;
677 }
678 ufs->lower_namelen = statfs.f_namelen;
679
680 sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth,
681 lowerpath.mnt->mnt_sb->s_stack_depth) + 1;
682
683 err = -EINVAL;
684 if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) {
685 pr_err("overlayfs: maximum fs stacking depth exceeded\n");
686 goto out_put_workpath;
687 }
688
689 ufs->upper_mnt = clone_private_mount(&upperpath);
690 err = PTR_ERR(ufs->upper_mnt);
691 if (IS_ERR(ufs->upper_mnt)) {
692 pr_err("overlayfs: failed to clone upperpath\n");
693 goto out_put_workpath;
694 }
695
696 ufs->lower_mnt = clone_private_mount(&lowerpath);
697 err = PTR_ERR(ufs->lower_mnt);
698 if (IS_ERR(ufs->lower_mnt)) {
699 pr_err("overlayfs: failed to clone lowerpath\n");
700 goto out_put_upper_mnt;
701 }
702
703 ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
704 err = PTR_ERR(ufs->workdir);
705 if (IS_ERR(ufs->workdir)) {
706 pr_err("overlayfs: failed to create directory %s/%s\n",
707 ufs->config.workdir, OVL_WORKDIR_NAME);
708 goto out_put_lower_mnt;
709 }
710
711 /*
712 * Make lower_mnt R/O. That way fchmod/fchown on lower file
713 * will fail instead of modifying lower fs.
714 */
715 ufs->lower_mnt->mnt_flags |= MNT_READONLY;
716
717 /* If the upper fs is r/o, we mark overlayfs r/o too */
718 if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)
719 sb->s_flags |= MS_RDONLY;
720
721 sb->s_d_op = &ovl_dentry_operations;
722
723 err = -ENOMEM;
724 root_inode = ovl_new_inode(sb, S_IFDIR, oe);
725 if (!root_inode)
726 goto out_put_workdir;
727
728 root_dentry = d_make_root(root_inode);
729 if (!root_dentry)
730 goto out_put_workdir;
731
732 mntput(upperpath.mnt);
733 mntput(lowerpath.mnt);
734 path_put(&workpath);
735
736 oe->__upperdentry = upperpath.dentry;
737 oe->lowerdentry = lowerpath.dentry;
738
739 root_dentry->d_fsdata = oe;
740
741 sb->s_magic = OVERLAYFS_SUPER_MAGIC;
742 sb->s_op = &ovl_super_operations;
743 sb->s_root = root_dentry;
744 sb->s_fs_info = ufs;
745
746 return 0;
747
748out_put_workdir:
749 dput(ufs->workdir);
750out_put_lower_mnt:
751 mntput(ufs->lower_mnt);
752out_put_upper_mnt:
753 mntput(ufs->upper_mnt);
754out_put_workpath:
755 path_put(&workpath);
756out_put_lowerpath:
757 path_put(&lowerpath);
758out_put_upperpath:
759 path_put(&upperpath);
760out_free_oe:
761 kfree(oe);
762out_free_config:
763 kfree(ufs->config.lowerdir);
764 kfree(ufs->config.upperdir);
765 kfree(ufs->config.workdir);
766 kfree(ufs);
767out:
768 return err;
769}
770
771static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
772 const char *dev_name, void *raw_data)
773{
774 return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
775}
776
777static struct file_system_type ovl_fs_type = {
778 .owner = THIS_MODULE,
779 .name = "overlayfs",
780 .mount = ovl_mount,
781 .kill_sb = kill_anon_super,
782};
783MODULE_ALIAS_FS("overlayfs");
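
With the type registered as "overlayfs", a mount supplies the three directories that ovl_parse_opt() expects. A userspace sketch; all four paths are placeholders and must already exist, with upperdir and workdir on the same writable filesystem:

	#include <sys/mount.h>

	int main(void)
	{
		if (mount("overlayfs", "/merged", "overlayfs", 0,
			  "lowerdir=/lower,upperdir=/upper,workdir=/work") != 0)
			return 1;
		return 0;
	}
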
784
785static int __init ovl_init(void)
786{
787 return register_filesystem(&ovl_fs_type);
788}
789
790static void __exit ovl_exit(void)
791{
792 unregister_filesystem(&ovl_fs_type);
793}
794
795module_init(ovl_init);
796module_exit(ovl_exit);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 8b663b2d9562..6b4527216a7f 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -634,7 +634,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
634 dqstats_inc(DQST_LOOKUPS); 634 dqstats_inc(DQST_LOOKUPS);
635 err = sb->dq_op->write_dquot(dquot); 635 err = sb->dq_op->write_dquot(dquot);
636 if (!ret && err) 636 if (!ret && err)
637 err = ret; 637 ret = err;
638 dqput(dquot); 638 dqput(dquot);
639 spin_lock(&dq_list_lock); 639 spin_lock(&dq_list_lock);
640 } 640 }
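
The one-character hunk above restores the usual first-error-wins accumulation; the inverted assignment threw write errors away. The intended idiom in isolation, with invented names:

	static int write_one(int i);		/* hypothetical per-item writer */

	static int write_all(int n)
	{
		int ret = 0;
		int i;

		for (i = 0; i < n; i++) {
			int err = write_one(i);

			if (!ret && err)
				ret = err;	/* remember the first failure,
						   keep processing the rest */
		}
		return ret;
	}
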
diff --git a/fs/splice.c b/fs/splice.c
index f5cb9ba84510..75c6058eabf2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1330,6 +1330,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1330 1330
1331 return ret; 1331 return ret;
1332} 1332}
1333EXPORT_SYMBOL(do_splice_direct);
1333 1334
1334static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, 1335static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1335 struct pipe_inode_info *opipe, 1336 struct pipe_inode_info *opipe,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 92e8f99a5857..281002689d64 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1338,7 +1338,10 @@ xfs_free_file_space(
1338 goto out; 1338 goto out;
1339} 1339}
1340 1340
1341 1341/*
1342 * Preallocate and zero a range of a file. This mechanism has the allocation
1343 * semantics of fallocate and in addition converts data in the range to zeroes.
1344 */
1342int 1345int
1343xfs_zero_file_space( 1346xfs_zero_file_space(
1344 struct xfs_inode *ip, 1347 struct xfs_inode *ip,
@@ -1346,65 +1349,30 @@ xfs_zero_file_space(
1346 xfs_off_t len) 1349 xfs_off_t len)
1347{ 1350{
1348 struct xfs_mount *mp = ip->i_mount; 1351 struct xfs_mount *mp = ip->i_mount;
1349 uint granularity; 1352 uint blksize;
1350 xfs_off_t start_boundary;
1351 xfs_off_t end_boundary;
1352 int error; 1353 int error;
1353 1354
1354 trace_xfs_zero_file_space(ip); 1355 trace_xfs_zero_file_space(ip);
1355 1356
1356 granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 1357 blksize = 1 << mp->m_sb.sb_blocklog;
1357 1358
1358 /* 1359 /*
1359 * Round the range of extents we are going to convert inwards. If the 1360 * Punch a hole and prealloc the range. We use hole punch rather than
1360 * offset is aligned, then it doesn't get changed so we zero from the 1361 * unwritten extent conversion for two reasons:
1361 * start of the block offset points to. 1362 *
1363 * 1.) Hole punch handles partial block zeroing for us.
1364 *
1365 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued
1366 * by virtue of the hole punch.
1362 */ 1367 */
1363 start_boundary = round_up(offset, granularity); 1368 error = xfs_free_file_space(ip, offset, len);
1364 end_boundary = round_down(offset + len, granularity); 1369 if (error)
1365 1370 goto out;
1366 ASSERT(start_boundary >= offset);
1367 ASSERT(end_boundary <= offset + len);
1368
1369 if (start_boundary < end_boundary - 1) {
1370 /*
1371 * Writeback the range to ensure any inode size updates due to
1372 * appending writes make it to disk (otherwise we could just
1373 * punch out the delalloc blocks).
1374 */
1375 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1376 start_boundary, end_boundary - 1);
1377 if (error)
1378 goto out;
1379 truncate_pagecache_range(VFS_I(ip), start_boundary,
1380 end_boundary - 1);
1381
1382 /* convert the blocks */
1383 error = xfs_alloc_file_space(ip, start_boundary,
1384 end_boundary - start_boundary - 1,
1385 XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT);
1386 if (error)
1387 goto out;
1388
1389 /* We've handled the interior of the range, now for the edges */
1390 if (start_boundary != offset) {
1391 error = xfs_iozero(ip, offset, start_boundary - offset);
1392 if (error)
1393 goto out;
1394 }
1395
1396 if (end_boundary != offset + len)
1397 error = xfs_iozero(ip, end_boundary,
1398 offset + len - end_boundary);
1399
1400 } else {
1401 /*
1402 * It's either a sub-granularity range or the range spanned lies
1403 * partially across two adjacent blocks.
1404 */
1405 error = xfs_iozero(ip, offset, len);
1406 }
1407 1371
1372 error = xfs_alloc_file_space(ip, round_down(offset, blksize),
1373 round_up(offset + len, blksize) -
1374 round_down(offset, blksize),
1375 XFS_BMAPI_PREALLOC);
1408out: 1376out:
1409 return error; 1377 return error;
1410 1378
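
The rewritten xfs_zero_file_space() services zero-range preallocation; the userspace-facing form of the same operation is fallocate(2) with FALLOC_FL_ZERO_RANGE. A sketch, with an invented wrapper name:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>

	static int zero_range(int fd, off_t offset, off_t len)
	{
		/* allocate the range and guarantee it reads back as zeroes */
		return fallocate(fd, FALLOC_FL_ZERO_RANGE, offset, len);
	}
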
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f1deb961a296..894924a5129b 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -236,8 +236,10 @@ xfs_bulkstat_grab_ichunk(
236 XFS_WANT_CORRUPTED_RETURN(stat == 1); 236 XFS_WANT_CORRUPTED_RETURN(stat == 1);
237 237
238 /* Check if the record contains the inode in request */ 238 /* Check if the record contains the inode in request */
239 if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) 239 if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
240 return -EINVAL; 240 *icount = 0;
241 return 0;
242 }
241 243
242 idx = agino - irec->ir_startino + 1; 244 idx = agino - irec->ir_startino + 1;
243 if (idx < XFS_INODES_PER_CHUNK && 245 if (idx < XFS_INODES_PER_CHUNK &&
@@ -262,75 +264,76 @@ xfs_bulkstat_grab_ichunk(
262 264
263#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) 265#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size)
264 266
267struct xfs_bulkstat_agichunk {
268 char __user **ac_ubuffer;/* pointer into user's buffer */
269 int ac_ubleft; /* bytes left in user's buffer */
270 int ac_ubelem; /* spaces used in user's buffer */
271};
272
265/* 273/*
266 * Process inodes in chunk with a pointer to a formatter function 274 * Process inodes in chunk with a pointer to a formatter function
267 * that will iget the inode and fill in the appropriate structure. 275 * that will iget the inode and fill in the appropriate structure.
268 */ 276 */
269int 277static int
270xfs_bulkstat_ag_ichunk( 278xfs_bulkstat_ag_ichunk(
271 struct xfs_mount *mp, 279 struct xfs_mount *mp,
272 xfs_agnumber_t agno, 280 xfs_agnumber_t agno,
273 struct xfs_inobt_rec_incore *irbp, 281 struct xfs_inobt_rec_incore *irbp,
274 bulkstat_one_pf formatter, 282 bulkstat_one_pf formatter,
275 size_t statstruct_size, 283 size_t statstruct_size,
276 struct xfs_bulkstat_agichunk *acp) 284 struct xfs_bulkstat_agichunk *acp,
285 xfs_agino_t *last_agino)
277{ 286{
278 xfs_ino_t lastino = acp->ac_lastino;
279 char __user **ubufp = acp->ac_ubuffer; 287 char __user **ubufp = acp->ac_ubuffer;
280 int ubleft = acp->ac_ubleft; 288 int chunkidx;
281 int ubelem = acp->ac_ubelem;
282 int chunkidx, clustidx;
283 int error = 0; 289 int error = 0;
284 xfs_agino_t agino; 290 xfs_agino_t agino = irbp->ir_startino;
285 291
286 for (agino = irbp->ir_startino, chunkidx = clustidx = 0; 292 for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK;
287 XFS_BULKSTAT_UBLEFT(ubleft) && 293 chunkidx++, agino++) {
288 irbp->ir_freecount < XFS_INODES_PER_CHUNK; 294 int fmterror;
289 chunkidx++, clustidx++, agino++) {
290 int fmterror; /* bulkstat formatter result */
291 int ubused; 295 int ubused;
292 xfs_ino_t ino = XFS_AGINO_TO_INO(mp, agno, agino);
293 296
294 ASSERT(chunkidx < XFS_INODES_PER_CHUNK); 297 /* inode won't fit in buffer, we are done */
298 if (acp->ac_ubleft < statstruct_size)
299 break;
295 300
296 /* Skip if this inode is free */ 301 /* Skip if this inode is free */
297 if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { 302 if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free)
298 lastino = ino;
299 continue; 303 continue;
300 }
301
302 /*
303 * Count used inodes as free so we can tell when the
304 * chunk is used up.
305 */
306 irbp->ir_freecount++;
307 304
308 /* Get the inode and fill in a single buffer */ 305 /* Get the inode and fill in a single buffer */
309 ubused = statstruct_size; 306 ubused = statstruct_size;
310 error = formatter(mp, ino, *ubufp, ubleft, &ubused, &fmterror); 307 error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino),
311 if (fmterror == BULKSTAT_RV_NOTHING) { 308 *ubufp, acp->ac_ubleft, &ubused, &fmterror);
312 if (error && error != -ENOENT && error != -EINVAL) { 309
313 ubleft = 0; 310 if (fmterror == BULKSTAT_RV_GIVEUP ||
314 break; 311 (error && error != -ENOENT && error != -EINVAL)) {
315 } 312 acp->ac_ubleft = 0;
316 lastino = ino;
317 continue;
318 }
319 if (fmterror == BULKSTAT_RV_GIVEUP) {
320 ubleft = 0;
321 ASSERT(error); 313 ASSERT(error);
322 break; 314 break;
323 } 315 }
324 if (*ubufp) 316
325 *ubufp += ubused; 317 /* be careful not to leak error if at end of chunk */
326 ubleft -= ubused; 318 if (fmterror == BULKSTAT_RV_NOTHING || error) {
327 ubelem++; 319 error = 0;
328 lastino = ino; 320 continue;
321 }
322
323 *ubufp += ubused;
324 acp->ac_ubleft -= ubused;
325 acp->ac_ubelem++;
329 } 326 }
330 327
331 acp->ac_lastino = lastino; 328 /*
332 acp->ac_ubleft = ubleft; 329 * Post-update *last_agino. At this point, agino will always point one
333 acp->ac_ubelem = ubelem; 330 * inode past the last inode we processed successfully. Hence we
331	 * subtract that inode when setting the *last_agino cursor so that we
332 * return the correct cookie to userspace. On the next bulkstat call,
333 * the inode under the lastino cookie will be skipped as we have already
334 * processed it here.
335 */
336 *last_agino = agino - 1;
334 337
335 return error; 338 return error;
336} 339}
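
A concrete instance of the cookie rule described in the comment above, with invented numbers: if the loop formats inodes 100..131 of a chunk and then stops, it exits with agino one past the last inode handled, so:

	xfs_agino_t agino = 132;		/* one past the last inode
						   formatted (131)          */
	xfs_agino_t last_agino = agino - 1;	/* cookie handed back: 131  */

	/* the next bulkstat call skips everything up to and including
	 * last_agino (131) and resumes exactly at inode 132              */
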
@@ -353,45 +356,33 @@ xfs_bulkstat(
353 xfs_agino_t agino; /* inode # in allocation group */ 356 xfs_agino_t agino; /* inode # in allocation group */
354 xfs_agnumber_t agno; /* allocation group number */ 357 xfs_agnumber_t agno; /* allocation group number */
355 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ 358 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
356 int end_of_ag; /* set if we've seen the ag end */
357 int error; /* error code */
358 int fmterror;/* bulkstat formatter result */
359 int i; /* loop index */
360 int icount; /* count of inodes good in irbuf */
361 size_t irbsize; /* size of irec buffer in bytes */ 359 size_t irbsize; /* size of irec buffer in bytes */
362 xfs_ino_t ino; /* inode number (filesystem) */
363 xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */
364 xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ 360 xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
365 xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */
366 xfs_ino_t lastino; /* last inode number returned */
367 int nirbuf; /* size of irbuf */ 361 int nirbuf; /* size of irbuf */
368 int rval; /* return value error code */
369 int tmp; /* result value from btree calls */
370 int ubcount; /* size of user's buffer */ 362 int ubcount; /* size of user's buffer */
371 int ubleft; /* bytes left in user's buffer */ 363 struct xfs_bulkstat_agichunk ac;
372 char __user *ubufp; /* pointer into user's buffer */ 364 int error = 0;
373 int ubelem; /* spaces used in user's buffer */
374 365
375 /* 366 /*
376 * Get the last inode value, see if there's nothing to do. 367 * Get the last inode value, see if there's nothing to do.
377 */ 368 */
378 ino = (xfs_ino_t)*lastinop; 369 agno = XFS_INO_TO_AGNO(mp, *lastinop);
379 lastino = ino; 370 agino = XFS_INO_TO_AGINO(mp, *lastinop);
380 agno = XFS_INO_TO_AGNO(mp, ino);
381 agino = XFS_INO_TO_AGINO(mp, ino);
382 if (agno >= mp->m_sb.sb_agcount || 371 if (agno >= mp->m_sb.sb_agcount ||
383 ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 372 *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) {
384 *done = 1; 373 *done = 1;
385 *ubcountp = 0; 374 *ubcountp = 0;
386 return 0; 375 return 0;
387 } 376 }
388 377
389 ubcount = *ubcountp; /* statstruct's */ 378 ubcount = *ubcountp; /* statstruct's */
390 ubleft = ubcount * statstruct_size; /* bytes */ 379 ac.ac_ubuffer = &ubuffer;
391	*ubcountp = ubelem = 0;	380	ac.ac_ubleft = ubcount * statstruct_size; /* bytes */
381 ac.ac_ubelem = 0;
382
383 *ubcountp = 0;
392 *done = 0; 384 *done = 0;
393 fmterror = 0; 385
394 ubufp = ubuffer;
395 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); 386 irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
396 if (!irbuf) 387 if (!irbuf)
397 return -ENOMEM; 388 return -ENOMEM;
@@ -402,9 +393,13 @@ xfs_bulkstat(
402 * Loop over the allocation groups, starting from the last 393 * Loop over the allocation groups, starting from the last
403 * inode returned; 0 means start of the allocation group. 394 * inode returned; 0 means start of the allocation group.
404 */ 395 */
405 rval = 0; 396 while (agno < mp->m_sb.sb_agcount) {
406 while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { 397 struct xfs_inobt_rec_incore *irbp = irbuf;
407 cond_resched(); 398 struct xfs_inobt_rec_incore *irbufend = irbuf + nirbuf;
399 bool end_of_ag = false;
400 int icount = 0;
401 int stat;
402
408 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 403 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
409 if (error) 404 if (error)
410 break; 405 break;
@@ -414,10 +409,6 @@ xfs_bulkstat(
414 */ 409 */
415 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, 410 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
416 XFS_BTNUM_INO); 411 XFS_BTNUM_INO);
417 irbp = irbuf;
418 irbufend = irbuf + nirbuf;
419 end_of_ag = 0;
420 icount = 0;
421 if (agino > 0) { 412 if (agino > 0) {
422 /* 413 /*
423 * In the middle of an allocation group, we need to get 414 * In the middle of an allocation group, we need to get
@@ -427,22 +418,23 @@ xfs_bulkstat(
427 418
428 error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r); 419 error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
429 if (error) 420 if (error)
430 break; 421 goto del_cursor;
431 if (icount) { 422 if (icount) {
432 irbp->ir_startino = r.ir_startino; 423 irbp->ir_startino = r.ir_startino;
433 irbp->ir_freecount = r.ir_freecount; 424 irbp->ir_freecount = r.ir_freecount;
434 irbp->ir_free = r.ir_free; 425 irbp->ir_free = r.ir_free;
435 irbp++; 426 irbp++;
436 agino = r.ir_startino + XFS_INODES_PER_CHUNK;
437 } 427 }
438 /* Increment to the next record */ 428 /* Increment to the next record */
439 error = xfs_btree_increment(cur, 0, &tmp); 429 error = xfs_btree_increment(cur, 0, &stat);
440 } else { 430 } else {
441 /* Start of ag. Lookup the first inode chunk */ 431 /* Start of ag. Lookup the first inode chunk */
442 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); 432 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
433 }
434 if (error || stat == 0) {
435 end_of_ag = true;
436 goto del_cursor;
443 } 437 }
444 if (error)
445 break;
446 438
447 /* 439 /*
448 * Loop through inode btree records in this ag, 440 * Loop through inode btree records in this ag,
@@ -451,10 +443,10 @@ xfs_bulkstat(
451 while (irbp < irbufend && icount < ubcount) { 443 while (irbp < irbufend && icount < ubcount) {
452 struct xfs_inobt_rec_incore r; 444 struct xfs_inobt_rec_incore r;
453 445
454 error = xfs_inobt_get_rec(cur, &r, &i); 446 error = xfs_inobt_get_rec(cur, &r, &stat);
455 if (error || i == 0) { 447 if (error || stat == 0) {
456 end_of_ag = 1; 448 end_of_ag = true;
457 break; 449 goto del_cursor;
458 } 450 }
459 451
460 /* 452 /*
@@ -469,77 +461,79 @@ xfs_bulkstat(
469 irbp++; 461 irbp++;
470 icount += XFS_INODES_PER_CHUNK - r.ir_freecount; 462 icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
471 } 463 }
472 /* 464 error = xfs_btree_increment(cur, 0, &stat);
473 * Set agino to after this chunk and bump the cursor. 465 if (error || stat == 0) {
474 */ 466 end_of_ag = true;
475 agino = r.ir_startino + XFS_INODES_PER_CHUNK; 467 goto del_cursor;
476 error = xfs_btree_increment(cur, 0, &tmp); 468 }
477 cond_resched(); 469 cond_resched();
478 } 470 }
471
479 /* 472 /*
480 * Drop the btree buffers and the agi buffer. 473 * Drop the btree buffers and the agi buffer as we can't hold any
481 * We can't hold any of the locks these represent 474 * of the locks these represent when calling iget. If there is a
482 * when calling iget. 475 * pending error, then we are done.
483 */ 476 */
477del_cursor:
484 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 478 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
485 xfs_buf_relse(agbp); 479 xfs_buf_relse(agbp);
480 if (error)
481 break;
486 /* 482 /*
487 * Now format all the good inodes into the user's buffer. 483 * Now format all the good inodes into the user's buffer. The
484 * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer
485 * for the next loop iteration.
488 */ 486 */
489 irbufend = irbp; 487 irbufend = irbp;
490 for (irbp = irbuf; 488 for (irbp = irbuf;
491 irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) { 489 irbp < irbufend && ac.ac_ubleft >= statstruct_size;
492 struct xfs_bulkstat_agichunk ac; 490 irbp++) {
493
494 ac.ac_lastino = lastino;
495 ac.ac_ubuffer = &ubuffer;
496 ac.ac_ubleft = ubleft;
497 ac.ac_ubelem = ubelem;
498 error = xfs_bulkstat_ag_ichunk(mp, agno, irbp, 491 error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
499 formatter, statstruct_size, &ac); 492 formatter, statstruct_size, &ac,
493 &agino);
500 if (error) 494 if (error)
501 rval = error; 495 break;
502
503 lastino = ac.ac_lastino;
504 ubleft = ac.ac_ubleft;
505 ubelem = ac.ac_ubelem;
506 496
507 cond_resched(); 497 cond_resched();
508 } 498 }
499
509 /* 500 /*
510 * Set up for the next loop iteration. 501 * If we've run out of space or had a formatting error, we
502 * are now done
511 */ 503 */
512 if (XFS_BULKSTAT_UBLEFT(ubleft)) { 504 if (ac.ac_ubleft < statstruct_size || error)
513 if (end_of_ag) {
514 agno++;
515 agino = 0;
516 } else
517 agino = XFS_INO_TO_AGINO(mp, lastino);
518 } else
519 break; 505 break;
506
507 if (end_of_ag) {
508 agno++;
509 agino = 0;
510 }
520 } 511 }
521 /* 512 /*
522 * Done, we're either out of filesystem or space to put the data. 513 * Done, we're either out of filesystem or space to put the data.
523 */ 514 */
524 kmem_free(irbuf); 515 kmem_free(irbuf);
525 *ubcountp = ubelem; 516 *ubcountp = ac.ac_ubelem;
517
526 /* 518 /*
527 * Found some inodes, return them now and return the error next time. 519 * We found some inodes, so clear the error status and return them.
520 * The lastino pointer will point directly at the inode that triggered
521 * any error that occurred, so on the next call the error will be
522 * triggered again and propagated to userspace as there will be no
523 * formatted inodes in the buffer.
528 */ 524 */
529 if (ubelem) 525 if (ac.ac_ubelem)
530 rval = 0; 526 error = 0;
531 if (agno >= mp->m_sb.sb_agcount) { 527
532 /* 528 /*
533 * If we ran out of filesystem, mark lastino as off 529 * If we ran out of filesystem, lastino will point off the end of
534 * the end of the filesystem, so the next call 530 * the filesystem so the next call will return immediately.
535 * will return immediately. 531 */
536 */ 532 *lastinop = XFS_AGINO_TO_INO(mp, agno, agino);
537 *lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0); 533 if (agno >= mp->m_sb.sb_agcount)
538 *done = 1; 534 *done = 1;
539 } else
540 *lastinop = (xfs_ino_t)lastino;
541 535
542 return rval; 536 return error;
543} 537}
544 538
545int 539int
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index aaed08022eb9..6ea8b3912fa4 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -30,22 +30,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
30 int *ubused, 30 int *ubused,
31 int *stat); 31 int *stat);
32 32
33struct xfs_bulkstat_agichunk {
34 xfs_ino_t ac_lastino; /* last inode returned */
35 char __user **ac_ubuffer;/* pointer into user's buffer */
36 int ac_ubleft; /* bytes left in user's buffer */
37 int ac_ubelem; /* spaces used in user's buffer */
38};
39
40int
41xfs_bulkstat_ag_ichunk(
42 struct xfs_mount *mp,
43 xfs_agnumber_t agno,
44 struct xfs_inobt_rec_incore *irbp,
45 bulkstat_one_pf formatter,
46 size_t statstruct_size,
47 struct xfs_bulkstat_agichunk *acp);
48
49/* 33/*
50 * Values for stat return value. 34 * Values for stat return value.
51 */ 35 */