author     Linus Torvalds <torvalds@linux-foundation.org>  2016-03-17 19:31:18 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-03-17 19:31:18 -0400
commit     faeb20ecfa398b043c3224607f512c009c51653d (patch)
tree       ffd185ffb5e499a76f261c700de72241e6781ecf
parent     364e8dd9d636fea7def862919aac092b19b7c581 (diff)
parent     0304688676bdfc8159e165313d71da19c118ba27 (diff)
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:
 "Performance improvements in SEEK_DATA and xattr scalability
  improvements, plus a lot of clean ups and bug fixes"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (38 commits)
  ext4: clean up error handling in the MMP support
  jbd2: do not fail journal because of frozen_buffer allocation failure
  ext4: use __GFP_NOFAIL in ext4_free_blocks()
  ext4: fix compile error while opening the macro DOUBLE_CHECK
  ext4: print ext4 mount option data_err=abort correctly
  ext4: fix NULL pointer dereference in ext4_mark_inode_dirty()
  ext4: drop unneeded BUFFER_TRACE in ext4_delete_inline_entry()
  ext4: fix misspellings in comments.
  jbd2: fix FS corruption possibility in jbd2_journal_destroy() on umount path
  ext4: more efficient SEEK_DATA implementation
  ext4: cleanup handling of bh->b_state in DAX mmap
  ext4: return hole from ext4_map_blocks()
  ext4: factor out determining of hole size
  ext4: fix setting of referenced bit in ext4_es_lookup_extent()
  ext4: remove i_ioend_count
  ext4: simplify io_end handling for AIO DIO
  ext4: move trans handling and completion deferal out of _ext4_get_block
  ext4: rename and split get blocks functions
  ext4: use i_mutex to serialize unaligned AIO DIO
  ext4: pack ioend structure better
  ...
-rw-r--r--  fs/ext2/ext2.h            |    3
-rw-r--r--  fs/ext2/super.c           |   25
-rw-r--r--  fs/ext2/xattr.c           |  139
-rw-r--r--  fs/ext2/xattr.h           |   21
-rw-r--r--  fs/ext4/ext4.h            |   45
-rw-r--r--  fs/ext4/ext4_extents.h    |    2
-rw-r--r--  fs/ext4/extents.c         |  128
-rw-r--r--  fs/ext4/extents_status.c  |    4
-rw-r--r--  fs/ext4/file.c            |  129
-rw-r--r--  fs/ext4/ialloc.c          |    2
-rw-r--r--  fs/ext4/indirect.c        |   29
-rw-r--r--  fs/ext4/inline.c          |    8
-rw-r--r--  fs/ext4/inode.c           |  388
-rw-r--r--  fs/ext4/mballoc.c         |   81
-rw-r--r--  fs/ext4/mballoc.h         |   12
-rw-r--r--  fs/ext4/migrate.c         |    2
-rw-r--r--  fs/ext4/mmp.c             |   34
-rw-r--r--  fs/ext4/page-io.c         |    4
-rw-r--r--  fs/ext4/super.c           |   35
-rw-r--r--  fs/ext4/xattr.c           |  166
-rw-r--r--  fs/ext4/xattr.h           |    3
-rw-r--r--  fs/jbd2/commit.c          |   49
-rw-r--r--  fs/jbd2/journal.c         |   43
-rw-r--r--  fs/jbd2/recovery.c        |   31
-rw-r--r--  fs/jbd2/revoke.c          |   60
-rw-r--r--  fs/jbd2/transaction.c     |   22
-rw-r--r--  fs/mbcache.c              | 1093
-rw-r--r--  include/linux/jbd2.h      |   16
-rw-r--r--  include/linux/mbcache.h   |   93
29 files changed, 1149 insertions, 1518 deletions
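
A large part of the xattr scalability work in this pull is the fs/mbcache.c rewrite: the old global "ext2_xattr"/"ext4_xattr" caches become one mb_cache per super_block with a much smaller entry API. The sketch below is an illustrative summary only, not part of the patch; it mirrors how the ext2 hunks later in this diff drive the new API. The example_*() wrappers and the entry_matches() predicate are hypothetical, and the call shapes are taken as they appear in this diff.

#include <linux/mbcache.h>
#include <linux/buffer_head.h>

static struct mb_cache *cache;	/* kept per filesystem in practice, e.g. sbi->s_mb_cache */

static bool entry_matches(struct mb_cache_entry *ce);	/* hypothetical "does this block match" check */

static int example_mount(void)
{
	cache = mb_cache_create(10);	/* 2^10 hash buckets, the value ext2 uses below */
	return cache ? 0 : -ENOMEM;
}

static void example_insert(struct buffer_head *bh, u32 hash)
{
	/* A duplicate (hash, block) pair comes back as -EBUSY and is simply ignored. */
	if (mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1) == -EBUSY)
		return;
}

static void example_free_block(struct buffer_head *bh, u32 hash)
{
	/*
	 * Unhash under the buffer lock so that a concurrent lookup, which also
	 * takes the buffer lock, can reliably detect a freed or rehashed block.
	 */
	lock_buffer(bh);
	mb_cache_entry_delete_block(cache, hash, bh->b_blocknr);
	unlock_buffer(bh);
}

static sector_t example_lookup(u32 hash)
{
	struct mb_cache_entry *ce;

	ce = mb_cache_entry_find_first(cache, hash);
	while (ce) {
		if (entry_matches(ce)) {
			sector_t block = ce->e_block;

			mb_cache_entry_touch(cache, ce);	/* keep the entry warm in the LRU */
			mb_cache_entry_put(cache, ce);		/* drop the lookup reference */
			return block;
		}
		/* find_next() releases @ce and returns the next entry with this hash */
		ce = mb_cache_entry_find_next(cache, ce);
	}
	return 0;
}

static void example_unmount(void)
{
	mb_cache_destroy(cache);
}
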
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 4c69c94cafd8..170939f379d7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -61,6 +61,8 @@ struct ext2_block_alloc_info {
 #define rsv_start rsv_window._rsv_start
 #define rsv_end rsv_window._rsv_end
 
+struct mb_cache;
+
 /*
  * second extended-fs super-block data in memory
  */
@@ -111,6 +113,7 @@ struct ext2_sb_info {
 	 * of the mount options.
 	 */
 	spinlock_t s_lock;
+	struct mb_cache *s_mb_cache;
 };
 
 static inline spinlock_t *
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 2a188413a2b0..b78caf25f746 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -131,7 +131,10 @@ static void ext2_put_super (struct super_block * sb)
 
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
-	ext2_xattr_put_super(sb);
+	if (sbi->s_mb_cache) {
+		ext2_xattr_destroy_cache(sbi->s_mb_cache);
+		sbi->s_mb_cache = NULL;
+	}
 	if (!(sb->s_flags & MS_RDONLY)) {
 		struct ext2_super_block *es = sbi->s_es;
 
@@ -1104,6 +1107,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		ext2_msg(sb, KERN_ERR, "error: insufficient memory");
 		goto failed_mount3;
 	}
+
+#ifdef CONFIG_EXT2_FS_XATTR
+	sbi->s_mb_cache = ext2_xattr_create_cache();
+	if (!sbi->s_mb_cache) {
+		ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+		goto failed_mount3;
+	}
+#endif
 	/*
 	 * set up enough so that it can read an inode
 	 */
@@ -1149,6 +1160,8 @@ cantfind_ext2:
 		 sb->s_id);
 	goto failed_mount;
 failed_mount3:
+	if (sbi->s_mb_cache)
+		ext2_xattr_destroy_cache(sbi->s_mb_cache);
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -1555,20 +1568,17 @@ MODULE_ALIAS_FS("ext2");
 
 static int __init init_ext2_fs(void)
 {
-	int err = init_ext2_xattr();
-	if (err)
-		return err;
+	int err;
+
 	err = init_inodecache();
 	if (err)
-		goto out1;
+		return err;
 	err = register_filesystem(&ext2_fs_type);
 	if (err)
 		goto out;
 	return 0;
 out:
 	destroy_inodecache();
-out1:
-	exit_ext2_xattr();
 	return err;
 }
 
@@ -1576,7 +1586,6 @@ static void __exit exit_ext2_fs(void)
 {
 	unregister_filesystem(&ext2_fs_type);
 	destroy_inodecache();
-	exit_ext2_xattr();
 }
 
 MODULE_AUTHOR("Remy Card and others");
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index f57a7aba32eb..1a5e3bff0b63 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -90,14 +90,12 @@
 static int ext2_xattr_set2(struct inode *, struct buffer_head *,
 			   struct ext2_xattr_header *);
 
-static int ext2_xattr_cache_insert(struct buffer_head *);
+static int ext2_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
 static struct buffer_head *ext2_xattr_cache_find(struct inode *,
 						 struct ext2_xattr_header *);
 static void ext2_xattr_rehash(struct ext2_xattr_header *,
 			      struct ext2_xattr_entry *);
 
-static struct mb_cache *ext2_xattr_cache;
-
 static const struct xattr_handler *ext2_xattr_handler_map[] = {
 	[EXT2_XATTR_INDEX_USER]		     = &ext2_xattr_user_handler,
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -152,6 +150,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name,
 	size_t name_len, size;
 	char *end;
 	int error;
+	struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
 
 	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
 		  name_index, name, buffer, (long)buffer_size);
@@ -196,7 +195,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
 			goto found;
 		entry = next;
 	}
-	if (ext2_xattr_cache_insert(bh))
+	if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
 		ea_idebug(inode, "cache insert failed");
 	error = -ENODATA;
 	goto cleanup;
@@ -209,7 +208,7 @@ found:
 	    le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
 		goto bad_block;
 
-	if (ext2_xattr_cache_insert(bh))
+	if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
 		ea_idebug(inode, "cache insert failed");
 	if (buffer) {
 		error = -ERANGE;
@@ -247,6 +246,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	char *end;
 	size_t rest = buffer_size;
 	int error;
+	struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
 
 	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
 		  buffer, (long)buffer_size);
@@ -281,7 +281,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
 			goto bad_block;
 		entry = next;
 	}
-	if (ext2_xattr_cache_insert(bh))
+	if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
 		ea_idebug(inode, "cache insert failed");
 
 	/* list the attribute names */
@@ -483,22 +483,23 @@ bad_block: ext2_error(sb, "ext2_xattr_set",
 	/* Here we know that we can set the new attribute. */
 
 	if (header) {
-		struct mb_cache_entry *ce;
-
 		/* assert(header == HDR(bh)); */
-		ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev,
-					bh->b_blocknr);
 		lock_buffer(bh);
 		if (header->h_refcount == cpu_to_le32(1)) {
+			__u32 hash = le32_to_cpu(header->h_hash);
+
 			ea_bdebug(bh, "modifying in-place");
-			if (ce)
-				mb_cache_entry_free(ce);
+			/*
+			 * This must happen under buffer lock for
+			 * ext2_xattr_set2() to reliably detect modified block
+			 */
+			mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
+						    hash, bh->b_blocknr);
+
 			/* keep the buffer locked while modifying it. */
 		} else {
 			int offset;
 
-			if (ce)
-				mb_cache_entry_release(ce);
 			unlock_buffer(bh);
 			ea_bdebug(bh, "cloning");
 			header = kmalloc(bh->b_size, GFP_KERNEL);
@@ -626,6 +627,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *new_bh = NULL;
 	int error;
+	struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache;
 
 	if (header) {
 		new_bh = ext2_xattr_cache_find(inode, header);
@@ -653,7 +655,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 			   don't need to change the reference count. */
 			new_bh = old_bh;
 			get_bh(new_bh);
-			ext2_xattr_cache_insert(new_bh);
+			ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
 		} else {
 			/* We need to allocate a new block */
 			ext2_fsblk_t goal = ext2_group_first_block_no(sb,
@@ -674,7 +676,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 		memcpy(new_bh->b_data, header, new_bh->b_size);
 		set_buffer_uptodate(new_bh);
 		unlock_buffer(new_bh);
-		ext2_xattr_cache_insert(new_bh);
+		ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
 
 		ext2_xattr_update_super_block(sb);
 	}
@@ -707,19 +709,21 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 
 	error = 0;
 	if (old_bh && old_bh != new_bh) {
-		struct mb_cache_entry *ce;
-
 		/*
 		 * If there was an old block and we are no longer using it,
 		 * release the old block.
 		 */
-		ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev,
-					old_bh->b_blocknr);
 		lock_buffer(old_bh);
 		if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
+			__u32 hash = le32_to_cpu(HDR(old_bh)->h_hash);
+
+			/*
+			 * This must happen under buffer lock for
+			 * ext2_xattr_set2() to reliably detect freed block
+			 */
+			mb_cache_entry_delete_block(ext2_mb_cache,
+						    hash, old_bh->b_blocknr);
 			/* Free the old block. */
-			if (ce)
-				mb_cache_entry_free(ce);
 			ea_bdebug(old_bh, "freeing");
 			ext2_free_blocks(inode, old_bh->b_blocknr, 1);
 			mark_inode_dirty(inode);
@@ -730,8 +734,6 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 		} else {
 			/* Decrement the refcount only. */
 			le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
-			if (ce)
-				mb_cache_entry_release(ce);
 			dquot_free_block_nodirty(inode, 1);
 			mark_inode_dirty(inode);
 			mark_buffer_dirty(old_bh);
@@ -757,7 +759,6 @@ void
 ext2_xattr_delete_inode(struct inode *inode)
 {
 	struct buffer_head *bh = NULL;
-	struct mb_cache_entry *ce;
 
 	down_write(&EXT2_I(inode)->xattr_sem);
 	if (!EXT2_I(inode)->i_file_acl)
@@ -777,19 +778,22 @@ ext2_xattr_delete_inode(struct inode *inode)
 			   EXT2_I(inode)->i_file_acl);
 		goto cleanup;
 	}
-	ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr);
 	lock_buffer(bh);
 	if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
-		if (ce)
-			mb_cache_entry_free(ce);
+		__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
+
+		/*
+		 * This must happen under buffer lock for ext2_xattr_set2() to
+		 * reliably detect freed block
+		 */
+		mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
+					    hash, bh->b_blocknr);
 		ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
 		get_bh(bh);
 		bforget(bh);
 		unlock_buffer(bh);
 	} else {
 		le32_add_cpu(&HDR(bh)->h_refcount, -1);
-		if (ce)
-			mb_cache_entry_release(ce);
 		ea_bdebug(bh, "refcount now=%d",
 			le32_to_cpu(HDR(bh)->h_refcount));
 		unlock_buffer(bh);
@@ -806,18 +810,6 @@ cleanup:
 }
 
 /*
- * ext2_xattr_put_super()
- *
- * This is called when a file system is unmounted.
- */
-void
-ext2_xattr_put_super(struct super_block *sb)
-{
-	mb_cache_shrink(sb->s_bdev);
-}
-
-
-/*
  * ext2_xattr_cache_insert()
  *
  * Create a new entry in the extended attribute cache, and insert
@@ -826,28 +818,20 @@ ext2_xattr_put_super(struct super_block *sb)
  * Returns 0, or a negative error number on failure.
  */
 static int
-ext2_xattr_cache_insert(struct buffer_head *bh)
+ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
 {
 	__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
-	struct mb_cache_entry *ce;
 	int error;
 
-	ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
-	if (!ce)
-		return -ENOMEM;
-	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
+	error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1);
 	if (error) {
-		mb_cache_entry_free(ce);
 		if (error == -EBUSY) {
 			ea_bdebug(bh, "already in cache (%d cache entries)",
 				atomic_read(&ext2_xattr_cache->c_entry_count));
 			error = 0;
 		}
-	} else {
-		ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
-			  atomic_read(&ext2_xattr_cache->c_entry_count));
-		mb_cache_entry_release(ce);
-	}
+	} else
+		ea_bdebug(bh, "inserting [%x]", (int)hash);
 	return error;
 }
 
@@ -904,22 +888,16 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
 {
 	__u32 hash = le32_to_cpu(header->h_hash);
 	struct mb_cache_entry *ce;
+	struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
 
 	if (!header->h_hash)
 		return NULL;  /* never share */
 	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
 again:
-	ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
-				       hash);
+	ce = mb_cache_entry_find_first(ext2_mb_cache, hash);
 	while (ce) {
 		struct buffer_head *bh;
 
-		if (IS_ERR(ce)) {
-			if (PTR_ERR(ce) == -EAGAIN)
-				goto again;
-			break;
-		}
-
 		bh = sb_bread(inode->i_sb, ce->e_block);
 		if (!bh) {
 			ext2_error(inode->i_sb, "ext2_xattr_cache_find",
@@ -927,7 +905,21 @@ again:
 				inode->i_ino, (unsigned long) ce->e_block);
 		} else {
 			lock_buffer(bh);
-			if (le32_to_cpu(HDR(bh)->h_refcount) >
+			/*
+			 * We have to be careful about races with freeing or
+			 * rehashing of xattr block. Once we hold buffer lock
+			 * xattr block's state is stable so we can check
+			 * whether the block got freed / rehashed or not.
+			 * Since we unhash mbcache entry under buffer lock when
+			 * freeing / rehashing xattr block, checking whether
+			 * entry is still hashed is reliable.
+			 */
+			if (hlist_bl_unhashed(&ce->e_hash_list)) {
+				mb_cache_entry_put(ext2_mb_cache, ce);
+				unlock_buffer(bh);
+				brelse(bh);
+				goto again;
+			} else if (le32_to_cpu(HDR(bh)->h_refcount) >
 				   EXT2_XATTR_REFCOUNT_MAX) {
 				ea_idebug(inode, "block %ld refcount %d>%d",
 					  (unsigned long) ce->e_block,
@@ -936,13 +928,14 @@ again:
 			} else if (!ext2_xattr_cmp(header, HDR(bh))) {
 				ea_bdebug(bh, "b_count=%d",
 					  atomic_read(&(bh->b_count)));
-				mb_cache_entry_release(ce);
+				mb_cache_entry_touch(ext2_mb_cache, ce);
+				mb_cache_entry_put(ext2_mb_cache, ce);
 				return bh;
 			}
 			unlock_buffer(bh);
 			brelse(bh);
 		}
-		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
+		ce = mb_cache_entry_find_next(ext2_mb_cache, ce);
 	}
 	return NULL;
 }
@@ -1015,17 +1008,15 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header,
 
 #undef BLOCK_HASH_SHIFT
 
-int __init
-init_ext2_xattr(void)
+#define HASH_BUCKET_BITS 10
+
+struct mb_cache *ext2_xattr_create_cache(void)
 {
-	ext2_xattr_cache = mb_cache_create("ext2_xattr", 6);
-	if (!ext2_xattr_cache)
-		return -ENOMEM;
-	return 0;
+	return mb_cache_create(HASH_BUCKET_BITS);
 }
 
-void
-exit_ext2_xattr(void)
+void ext2_xattr_destroy_cache(struct mb_cache *cache)
 {
-	mb_cache_destroy(ext2_xattr_cache);
+	if (cache)
+		mb_cache_destroy(cache);
 }
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 60edf298644e..6f82ab1b00ca 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -53,6 +53,8 @@ struct ext2_xattr_entry {
 #define EXT2_XATTR_SIZE(size) \
 	(((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)
 
+struct mb_cache;
+
 # ifdef CONFIG_EXT2_FS_XATTR
 
 extern const struct xattr_handler ext2_xattr_user_handler;
@@ -65,10 +67,9 @@ extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
 extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
 
 extern void ext2_xattr_delete_inode(struct inode *);
-extern void ext2_xattr_put_super(struct super_block *);
 
-extern int init_ext2_xattr(void);
-extern void exit_ext2_xattr(void);
+extern struct mb_cache *ext2_xattr_create_cache(void);
+extern void ext2_xattr_destroy_cache(struct mb_cache *cache);
 
 extern const struct xattr_handler *ext2_xattr_handlers[];
 
@@ -93,19 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode)
 {
 }
 
-static inline void
-ext2_xattr_put_super(struct super_block *sb)
-{
-}
-
-static inline int
-init_ext2_xattr(void)
-{
-	return 0;
-}
-
-static inline void
-exit_ext2_xattr(void)
+static inline void ext2_xattr_destroy_cache(struct mb_cache *cache)
 {
 }
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 157b458a69d4..393689dfa1af 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -42,6 +42,18 @@
42 */ 42 */
43 43
44/* 44/*
45 * with AGGRESSIVE_CHECK allocator runs consistency checks over
46 * structures. these checks slow things down a lot
47 */
48#define AGGRESSIVE_CHECK__
49
50/*
51 * with DOUBLE_CHECK defined mballoc creates persistent in-core
52 * bitmaps, maintains and uses them to check for double allocations
53 */
54#define DOUBLE_CHECK__
55
56/*
45 * Define EXT4FS_DEBUG to produce debug messages 57 * Define EXT4FS_DEBUG to produce debug messages
46 */ 58 */
47#undef EXT4FS_DEBUG 59#undef EXT4FS_DEBUG
@@ -182,9 +194,9 @@ typedef struct ext4_io_end {
182 struct bio *bio; /* Linked list of completed 194 struct bio *bio; /* Linked list of completed
183 * bios covering the extent */ 195 * bios covering the extent */
184 unsigned int flag; /* unwritten or not */ 196 unsigned int flag; /* unwritten or not */
197 atomic_t count; /* reference counter */
185 loff_t offset; /* offset in the file */ 198 loff_t offset; /* offset in the file */
186 ssize_t size; /* size of the extent */ 199 ssize_t size; /* size of the extent */
187 atomic_t count; /* reference counter */
188} ext4_io_end_t; 200} ext4_io_end_t;
189 201
190struct ext4_io_submit { 202struct ext4_io_submit {
@@ -1024,13 +1036,8 @@ struct ext4_inode_info {
1024 * transaction reserved 1036 * transaction reserved
1025 */ 1037 */
1026 struct list_head i_rsv_conversion_list; 1038 struct list_head i_rsv_conversion_list;
1027 /*
1028 * Completed IOs that need unwritten extents handling and don't have
1029 * transaction reserved
1030 */
1031 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
1032 atomic_t i_unwritten; /* Nr. of inflight conversions pending */
1033 struct work_struct i_rsv_conversion_work; 1039 struct work_struct i_rsv_conversion_work;
1040 atomic_t i_unwritten; /* Nr. of inflight conversions pending */
1034 1041
1035 spinlock_t i_block_reservation_lock; 1042 spinlock_t i_block_reservation_lock;
1036 1043
@@ -1513,16 +1520,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
1513 } 1520 }
1514} 1521}
1515 1522
1516static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
1517{
1518 return inode->i_private;
1519}
1520
1521static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
1522{
1523 inode->i_private = io;
1524}
1525
1526/* 1523/*
1527 * Inode dynamic state flags 1524 * Inode dynamic state flags
1528 */ 1525 */
@@ -2506,12 +2503,14 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
2506int ext4_inode_is_fast_symlink(struct inode *inode); 2503int ext4_inode_is_fast_symlink(struct inode *inode);
2507struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); 2504struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
2508struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); 2505struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
2509int ext4_get_block_write(struct inode *inode, sector_t iblock, 2506int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
2510 struct buffer_head *bh_result, int create); 2507 struct buffer_head *bh_result, int create);
2511int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 2508int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
2512 struct buffer_head *bh_result, int create); 2509 struct buffer_head *bh_result, int create);
2513int ext4_get_block(struct inode *inode, sector_t iblock, 2510int ext4_get_block(struct inode *inode, sector_t iblock,
2514 struct buffer_head *bh_result, int create); 2511 struct buffer_head *bh_result, int create);
2512int ext4_dio_get_block(struct inode *inode, sector_t iblock,
2513 struct buffer_head *bh_result, int create);
2515int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2514int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2516 struct buffer_head *bh, int create); 2515 struct buffer_head *bh, int create);
2517int ext4_walk_page_buffers(handle_t *handle, 2516int ext4_walk_page_buffers(handle_t *handle,
@@ -2559,6 +2558,9 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
2559 int used, int quota_claim); 2558 int used, int quota_claim);
2560extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, 2559extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
2561 ext4_fsblk_t pblk, ext4_lblk_t len); 2560 ext4_fsblk_t pblk, ext4_lblk_t len);
2561extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
2562 unsigned int map_len,
2563 struct extent_status *result);
2562 2564
2563/* indirect.c */ 2565/* indirect.c */
2564extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, 2566extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -3285,10 +3287,7 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
3285#define EXT4_WQ_HASH_SZ 37 3287#define EXT4_WQ_HASH_SZ 37
3286#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ 3288#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
3287 EXT4_WQ_HASH_SZ]) 3289 EXT4_WQ_HASH_SZ])
3288#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
3289 EXT4_WQ_HASH_SZ])
3290extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; 3290extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
3291extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
3292 3291
3293#define EXT4_RESIZING 0 3292#define EXT4_RESIZING 0
3294extern int ext4_resize_begin(struct super_block *sb); 3293extern int ext4_resize_begin(struct super_block *sb);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 3c9381547094..8ecf84b8f5a1 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -11,7 +11,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public Licens
+ * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
  */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3753ceb0b0dd..95bf4679ac54 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -15,7 +15,7 @@
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details. 16 * GNU General Public License for more details.
17 * 17 *
18 * You should have received a copy of the GNU General Public Licens 18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software 19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
21 */ 21 */
@@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1736 */ 1736 */
1737 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) 1737 if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
1738 return 0; 1738 return 0;
1739 /*
1740 * The check for IO to unwritten extent is somewhat racy as we
1741 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
1742 * dropping i_data_sem. But reserved blocks should save us in that
1743 * case.
1744 */
1739 if (ext4_ext_is_unwritten(ex1) && 1745 if (ext4_ext_is_unwritten(ex1) &&
1740 (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || 1746 (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
1741 atomic_read(&EXT4_I(inode)->i_unwritten) || 1747 atomic_read(&EXT4_I(inode)->i_unwritten) ||
@@ -2293,59 +2299,69 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
2293} 2299}
2294 2300
2295/* 2301/*
2296 * ext4_ext_put_gap_in_cache: 2302 * ext4_ext_determine_hole - determine hole around given block
2297 * calculate boundaries of the gap that the requested block fits into 2303 * @inode: inode we lookup in
2298 * and cache this gap 2304 * @path: path in extent tree to @lblk
2305 * @lblk: pointer to logical block around which we want to determine hole
2306 *
2307 * Determine hole length (and start if easily possible) around given logical
2308 * block. We don't try too hard to find the beginning of the hole but @path
2309 * actually points to extent before @lblk, we provide it.
2310 *
2311 * The function returns the length of a hole starting at @lblk. We update @lblk
2312 * to the beginning of the hole if we managed to find it.
2299 */ 2313 */
2300static void 2314static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
2301ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, 2315 struct ext4_ext_path *path,
2302 ext4_lblk_t block) 2316 ext4_lblk_t *lblk)
2303{ 2317{
2304 int depth = ext_depth(inode); 2318 int depth = ext_depth(inode);
2305 ext4_lblk_t len;
2306 ext4_lblk_t lblock;
2307 struct ext4_extent *ex; 2319 struct ext4_extent *ex;
2308 struct extent_status es; 2320 ext4_lblk_t len;
2309 2321
2310 ex = path[depth].p_ext; 2322 ex = path[depth].p_ext;
2311 if (ex == NULL) { 2323 if (ex == NULL) {
2312 /* there is no extent yet, so gap is [0;-] */ 2324 /* there is no extent yet, so gap is [0;-] */
2313 lblock = 0; 2325 *lblk = 0;
2314 len = EXT_MAX_BLOCKS; 2326 len = EXT_MAX_BLOCKS;
2315 ext_debug("cache gap(whole file):"); 2327 } else if (*lblk < le32_to_cpu(ex->ee_block)) {
2316 } else if (block < le32_to_cpu(ex->ee_block)) { 2328 len = le32_to_cpu(ex->ee_block) - *lblk;
2317 lblock = block; 2329 } else if (*lblk >= le32_to_cpu(ex->ee_block)
2318 len = le32_to_cpu(ex->ee_block) - block;
2319 ext_debug("cache gap(before): %u [%u:%u]",
2320 block,
2321 le32_to_cpu(ex->ee_block),
2322 ext4_ext_get_actual_len(ex));
2323 } else if (block >= le32_to_cpu(ex->ee_block)
2324 + ext4_ext_get_actual_len(ex)) { 2330 + ext4_ext_get_actual_len(ex)) {
2325 ext4_lblk_t next; 2331 ext4_lblk_t next;
2326 lblock = le32_to_cpu(ex->ee_block)
2327 + ext4_ext_get_actual_len(ex);
2328 2332
2333 *lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
2329 next = ext4_ext_next_allocated_block(path); 2334 next = ext4_ext_next_allocated_block(path);
2330 ext_debug("cache gap(after): [%u:%u] %u", 2335 BUG_ON(next == *lblk);
2331 le32_to_cpu(ex->ee_block), 2336 len = next - *lblk;
2332 ext4_ext_get_actual_len(ex),
2333 block);
2334 BUG_ON(next == lblock);
2335 len = next - lblock;
2336 } else { 2337 } else {
2337 BUG(); 2338 BUG();
2338 } 2339 }
2340 return len;
2341}
2339 2342
2340 ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es); 2343/*
2344 * ext4_ext_put_gap_in_cache:
2345 * calculate boundaries of the gap that the requested block fits into
2346 * and cache this gap
2347 */
2348static void
2349ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
2350 ext4_lblk_t hole_len)
2351{
2352 struct extent_status es;
2353
2354 ext4_es_find_delayed_extent_range(inode, hole_start,
2355 hole_start + hole_len - 1, &es);
2341 if (es.es_len) { 2356 if (es.es_len) {
2342 /* There's delayed extent containing lblock? */ 2357 /* There's delayed extent containing lblock? */
2343 if (es.es_lblk <= lblock) 2358 if (es.es_lblk <= hole_start)
2344 return; 2359 return;
2345 len = min(es.es_lblk - lblock, len); 2360 hole_len = min(es.es_lblk - hole_start, hole_len);
2346 } 2361 }
2347 ext_debug(" -> %u:%u\n", lblock, len); 2362 ext_debug(" -> %u:%u\n", hole_start, hole_len);
2348 ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); 2363 ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
2364 EXTENT_STATUS_HOLE);
2349} 2365}
2350 2366
2351/* 2367/*
@@ -3927,7 +3943,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
3927static int 3943static int
3928convert_initialized_extent(handle_t *handle, struct inode *inode, 3944convert_initialized_extent(handle_t *handle, struct inode *inode,
3929 struct ext4_map_blocks *map, 3945 struct ext4_map_blocks *map,
3930 struct ext4_ext_path **ppath, int flags, 3946 struct ext4_ext_path **ppath,
3931 unsigned int allocated) 3947 unsigned int allocated)
3932{ 3948{
3933 struct ext4_ext_path *path = *ppath; 3949 struct ext4_ext_path *path = *ppath;
@@ -4007,7 +4023,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4007 struct ext4_ext_path *path = *ppath; 4023 struct ext4_ext_path *path = *ppath;
4008 int ret = 0; 4024 int ret = 0;
4009 int err = 0; 4025 int err = 0;
4010 ext4_io_end_t *io = ext4_inode_aio(inode);
4011 4026
4012 ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " 4027 ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
4013 "block %llu, max_blocks %u, flags %x, allocated %u\n", 4028 "block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -4030,15 +4045,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
4030 flags | EXT4_GET_BLOCKS_CONVERT); 4045 flags | EXT4_GET_BLOCKS_CONVERT);
4031 if (ret <= 0) 4046 if (ret <= 0)
4032 goto out; 4047 goto out;
4033 /*
4034 * Flag the inode(non aio case) or end_io struct (aio case)
4035 * that this IO needs to conversion to written when IO is
4036 * completed
4037 */
4038 if (io)
4039 ext4_set_io_unwritten_flag(inode, io);
4040 else
4041 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
4042 map->m_flags |= EXT4_MAP_UNWRITTEN; 4048 map->m_flags |= EXT4_MAP_UNWRITTEN;
4043 goto out; 4049 goto out;
4044 } 4050 }
@@ -4283,9 +4289,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4283 unsigned int allocated = 0, offset = 0; 4289 unsigned int allocated = 0, offset = 0;
4284 unsigned int allocated_clusters = 0; 4290 unsigned int allocated_clusters = 0;
4285 struct ext4_allocation_request ar; 4291 struct ext4_allocation_request ar;
4286 ext4_io_end_t *io = ext4_inode_aio(inode);
4287 ext4_lblk_t cluster_offset; 4292 ext4_lblk_t cluster_offset;
4288 int set_unwritten = 0;
4289 bool map_from_cluster = false; 4293 bool map_from_cluster = false;
4290 4294
4291 ext_debug("blocks %u/%u requested for inode %lu\n", 4295 ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -4347,7 +4351,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4347 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { 4351 (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
4348 allocated = convert_initialized_extent( 4352 allocated = convert_initialized_extent(
4349 handle, inode, map, &path, 4353 handle, inode, map, &path,
4350 flags, allocated); 4354 allocated);
4351 goto out2; 4355 goto out2;
4352 } else if (!ext4_ext_is_unwritten(ex)) 4356 } else if (!ext4_ext_is_unwritten(ex))
4353 goto out; 4357 goto out;
@@ -4368,11 +4372,22 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
4368 * we couldn't try to create block if create flag is zero 4372 * we couldn't try to create block if create flag is zero
4369 */ 4373 */
4370 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 4374 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
4375 ext4_lblk_t hole_start, hole_len;
4376
4377 hole_start = map->m_lblk;
4378 hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
4371 /* 4379 /*
4372 * put just found gap into cache to speed up 4380 * put just found gap into cache to speed up
4373 * subsequent requests 4381 * subsequent requests
4374 */ 4382 */
4375 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); 4383 ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
4384
4385 /* Update hole_len to reflect hole size after map->m_lblk */
4386 if (hole_start != map->m_lblk)
4387 hole_len -= map->m_lblk - hole_start;
4388 map->m_pblk = 0;
4389 map->m_len = min_t(unsigned int, map->m_len, hole_len);
4390
4376 goto out2; 4391 goto out2;
4377 } 4392 }
4378 4393
@@ -4482,15 +4497,6 @@ got_allocated_blocks:
4482 if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ 4497 if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
4483 ext4_ext_mark_unwritten(&newex); 4498 ext4_ext_mark_unwritten(&newex);
4484 map->m_flags |= EXT4_MAP_UNWRITTEN; 4499 map->m_flags |= EXT4_MAP_UNWRITTEN;
4485 /*
4486 * io_end structure was created for every IO write to an
4487 * unwritten extent. To avoid unnecessary conversion,
4488 * here we flag the IO that really needs the conversion.
4489 * For non asycn direct IO case, flag the inode state
4490 * that we need to perform conversion when IO is done.
4491 */
4492 if (flags & EXT4_GET_BLOCKS_PRE_IO)
4493 set_unwritten = 1;
4494 } 4500 }
4495 4501
4496 err = 0; 4502 err = 0;
@@ -4501,14 +4507,6 @@ got_allocated_blocks:
4501 err = ext4_ext_insert_extent(handle, inode, &path, 4507 err = ext4_ext_insert_extent(handle, inode, &path,
4502 &newex, flags); 4508 &newex, flags);
4503 4509
4504 if (!err && set_unwritten) {
4505 if (io)
4506 ext4_set_io_unwritten_flag(inode, io);
4507 else
4508 ext4_set_inode_state(inode,
4509 EXT4_STATE_DIO_UNWRITTEN);
4510 }
4511
4512 if (err && free_on_err) { 4510 if (err && free_on_err) {
4513 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? 4511 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
4514 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; 4512 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ac748b3af1c1..e38b987ac7f5 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -823,8 +823,8 @@ out:
 		es->es_lblk = es1->es_lblk;
 		es->es_len = es1->es_len;
 		es->es_pblk = es1->es_pblk;
-		if (!ext4_es_is_referenced(es))
-			ext4_es_set_referenced(es);
+		if (!ext4_es_is_referenced(es1))
+			ext4_es_set_referenced(es1);
 		stats->es_stats_cache_hits++;
 	} else {
 		stats->es_stats_cache_misses++;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4cd318f31cbe..6659e216385e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -93,31 +93,29 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
93{ 93{
94 struct file *file = iocb->ki_filp; 94 struct file *file = iocb->ki_filp;
95 struct inode *inode = file_inode(iocb->ki_filp); 95 struct inode *inode = file_inode(iocb->ki_filp);
96 struct mutex *aio_mutex = NULL;
97 struct blk_plug plug; 96 struct blk_plug plug;
98 int o_direct = iocb->ki_flags & IOCB_DIRECT; 97 int o_direct = iocb->ki_flags & IOCB_DIRECT;
98 int unaligned_aio = 0;
99 int overwrite = 0; 99 int overwrite = 0;
100 ssize_t ret; 100 ssize_t ret;
101 101
102 inode_lock(inode);
103 ret = generic_write_checks(iocb, from);
104 if (ret <= 0)
105 goto out;
106
102 /* 107 /*
103 * Unaligned direct AIO must be serialized; see comment above 108 * Unaligned direct AIO must be serialized among each other as zeroing
104 * In the case of O_APPEND, assume that we must always serialize 109 * of partial blocks of two competing unaligned AIOs can result in data
110 * corruption.
105 */ 111 */
106 if (o_direct && 112 if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
107 ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
108 !is_sync_kiocb(iocb) && 113 !is_sync_kiocb(iocb) &&
109 (iocb->ki_flags & IOCB_APPEND || 114 ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
110 ext4_unaligned_aio(inode, from, iocb->ki_pos))) { 115 unaligned_aio = 1;
111 aio_mutex = ext4_aio_mutex(inode);
112 mutex_lock(aio_mutex);
113 ext4_unwritten_wait(inode); 116 ext4_unwritten_wait(inode);
114 } 117 }
115 118
116 inode_lock(inode);
117 ret = generic_write_checks(iocb, from);
118 if (ret <= 0)
119 goto out;
120
121 /* 119 /*
122 * If we have encountered a bitmap-format file, the size limit 120 * If we have encountered a bitmap-format file, the size limit
123 * is smaller than s_maxbytes, which is for extent-mapped files. 121 * is smaller than s_maxbytes, which is for extent-mapped files.
@@ -139,7 +137,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
139 blk_start_plug(&plug); 137 blk_start_plug(&plug);
140 138
141 /* check whether we do a DIO overwrite or not */ 139 /* check whether we do a DIO overwrite or not */
142 if (ext4_should_dioread_nolock(inode) && !aio_mutex && 140 if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
143 !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { 141 !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
144 struct ext4_map_blocks map; 142 struct ext4_map_blocks map;
145 unsigned int blkbits = inode->i_blkbits; 143 unsigned int blkbits = inode->i_blkbits;
@@ -181,14 +179,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
181 if (o_direct) 179 if (o_direct)
182 blk_finish_plug(&plug); 180 blk_finish_plug(&plug);
183 181
184 if (aio_mutex)
185 mutex_unlock(aio_mutex);
186 return ret; 182 return ret;
187 183
188out: 184out:
189 inode_unlock(inode); 185 inode_unlock(inode);
190 if (aio_mutex)
191 mutex_unlock(aio_mutex);
192 return ret; 186 return ret;
193} 187}
194 188
@@ -417,7 +411,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
417 */ 411 */
418static int ext4_find_unwritten_pgoff(struct inode *inode, 412static int ext4_find_unwritten_pgoff(struct inode *inode,
419 int whence, 413 int whence,
420 struct ext4_map_blocks *map, 414 ext4_lblk_t end_blk,
421 loff_t *offset) 415 loff_t *offset)
422{ 416{
423 struct pagevec pvec; 417 struct pagevec pvec;
@@ -432,7 +426,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
432 blkbits = inode->i_sb->s_blocksize_bits; 426 blkbits = inode->i_sb->s_blocksize_bits;
433 startoff = *offset; 427 startoff = *offset;
434 lastoff = startoff; 428 lastoff = startoff;
435 endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; 429 endoff = (loff_t)end_blk << blkbits;
436 430
437 index = startoff >> PAGE_CACHE_SHIFT; 431 index = startoff >> PAGE_CACHE_SHIFT;
438 end = endoff >> PAGE_CACHE_SHIFT; 432 end = endoff >> PAGE_CACHE_SHIFT;
@@ -550,12 +544,11 @@ out:
550static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) 544static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
551{ 545{
552 struct inode *inode = file->f_mapping->host; 546 struct inode *inode = file->f_mapping->host;
553 struct ext4_map_blocks map;
554 struct extent_status es; 547 struct extent_status es;
555 ext4_lblk_t start, last, end; 548 ext4_lblk_t start, last, end;
556 loff_t dataoff, isize; 549 loff_t dataoff, isize;
557 int blkbits; 550 int blkbits;
558 int ret = 0; 551 int ret;
559 552
560 inode_lock(inode); 553 inode_lock(inode);
561 554
@@ -572,41 +565,32 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
572 dataoff = offset; 565 dataoff = offset;
573 566
574 do { 567 do {
575 map.m_lblk = last; 568 ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
576 map.m_len = end - last + 1; 569 if (ret <= 0) {
577 ret = ext4_map_blocks(NULL, inode, &map, 0); 570 /* No extent found -> no data */
578 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 571 if (ret == 0)
579 if (last != start) 572 ret = -ENXIO;
580 dataoff = (loff_t)last << blkbits; 573 inode_unlock(inode);
581 break; 574 return ret;
582 } 575 }
583 576
584 /* 577 last = es.es_lblk;
585 * If there is a delay extent at this offset, 578 if (last != start)
586 * it will be as a data. 579 dataoff = (loff_t)last << blkbits;
587 */ 580 if (!ext4_es_is_unwritten(&es))
588 ext4_es_find_delayed_extent_range(inode, last, last, &es);
589 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
590 if (last != start)
591 dataoff = (loff_t)last << blkbits;
592 break; 581 break;
593 }
594 582
595 /* 583 /*
596 * If there is a unwritten extent at this offset, 584 * If there is a unwritten extent at this offset,
597 * it will be as a data or a hole according to page 585 * it will be as a data or a hole according to page
598 * cache that has data or not. 586 * cache that has data or not.
599 */ 587 */
600 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 588 if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
601 int unwritten; 589 es.es_lblk + es.es_len, &dataoff))
602 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, 590 break;
603 &map, &dataoff); 591 last += es.es_len;
604 if (unwritten)
605 break;
606 }
607
608 last++;
609 dataoff = (loff_t)last << blkbits; 592 dataoff = (loff_t)last << blkbits;
593 cond_resched();
610 } while (last <= end); 594 } while (last <= end);
611 595
612 inode_unlock(inode); 596 inode_unlock(inode);
@@ -623,12 +607,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
623static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) 607static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
624{ 608{
625 struct inode *inode = file->f_mapping->host; 609 struct inode *inode = file->f_mapping->host;
626 struct ext4_map_blocks map;
627 struct extent_status es; 610 struct extent_status es;
628 ext4_lblk_t start, last, end; 611 ext4_lblk_t start, last, end;
629 loff_t holeoff, isize; 612 loff_t holeoff, isize;
630 int blkbits; 613 int blkbits;
631 int ret = 0; 614 int ret;
632 615
633 inode_lock(inode); 616 inode_lock(inode);
634 617
@@ -645,44 +628,30 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
645 holeoff = offset; 628 holeoff = offset;
646 629
647 do { 630 do {
648 map.m_lblk = last; 631 ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
649 map.m_len = end - last + 1; 632 if (ret < 0) {
650 ret = ext4_map_blocks(NULL, inode, &map, 0); 633 inode_unlock(inode);
651 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 634 return ret;
652 last += ret;
653 holeoff = (loff_t)last << blkbits;
654 continue;
655 } 635 }
656 636 /* Found a hole? */
657 /* 637 if (ret == 0 || es.es_lblk > last) {
658 * If there is a delay extent at this offset, 638 if (last != start)
659 * we will skip this extent. 639 holeoff = (loff_t)last << blkbits;
660 */ 640 break;
661 ext4_es_find_delayed_extent_range(inode, last, last, &es);
662 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
663 last = es.es_lblk + es.es_len;
664 holeoff = (loff_t)last << blkbits;
665 continue;
666 } 641 }
667
668 /* 642 /*
669 * If there is a unwritten extent at this offset, 643 * If there is a unwritten extent at this offset,
670 * it will be as a data or a hole according to page 644 * it will be as a data or a hole according to page
671 * cache that has data or not. 645 * cache that has data or not.
672 */ 646 */
673 if (map.m_flags & EXT4_MAP_UNWRITTEN) { 647 if (ext4_es_is_unwritten(&es) &&
674 int unwritten; 648 ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
675 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, 649 last + es.es_len, &holeoff))
676 &map, &holeoff); 650 break;
677 if (!unwritten) {
678 last += ret;
679 holeoff = (loff_t)last << blkbits;
680 continue;
681 }
682 }
683 651
684 /* find a hole */ 652 last += es.es_len;
685 break; 653 holeoff = (loff_t)last << blkbits;
654 cond_resched();
686 } while (last <= end); 655 } while (last <= end);
687 656
688 inode_unlock(inode); 657 inode_unlock(inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index acc0ad56bf2f..237b877d316d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -787,7 +787,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 	sbi = EXT4_SB(sb);
 
 	/*
-	 * Initalize owners and quota early so that we don't have to account
+	 * Initialize owners and quota early so that we don't have to account
 	 * for quota initialization worst case in standard inode creating
 	 * transaction
 	 */
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 355ef9c36c87..3027fa681de5 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -555,8 +555,23 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
555 goto got_it; 555 goto got_it;
556 } 556 }
557 557
558 /* Next simple case - plain lookup or failed read of indirect block */ 558 /* Next simple case - plain lookup failed */
559 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) 559 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
560 unsigned epb = inode->i_sb->s_blocksize / sizeof(u32);
561 int i;
562
563 /* Count number blocks in a subtree under 'partial' */
564 count = 1;
565 for (i = 0; partial + i != chain + depth - 1; i++)
566 count *= epb;
567 /* Fill in size of a hole we found */
568 map->m_pblk = 0;
569 map->m_len = min_t(unsigned int, map->m_len, count);
570 goto cleanup;
571 }
572
573 /* Failed read of indirect block */
574 if (err == -EIO)
560 goto cleanup; 575 goto cleanup;
561 576
562 /* 577 /*
@@ -693,21 +708,21 @@ retry:
693 } 708 }
694 if (IS_DAX(inode)) 709 if (IS_DAX(inode))
695 ret = dax_do_io(iocb, inode, iter, offset, 710 ret = dax_do_io(iocb, inode, iter, offset,
696 ext4_get_block, NULL, 0); 711 ext4_dio_get_block, NULL, 0);
697 else 712 else
698 ret = __blockdev_direct_IO(iocb, inode, 713 ret = __blockdev_direct_IO(iocb, inode,
699 inode->i_sb->s_bdev, iter, 714 inode->i_sb->s_bdev, iter,
700 offset, ext4_get_block, NULL, 715 offset, ext4_dio_get_block,
701 NULL, 0); 716 NULL, NULL, 0);
702 inode_dio_end(inode); 717 inode_dio_end(inode);
703 } else { 718 } else {
704locked: 719locked:
705 if (IS_DAX(inode)) 720 if (IS_DAX(inode))
706 ret = dax_do_io(iocb, inode, iter, offset, 721 ret = dax_do_io(iocb, inode, iter, offset,
707 ext4_get_block, NULL, DIO_LOCKING); 722 ext4_dio_get_block, NULL, DIO_LOCKING);
708 else 723 else
709 ret = blockdev_direct_IO(iocb, inode, iter, offset, 724 ret = blockdev_direct_IO(iocb, inode, iter, offset,
710 ext4_get_block); 725 ext4_dio_get_block);
711 726
712 if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { 727 if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
713 loff_t isize = i_size_read(inode); 728 loff_t isize = i_size_read(inode);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index dfe3b9bafc0d..7cbdd3752ba5 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -581,9 +581,10 @@ retry:
 	if (ret)
 		goto out;
 
-	if (ext4_should_dioread_nolock(inode))
-		ret = __block_write_begin(page, from, to, ext4_get_block_write);
-	else
+	if (ext4_should_dioread_nolock(inode)) {
+		ret = __block_write_begin(page, from, to,
+					  ext4_get_block_unwritten);
+	} else
 		ret = __block_write_begin(page, from, to, ext4_get_block);
 
 	if (!ret && ext4_should_journal_data(inode)) {
@@ -1696,7 +1697,6 @@ int ext4_delete_inline_entry(handle_t *handle,
 	if (err)
 		goto out;
 
-	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 	err = ext4_mark_inode_dirty(handle, dir);
 	if (unlikely(err))
 		goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index aee960b1af34..b2e9576450eb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -216,7 +216,6 @@ void ext4_evict_inode(struct inode *inode)
216 } 216 }
217 truncate_inode_pages_final(&inode->i_data); 217 truncate_inode_pages_final(&inode->i_data);
218 218
219 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
220 goto no_delete; 219 goto no_delete;
221 } 220 }
222 221
@@ -228,8 +227,6 @@ void ext4_evict_inode(struct inode *inode)
228 ext4_begin_ordered_truncate(inode, 0); 227 ext4_begin_ordered_truncate(inode, 0);
229 truncate_inode_pages_final(&inode->i_data); 228 truncate_inode_pages_final(&inode->i_data);
230 229
231 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
232
233 /* 230 /*
234 * Protect us against freezing - iput() caller didn't have to have any 231 * Protect us against freezing - iput() caller didn't have to have any
235 * protection against it 232 * protection against it
@@ -458,13 +455,13 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
458 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping 455 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
459 * based files 456 * based files
460 * 457 *
461 * On success, it returns the number of blocks being mapped or allocated. 458 * On success, it returns the number of blocks being mapped or allocated. if
462 * if create==0 and the blocks are pre-allocated and unwritten block, 459 * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
463 * the result buffer head is unmapped. If the create ==1, it will make sure 460 * is marked as unwritten. If the create == 1, it will mark @map as mapped.
464 * the buffer head is mapped.
465 * 461 *
466 * It returns 0 if plain look up failed (blocks have not been allocated), in 462 * It returns 0 if plain look up failed (blocks have not been allocated), in
467 * that case, buffer head is unmapped 463 * that case, @map is returned as unmapped but we still do fill map->m_len to
464 * indicate the length of a hole starting at map->m_lblk.
468 * 465 *
469 * It returns the error in case of allocation failure. 466 * It returns the error in case of allocation failure.
470 */ 467 */
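
A minimal caller sketch of the convention described in the rewritten comment above (lblk and max_blocks are hypothetical; with create == 0 a zero return now also reports how long the hole at map.m_lblk is):

    struct ext4_map_blocks map = { .m_lblk = lblk, .m_len = max_blocks };
    int ret = ext4_map_blocks(NULL, inode, &map, 0);

    if (ret < 0) {
            /* lookup error */
    } else if (ret > 0) {
            /* ret blocks mapped at map.m_pblk, flags in map.m_flags */
    } else {
            /* hole: map.m_len unmapped blocks start at map.m_lblk, so the
             * next candidate for data is map.m_lblk + map.m_len */
    }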
@@ -507,6 +504,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
507 retval = map->m_len; 504 retval = map->m_len;
508 map->m_len = retval; 505 map->m_len = retval;
509 } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { 506 } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
507 map->m_pblk = 0;
508 retval = es.es_len - (map->m_lblk - es.es_lblk);
509 if (retval > map->m_len)
510 retval = map->m_len;
511 map->m_len = retval;
510 retval = 0; 512 retval = 0;
511 } else { 513 } else {
512 BUG_ON(1); 514 BUG_ON(1);
@@ -714,16 +716,11 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
714 cmpxchg(&bh->b_state, old_state, new_state) != old_state)); 716 cmpxchg(&bh->b_state, old_state, new_state) != old_state));
715} 717}
716 718
717/* Maximum number of blocks we map for direct IO at once. */
718#define DIO_MAX_BLOCKS 4096
719
720static int _ext4_get_block(struct inode *inode, sector_t iblock, 719static int _ext4_get_block(struct inode *inode, sector_t iblock,
721 struct buffer_head *bh, int flags) 720 struct buffer_head *bh, int flags)
722{ 721{
723 handle_t *handle = ext4_journal_current_handle();
724 struct ext4_map_blocks map; 722 struct ext4_map_blocks map;
725 int ret = 0, started = 0; 723 int ret = 0;
726 int dio_credits;
727 724
728 if (ext4_has_inline_data(inode)) 725 if (ext4_has_inline_data(inode))
729 return -ERANGE; 726 return -ERANGE;
@@ -731,33 +728,14 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
731 map.m_lblk = iblock; 728 map.m_lblk = iblock;
732 map.m_len = bh->b_size >> inode->i_blkbits; 729 map.m_len = bh->b_size >> inode->i_blkbits;
733 730
734 if (flags && !handle) { 731 ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map,
735 /* Direct IO write... */ 732 flags);
736 if (map.m_len > DIO_MAX_BLOCKS)
737 map.m_len = DIO_MAX_BLOCKS;
738 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
739 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
740 dio_credits);
741 if (IS_ERR(handle)) {
742 ret = PTR_ERR(handle);
743 return ret;
744 }
745 started = 1;
746 }
747
748 ret = ext4_map_blocks(handle, inode, &map, flags);
749 if (ret > 0) { 733 if (ret > 0) {
750 ext4_io_end_t *io_end = ext4_inode_aio(inode);
751
752 map_bh(bh, inode->i_sb, map.m_pblk); 734 map_bh(bh, inode->i_sb, map.m_pblk);
753 ext4_update_bh_state(bh, map.m_flags); 735 ext4_update_bh_state(bh, map.m_flags);
754 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
755 set_buffer_defer_completion(bh);
756 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 736 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
757 ret = 0; 737 ret = 0;
758 } 738 }
759 if (started)
760 ext4_journal_stop(handle);
761 return ret; 739 return ret;
762} 740}
763 741
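
The ext4_update_bh_state() call that replaced the old direct b_state assignment updates only the mapping-related bits without taking a lock; a simplified sketch of that pattern (the function name below is made up, and the real helper also special-cases buffer heads without a page) is:

    static void update_bh_map_bits(struct buffer_head *bh, unsigned long flags)
    {
            unsigned long old_state, new_state;

            flags &= EXT4_MAP_FLAGS;        /* only touch the mapping bits */
            do {
                    old_state = READ_ONCE(bh->b_state);
                    new_state = (old_state & ~EXT4_MAP_FLAGS) | flags;
            } while (cmpxchg(&bh->b_state, old_state, new_state) != old_state);
    }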
@@ -769,6 +747,155 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
769} 747}
770 748
771/* 749/*
750 * Get block function used when preparing for buffered write if we require
751 * creating an unwritten extent if blocks haven't been allocated. The extent
752 * will be converted to written after the IO is complete.
753 */
754int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
755 struct buffer_head *bh_result, int create)
756{
757 ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
758 inode->i_ino, create);
759 return _ext4_get_block(inode, iblock, bh_result,
760 EXT4_GET_BLOCKS_IO_CREATE_EXT);
761}
762
763/* Maximum number of blocks we map for direct IO at once. */
764#define DIO_MAX_BLOCKS 4096
765
766static handle_t *start_dio_trans(struct inode *inode,
767 struct buffer_head *bh_result)
768{
769 int dio_credits;
770
771 /* Trim mapping request to maximum we can map at once for DIO */
772 if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
773 bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
774 dio_credits = ext4_chunk_trans_blocks(inode,
775 bh_result->b_size >> inode->i_blkbits);
776 return ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
777}
778
779/* Get block function for DIO reads and writes to inodes without extents */
780int ext4_dio_get_block(struct inode *inode, sector_t iblock,
781 struct buffer_head *bh, int create)
782{
783 handle_t *handle;
784 int ret;
785
786 /* We don't expect handle for direct IO */
787 WARN_ON_ONCE(ext4_journal_current_handle());
788
789 if (create) {
790 handle = start_dio_trans(inode, bh);
791 if (IS_ERR(handle))
792 return PTR_ERR(handle);
793 }
794 ret = _ext4_get_block(inode, iblock, bh,
795 create ? EXT4_GET_BLOCKS_CREATE : 0);
796 if (create)
797 ext4_journal_stop(handle);
798 return ret;
799}
800
801/*
802 * Get block function for AIO DIO writes when we create unwritten extent if
803 * blocks are not allocated yet. The extent will be converted to written
804 * after IO is complete.
805 */
806static int ext4_dio_get_block_unwritten_async(struct inode *inode,
807 sector_t iblock, struct buffer_head *bh_result, int create)
808{
809 handle_t *handle;
810 int ret;
811
812 /* We don't expect handle for direct IO */
813 WARN_ON_ONCE(ext4_journal_current_handle());
814
815 handle = start_dio_trans(inode, bh_result);
816 if (IS_ERR(handle))
817 return PTR_ERR(handle);
818 ret = _ext4_get_block(inode, iblock, bh_result,
819 EXT4_GET_BLOCKS_IO_CREATE_EXT);
820 ext4_journal_stop(handle);
821
822 /*
823 * When doing DIO using unwritten extents, we need io_end to convert
824 * unwritten extents to written on IO completion. We allocate io_end
825 * once we spot unwritten extent and store it in b_private. Generic
826 * DIO code keeps b_private set and furthermore passes the value to
827 * our completion callback in 'private' argument.
828 */
829 if (!ret && buffer_unwritten(bh_result)) {
830 if (!bh_result->b_private) {
831 ext4_io_end_t *io_end;
832
833 io_end = ext4_init_io_end(inode, GFP_KERNEL);
834 if (!io_end)
835 return -ENOMEM;
836 bh_result->b_private = io_end;
837 ext4_set_io_unwritten_flag(inode, io_end);
838 }
839 set_buffer_defer_completion(bh_result);
840 }
841
842 return ret;
843}
844
845/*
846 * Get block function for non-AIO DIO writes when we create unwritten extent if
847 * blocks are not allocated yet. The extent will be converted to written
848 * after IO is complete from ext4_ext_direct_IO() function.
849 */
850static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
851 sector_t iblock, struct buffer_head *bh_result, int create)
852{
853 handle_t *handle;
854 int ret;
855
856 /* We don't expect handle for direct IO */
857 WARN_ON_ONCE(ext4_journal_current_handle());
858
859 handle = start_dio_trans(inode, bh_result);
860 if (IS_ERR(handle))
861 return PTR_ERR(handle);
862 ret = _ext4_get_block(inode, iblock, bh_result,
863 EXT4_GET_BLOCKS_IO_CREATE_EXT);
864 ext4_journal_stop(handle);
865
866 /*
867 * Mark inode as having pending DIO writes to unwritten extents.
868 * ext4_ext_direct_IO() checks this flag and converts extents to
869 * written.
870 */
871 if (!ret && buffer_unwritten(bh_result))
872 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
873
874 return ret;
875}
876
877static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
878 struct buffer_head *bh_result, int create)
879{
880 int ret;
881
882 ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
883 inode->i_ino, create);
884 /* We don't expect handle for direct IO */
885 WARN_ON_ONCE(ext4_journal_current_handle());
886
887 ret = _ext4_get_block(inode, iblock, bh_result, 0);
888 /*
889 * Blocks should have been preallocated! ext4_file_write_iter() checks
890 * that.
891 */
892 WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
893
894 return ret;
895}
896
897
898/*
772 * `handle' can be NULL if create is zero 899 * `handle' can be NULL if create is zero
773 */ 900 */
774struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 901struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
@@ -1079,13 +1206,14 @@ retry_journal:
1079#ifdef CONFIG_EXT4_FS_ENCRYPTION 1206#ifdef CONFIG_EXT4_FS_ENCRYPTION
1080 if (ext4_should_dioread_nolock(inode)) 1207 if (ext4_should_dioread_nolock(inode))
1081 ret = ext4_block_write_begin(page, pos, len, 1208 ret = ext4_block_write_begin(page, pos, len,
1082 ext4_get_block_write); 1209 ext4_get_block_unwritten);
1083 else 1210 else
1084 ret = ext4_block_write_begin(page, pos, len, 1211 ret = ext4_block_write_begin(page, pos, len,
1085 ext4_get_block); 1212 ext4_get_block);
1086#else 1213#else
1087 if (ext4_should_dioread_nolock(inode)) 1214 if (ext4_should_dioread_nolock(inode))
1088 ret = __block_write_begin(page, pos, len, ext4_get_block_write); 1215 ret = __block_write_begin(page, pos, len,
1216 ext4_get_block_unwritten);
1089 else 1217 else
1090 ret = __block_write_begin(page, pos, len, ext4_get_block); 1218 ret = __block_write_begin(page, pos, len, ext4_get_block);
1091#endif 1219#endif
@@ -3088,37 +3216,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3088 return try_to_free_buffers(page); 3216 return try_to_free_buffers(page);
3089} 3217}
3090 3218
3091/*
3092 * ext4_get_block used when preparing for a DIO write or buffer write.
3093 * We allocate an uinitialized extent if blocks haven't been allocated.
3094 * The extent will be converted to initialized after the IO is complete.
3095 */
3096int ext4_get_block_write(struct inode *inode, sector_t iblock,
3097 struct buffer_head *bh_result, int create)
3098{
3099 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3100 inode->i_ino, create);
3101 return _ext4_get_block(inode, iblock, bh_result,
3102 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3103}
3104
3105static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
3106 struct buffer_head *bh_result, int create)
3107{
3108 int ret;
3109
3110 ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n",
3111 inode->i_ino, create);
3112 ret = _ext4_get_block(inode, iblock, bh_result, 0);
3113 /*
3114 * Blocks should have been preallocated! ext4_file_write_iter() checks
3115 * that.
3116 */
3117 WARN_ON_ONCE(!buffer_mapped(bh_result));
3118
3119 return ret;
3120}
3121
3122#ifdef CONFIG_FS_DAX 3219#ifdef CONFIG_FS_DAX
3123int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, 3220int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
3124 struct buffer_head *bh_result, int create) 3221 struct buffer_head *bh_result, int create)
@@ -3179,13 +3276,12 @@ out:
3179 WARN_ON_ONCE(ret == 0 && create); 3276 WARN_ON_ONCE(ret == 0 && create);
3180 if (ret > 0) { 3277 if (ret > 0) {
3181 map_bh(bh_result, inode->i_sb, map.m_pblk); 3278 map_bh(bh_result, inode->i_sb, map.m_pblk);
3182 bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
3183 map.m_flags;
3184 /* 3279 /*
3185 * At least for now we have to clear BH_New so that DAX code 3280 * At least for now we have to clear BH_New so that DAX code
3186 * doesn't attempt to zero blocks again in a racy way. 3281 * doesn't attempt to zero blocks again in a racy way.
3187 */ 3282 */
3188 bh_result->b_state &= ~(1 << BH_New); 3283 map.m_flags &= ~EXT4_MAP_NEW;
3284 ext4_update_bh_state(bh_result, map.m_flags);
3189 bh_result->b_size = map.m_len << inode->i_blkbits; 3285 bh_result->b_size = map.m_len << inode->i_blkbits;
3190 ret = 0; 3286 ret = 0;
3191 } 3287 }
@@ -3196,7 +3292,7 @@ out:
3196static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, 3292static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3197 ssize_t size, void *private) 3293 ssize_t size, void *private)
3198{ 3294{
3199 ext4_io_end_t *io_end = iocb->private; 3295 ext4_io_end_t *io_end = private;
3200 3296
3201 /* if not async direct IO just return */ 3297 /* if not async direct IO just return */
3202 if (!io_end) 3298 if (!io_end)
@@ -3204,10 +3300,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3204 3300
3205 ext_debug("ext4_end_io_dio(): io_end 0x%p " 3301 ext_debug("ext4_end_io_dio(): io_end 0x%p "
3206 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", 3302 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
3207 iocb->private, io_end->inode->i_ino, iocb, offset, 3303 io_end, io_end->inode->i_ino, iocb, offset, size);
3208 size);
3209 3304
3210 iocb->private = NULL;
3211 io_end->offset = offset; 3305 io_end->offset = offset;
3212 io_end->size = size; 3306 io_end->size = size;
3213 ext4_put_io_end(io_end); 3307 ext4_put_io_end(io_end);
@@ -3243,7 +3337,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3243 get_block_t *get_block_func = NULL; 3337 get_block_t *get_block_func = NULL;
3244 int dio_flags = 0; 3338 int dio_flags = 0;
3245 loff_t final_size = offset + count; 3339 loff_t final_size = offset + count;
3246 ext4_io_end_t *io_end = NULL;
3247 3340
3248 /* Use the old path for reads and writes beyond i_size. */ 3341 /* Use the old path for reads and writes beyond i_size. */
3249 if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) 3342 if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
@@ -3268,16 +3361,17 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3268 /* 3361 /*
3269 * We could direct write to holes and fallocate. 3362 * We could direct write to holes and fallocate.
3270 * 3363 *
3271 * Allocated blocks to fill the hole are marked as 3364 * Allocated blocks to fill the hole are marked as unwritten to prevent
3272 * unwritten to prevent parallel buffered read to expose 3365 * parallel buffered read to expose the stale data before DIO complete
3273 * the stale data before DIO complete the data IO. 3366 * the data IO.
3274 * 3367 *
3275 * As to previously fallocated extents, ext4 get_block will 3368 * As to previously fallocated extents, ext4 get_block will just simply
3276 * just simply mark the buffer mapped but still keep the 3369 * mark the buffer mapped but still keep the extents unwritten.
3277 * extents unwritten.
3278 * 3370 *
3279 * For non AIO case, we will convert those unwritten extents 3371 * For non AIO case, we will convert those unwritten extents to written
3280 * to written after return back from blockdev_direct_IO. 3372 * after return back from blockdev_direct_IO. That way we save us from
3373 * allocating io_end structure and also the overhead of offloading
3374 * the extent convertion to a workqueue.
3281 * 3375 *
3282 * For async DIO, the conversion needs to be deferred when the 3376 * For async DIO, the conversion needs to be deferred when the
3283 * IO is completed. The ext4 end_io callback function will be 3377 * IO is completed. The ext4 end_io callback function will be
@@ -3285,30 +3379,13 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3285 * case, we allocate an io_end structure to hook to the iocb. 3379 * case, we allocate an io_end structure to hook to the iocb.
3286 */ 3380 */
3287 iocb->private = NULL; 3381 iocb->private = NULL;
3288 if (overwrite) { 3382 if (overwrite)
3289 get_block_func = ext4_get_block_overwrite; 3383 get_block_func = ext4_dio_get_block_overwrite;
3384 else if (is_sync_kiocb(iocb)) {
3385 get_block_func = ext4_dio_get_block_unwritten_sync;
3386 dio_flags = DIO_LOCKING;
3290 } else { 3387 } else {
3291 ext4_inode_aio_set(inode, NULL); 3388 get_block_func = ext4_dio_get_block_unwritten_async;
3292 if (!is_sync_kiocb(iocb)) {
3293 io_end = ext4_init_io_end(inode, GFP_NOFS);
3294 if (!io_end) {
3295 ret = -ENOMEM;
3296 goto retake_lock;
3297 }
3298 /*
3299 * Grab reference for DIO. Will be dropped in
3300 * ext4_end_io_dio()
3301 */
3302 iocb->private = ext4_get_io_end(io_end);
3303 /*
3304 * we save the io structure for current async direct
3305 * IO, so that later ext4_map_blocks() could flag the
3306 * io structure whether there is a unwritten extents
3307 * needs to be converted when IO is completed.
3308 */
3309 ext4_inode_aio_set(inode, io_end);
3310 }
3311 get_block_func = ext4_get_block_write;
3312 dio_flags = DIO_LOCKING; 3389 dio_flags = DIO_LOCKING;
3313 } 3390 }
3314#ifdef CONFIG_EXT4_FS_ENCRYPTION 3391#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -3323,27 +3400,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3323 get_block_func, 3400 get_block_func,
3324 ext4_end_io_dio, NULL, dio_flags); 3401 ext4_end_io_dio, NULL, dio_flags);
3325 3402
3326 /*
3327 * Put our reference to io_end. This can free the io_end structure e.g.
3328 * in sync IO case or in case of error. It can even perform extent
3329 * conversion if all bios we submitted finished before we got here.
3330 * Note that in that case iocb->private can be already set to NULL
3331 * here.
3332 */
3333 if (io_end) {
3334 ext4_inode_aio_set(inode, NULL);
3335 ext4_put_io_end(io_end);
3336 /*
3337 * When no IO was submitted ext4_end_io_dio() was not
3338 * called so we have to put iocb's reference.
3339 */
3340 if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
3341 WARN_ON(iocb->private != io_end);
3342 WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
3343 ext4_put_io_end(io_end);
3344 iocb->private = NULL;
3345 }
3346 }
3347 if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3403 if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3348 EXT4_STATE_DIO_UNWRITTEN)) { 3404 EXT4_STATE_DIO_UNWRITTEN)) {
3349 int err; 3405 int err;
@@ -3358,7 +3414,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3358 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3414 ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3359 } 3415 }
3360 3416
3361retake_lock:
3362 if (iov_iter_rw(iter) == WRITE) 3417 if (iov_iter_rw(iter) == WRITE)
3363 inode_dio_end(inode); 3418 inode_dio_end(inode);
3364 /* take i_mutex locking again if we do a ovewrite dio */ 3419 /* take i_mutex locking again if we do a ovewrite dio */
@@ -5261,6 +5316,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5261 might_sleep(); 5316 might_sleep();
5262 trace_ext4_mark_inode_dirty(inode, _RET_IP_); 5317 trace_ext4_mark_inode_dirty(inode, _RET_IP_);
5263 err = ext4_reserve_inode_write(handle, inode, &iloc); 5318 err = ext4_reserve_inode_write(handle, inode, &iloc);
5319 if (err)
5320 return err;
5264 if (ext4_handle_valid(handle) && 5321 if (ext4_handle_valid(handle) &&
5265 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && 5322 EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
5266 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { 5323 !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
@@ -5291,9 +5348,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5291 } 5348 }
5292 } 5349 }
5293 } 5350 }
5294 if (!err) 5351 return ext4_mark_iloc_dirty(handle, inode, &iloc);
5295 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
5296 return err;
5297} 5352}
5298 5353
5299/* 5354/*
@@ -5502,7 +5557,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5502 unlock_page(page); 5557 unlock_page(page);
5503 /* OK, we need to fill the hole... */ 5558 /* OK, we need to fill the hole... */
5504 if (ext4_should_dioread_nolock(inode)) 5559 if (ext4_should_dioread_nolock(inode))
5505 get_block = ext4_get_block_write; 5560 get_block = ext4_get_block_unwritten;
5506 else 5561 else
5507 get_block = ext4_get_block; 5562 get_block = ext4_get_block;
5508retry_alloc: 5563retry_alloc:
@@ -5545,3 +5600,70 @@ int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
5545 5600
5546 return err; 5601 return err;
5547} 5602}
5603
5604/*
5605 * Find the first extent at or after @lblk in an inode that is not a hole.
5606 * Search for @map_len blocks at most. The extent is returned in @result.
5607 *
5608 * The function returns 1 if we found an extent. The function returns 0 in
5609 * case there is no extent at or after @lblk and in that case also sets
5610 * @result->es_len to 0. In case of error, the error code is returned.
5611 */
5612int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
5613 unsigned int map_len, struct extent_status *result)
5614{
5615 struct ext4_map_blocks map;
5616 struct extent_status es = {};
5617 int ret;
5618
5619 map.m_lblk = lblk;
5620 map.m_len = map_len;
5621
5622 /*
5623 * For non-extent based files this loop may iterate several times since
5624 * we do not determine full hole size.
5625 */
5626 while (map.m_len > 0) {
5627 ret = ext4_map_blocks(NULL, inode, &map, 0);
5628 if (ret < 0)
5629 return ret;
5630 /* There's extent covering m_lblk? Just return it. */
5631 if (ret > 0) {
5632 int status;
5633
5634 ext4_es_store_pblock(result, map.m_pblk);
5635 result->es_lblk = map.m_lblk;
5636 result->es_len = map.m_len;
5637 if (map.m_flags & EXT4_MAP_UNWRITTEN)
5638 status = EXTENT_STATUS_UNWRITTEN;
5639 else
5640 status = EXTENT_STATUS_WRITTEN;
5641 ext4_es_store_status(result, status);
5642 return 1;
5643 }
5644 ext4_es_find_delayed_extent_range(inode, map.m_lblk,
5645 map.m_lblk + map.m_len - 1,
5646 &es);
5647 /* Is delalloc data before next block in extent tree? */
5648 if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
5649 ext4_lblk_t offset = 0;
5650
5651 if (es.es_lblk < lblk)
5652 offset = lblk - es.es_lblk;
5653 result->es_lblk = es.es_lblk + offset;
5654 ext4_es_store_pblock(result,
5655 ext4_es_pblock(&es) + offset);
5656 result->es_len = es.es_len - offset;
5657 ext4_es_store_status(result, ext4_es_status(&es));
5658
5659 return 1;
5660 }
5661 /* There's a hole at m_lblk, advance us after it */
5662 map.m_lblk += map.m_len;
5663 map_len -= map.m_len;
5664 map.m_len = map_len;
5665 cond_resched();
5666 }
5667 result->es_len = 0;
5668 return 0;
5669}
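
ext4_get_next_extent() above is the workhorse of the reworked SEEK_DATA path: it hands back the next written, unwritten or delayed extent instead of probing block by block. From user space the feature is reached through lseek(2); a minimal probe program (illustrative only):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            int fd;
            off_t data, hole;

            if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                    return 1;
            data = lseek(fd, 0, SEEK_DATA);    /* first data offset at/after 0 */
            if (data < 0)                      /* ENXIO: nothing but hole left */
                    return 1;
            hole = lseek(fd, data, SEEK_HOLE); /* end of that data region */
            printf("data at %lld, next hole at %lld\n",
                   (long long)data, (long long)hole);
            return 0;
    }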
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4424b7bf8ac6..50e05df28f66 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -11,7 +11,7 @@
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public Licens 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
17 */ 17 */
@@ -815,7 +815,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
815 * for this page; do not hold this lock when calling this routine! 815 * for this page; do not hold this lock when calling this routine!
816 */ 816 */
817 817
818static int ext4_mb_init_cache(struct page *page, char *incore) 818static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
819{ 819{
820 ext4_group_t ngroups; 820 ext4_group_t ngroups;
821 int blocksize; 821 int blocksize;
@@ -848,7 +848,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
848 /* allocate buffer_heads to read bitmaps */ 848 /* allocate buffer_heads to read bitmaps */
849 if (groups_per_page > 1) { 849 if (groups_per_page > 1) {
850 i = sizeof(struct buffer_head *) * groups_per_page; 850 i = sizeof(struct buffer_head *) * groups_per_page;
851 bh = kzalloc(i, GFP_NOFS); 851 bh = kzalloc(i, gfp);
852 if (bh == NULL) { 852 if (bh == NULL) {
853 err = -ENOMEM; 853 err = -ENOMEM;
854 goto out; 854 goto out;
@@ -983,7 +983,7 @@ out:
983 * are on the same page e4b->bd_buddy_page is NULL and return value is 0. 983 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
984 */ 984 */
985static int ext4_mb_get_buddy_page_lock(struct super_block *sb, 985static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
986 ext4_group_t group, struct ext4_buddy *e4b) 986 ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
987{ 987{
988 struct inode *inode = EXT4_SB(sb)->s_buddy_cache; 988 struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
989 int block, pnum, poff; 989 int block, pnum, poff;
@@ -1002,7 +1002,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
1002 block = group * 2; 1002 block = group * 2;
1003 pnum = block / blocks_per_page; 1003 pnum = block / blocks_per_page;
1004 poff = block % blocks_per_page; 1004 poff = block % blocks_per_page;
1005 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1005 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1006 if (!page) 1006 if (!page)
1007 return -ENOMEM; 1007 return -ENOMEM;
1008 BUG_ON(page->mapping != inode->i_mapping); 1008 BUG_ON(page->mapping != inode->i_mapping);
@@ -1016,7 +1016,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
1016 1016
1017 block++; 1017 block++;
1018 pnum = block / blocks_per_page; 1018 pnum = block / blocks_per_page;
1019 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1019 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1020 if (!page) 1020 if (!page)
1021 return -ENOMEM; 1021 return -ENOMEM;
1022 BUG_ON(page->mapping != inode->i_mapping); 1022 BUG_ON(page->mapping != inode->i_mapping);
@@ -1042,7 +1042,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1042 * calling this routine! 1042 * calling this routine!
1043 */ 1043 */
1044static noinline_for_stack 1044static noinline_for_stack
1045int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1045int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
1046{ 1046{
1047 1047
1048 struct ext4_group_info *this_grp; 1048 struct ext4_group_info *this_grp;
@@ -1062,7 +1062,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1062 * The call to ext4_mb_get_buddy_page_lock will mark the 1062 * The call to ext4_mb_get_buddy_page_lock will mark the
1063 * page accessed. 1063 * page accessed.
1064 */ 1064 */
1065 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); 1065 ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
1066 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { 1066 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1067 /* 1067 /*
1068 * somebody initialized the group 1068 * somebody initialized the group
@@ -1072,7 +1072,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1072 } 1072 }
1073 1073
1074 page = e4b.bd_bitmap_page; 1074 page = e4b.bd_bitmap_page;
1075 ret = ext4_mb_init_cache(page, NULL); 1075 ret = ext4_mb_init_cache(page, NULL, gfp);
1076 if (ret) 1076 if (ret)
1077 goto err; 1077 goto err;
1078 if (!PageUptodate(page)) { 1078 if (!PageUptodate(page)) {
@@ -1091,7 +1091,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1091 } 1091 }
1092 /* init buddy cache */ 1092 /* init buddy cache */
1093 page = e4b.bd_buddy_page; 1093 page = e4b.bd_buddy_page;
1094 ret = ext4_mb_init_cache(page, e4b.bd_bitmap); 1094 ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
1095 if (ret) 1095 if (ret)
1096 goto err; 1096 goto err;
1097 if (!PageUptodate(page)) { 1097 if (!PageUptodate(page)) {
@@ -1109,8 +1109,8 @@ err:
1109 * calling this routine! 1109 * calling this routine!
1110 */ 1110 */
1111static noinline_for_stack int 1111static noinline_for_stack int
1112ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1112ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
1113 struct ext4_buddy *e4b) 1113 struct ext4_buddy *e4b, gfp_t gfp)
1114{ 1114{
1115 int blocks_per_page; 1115 int blocks_per_page;
1116 int block; 1116 int block;
@@ -1140,7 +1140,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1140 * we need full data about the group 1140 * we need full data about the group
1141 * to make a good selection 1141 * to make a good selection
1142 */ 1142 */
1143 ret = ext4_mb_init_group(sb, group); 1143 ret = ext4_mb_init_group(sb, group, gfp);
1144 if (ret) 1144 if (ret)
1145 return ret; 1145 return ret;
1146 } 1146 }
@@ -1168,11 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1168 * wait for it to initialize. 1168 * wait for it to initialize.
1169 */ 1169 */
1170 page_cache_release(page); 1170 page_cache_release(page);
1171 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1171 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1172 if (page) { 1172 if (page) {
1173 BUG_ON(page->mapping != inode->i_mapping); 1173 BUG_ON(page->mapping != inode->i_mapping);
1174 if (!PageUptodate(page)) { 1174 if (!PageUptodate(page)) {
1175 ret = ext4_mb_init_cache(page, NULL); 1175 ret = ext4_mb_init_cache(page, NULL, gfp);
1176 if (ret) { 1176 if (ret) {
1177 unlock_page(page); 1177 unlock_page(page);
1178 goto err; 1178 goto err;
@@ -1204,11 +1204,12 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1204 if (page == NULL || !PageUptodate(page)) { 1204 if (page == NULL || !PageUptodate(page)) {
1205 if (page) 1205 if (page)
1206 page_cache_release(page); 1206 page_cache_release(page);
1207 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1207 page = find_or_create_page(inode->i_mapping, pnum, gfp);
1208 if (page) { 1208 if (page) {
1209 BUG_ON(page->mapping != inode->i_mapping); 1209 BUG_ON(page->mapping != inode->i_mapping);
1210 if (!PageUptodate(page)) { 1210 if (!PageUptodate(page)) {
1211 ret = ext4_mb_init_cache(page, e4b->bd_bitmap); 1211 ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
1212 gfp);
1212 if (ret) { 1213 if (ret) {
1213 unlock_page(page); 1214 unlock_page(page);
1214 goto err; 1215 goto err;
@@ -1247,6 +1248,12 @@ err:
1247 return ret; 1248 return ret;
1248} 1249}
1249 1250
1251static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1252 struct ext4_buddy *e4b)
1253{
1254 return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
1255}
1256
1250static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) 1257static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1251{ 1258{
1252 if (e4b->bd_bitmap_page) 1259 if (e4b->bd_bitmap_page)
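
Keeping ext4_mb_load_buddy() above as a one-line wrapper preserves the GFP_NOFS behaviour for all existing callers while letting the block-freeing path pass its own mask. In generic form the pattern is simply (all names below are invented for illustration):

    int load_thing_gfp(struct thing *t, gfp_t gfp);  /* new, does the work */

    static int load_thing(struct thing *t)
    {
            return load_thing_gfp(t, GFP_NOFS);      /* historical default */
    }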
@@ -2045,7 +2052,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
2045 2052
2046 /* We only do this if the grp has never been initialized */ 2053 /* We only do this if the grp has never been initialized */
2047 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 2054 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2048 int ret = ext4_mb_init_group(ac->ac_sb, group); 2055 int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
2049 if (ret) 2056 if (ret)
2050 return ret; 2057 return ret;
2051 } 2058 }
@@ -4695,16 +4702,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4695 } 4702 }
4696 4703
4697 /* 4704 /*
4698 * We need to make sure we don't reuse the freed block until
4699 * after the transaction is committed, which we can do by
4700 * treating the block as metadata, below. We make an
4701 * exception if the inode is to be written in writeback mode
4702 * since writeback mode has weak data consistency guarantees.
4703 */
4704 if (!ext4_should_writeback_data(inode))
4705 flags |= EXT4_FREE_BLOCKS_METADATA;
4706
4707 /*
4708 * If the extent to be freed does not begin on a cluster 4705 * If the extent to be freed does not begin on a cluster
4709 * boundary, we need to deal with partial clusters at the 4706 * boundary, we need to deal with partial clusters at the
4710 * beginning and end of the extent. Normally we will free 4707 * beginning and end of the extent. Normally we will free
@@ -4738,14 +4735,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4738 4735
4739 if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { 4736 if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
4740 int i; 4737 int i;
4738 int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
4741 4739
4742 for (i = 0; i < count; i++) { 4740 for (i = 0; i < count; i++) {
4743 cond_resched(); 4741 cond_resched();
4744 bh = sb_find_get_block(inode->i_sb, block + i); 4742 if (is_metadata)
4745 if (!bh) 4743 bh = sb_find_get_block(inode->i_sb, block + i);
4746 continue; 4744 ext4_forget(handle, is_metadata, inode, bh, block + i);
4747 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4748 inode, bh, block + i);
4749 } 4745 }
4750 } 4746 }
4751 4747
@@ -4815,16 +4811,23 @@ do_more:
4815#endif 4811#endif
4816 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); 4812 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
4817 4813
4818 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4814 /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
4815 err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
4816 GFP_NOFS|__GFP_NOFAIL);
4819 if (err) 4817 if (err)
4820 goto error_return; 4818 goto error_return;
4821 4819
4822 if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { 4820 /*
4821 * We need to make sure we don't reuse the freed block until after the
4822 * transaction is committed. We make an exception if the inode is to be
4823 * written in writeback mode since writeback mode has weak data
4824 * consistency guarantees.
4825 */
4826 if (ext4_handle_valid(handle) &&
4827 ((flags & EXT4_FREE_BLOCKS_METADATA) ||
4828 !ext4_should_writeback_data(inode))) {
4823 struct ext4_free_data *new_entry; 4829 struct ext4_free_data *new_entry;
4824 /* 4830 /*
4825 * blocks being freed are metadata. these blocks shouldn't
4826 * be used until this transaction is committed
4827 *
4828 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed 4831 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
4829 * to fail. 4832 * to fail.
4830 */ 4833 */
@@ -5217,7 +5220,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5217 grp = ext4_get_group_info(sb, group); 5220 grp = ext4_get_group_info(sb, group);
5218 /* We only do this if the grp has never been initialized */ 5221 /* We only do this if the grp has never been initialized */
5219 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 5222 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
5220 ret = ext4_mb_init_group(sb, group); 5223 ret = ext4_mb_init_group(sb, group, GFP_NOFS);
5221 if (ret) 5224 if (ret)
5222 break; 5225 break;
5223 } 5226 }
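
The GFP_NOFS | __GFP_NOFAIL mask introduced above exists because ext4_free_blocks() has no reasonable way to back out once it has committed to freeing an extent; the retained comment at the new_entry allocation states the same rule. A sketch of that allocation idiom, assuming the existing ext4_free_data_cachep slab cache:

    /* Sketch: allocation on a path that is not allowed to fail.
     * __GFP_NOFAIL keeps the allocator retrying instead of returning NULL;
     * GFP_NOFS avoids recursing into filesystem reclaim while fs locks
     * are held. */
    struct ext4_free_data *new_entry =
            kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS | __GFP_NOFAIL);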
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index d634e183b4d4..3ef1df6ae9ec 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -23,18 +23,6 @@
23#include "ext4.h" 23#include "ext4.h"
24 24
25/* 25/*
26 * with AGGRESSIVE_CHECK allocator runs consistency checks over
27 * structures. these checks slow things down a lot
28 */
29#define AGGRESSIVE_CHECK__
30
31/*
32 * with DOUBLE_CHECK defined mballoc creates persistent in-core
33 * bitmaps, maintains and uses them to check for double allocations
34 */
35#define DOUBLE_CHECK__
36
37/*
38 */ 26 */
39#ifdef CONFIG_EXT4_DEBUG 27#ifdef CONFIG_EXT4_DEBUG
40extern ushort ext4_mballoc_debug; 28extern ushort ext4_mballoc_debug;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a4651894cc33..364ea4d4a943 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -361,7 +361,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
361 * blocks. 361 * blocks.
362 * 362 *
363 * While converting to extents we need not 363 * While converting to extents we need not
364 * update the orignal inode i_blocks for extent blocks 364 * update the original inode i_blocks for extent blocks
365 * via quota APIs. The quota update happened via tmp_inode already. 365 * via quota APIs. The quota update happened via tmp_inode already.
366 */ 366 */
367 spin_lock(&inode->i_lock); 367 spin_lock(&inode->i_lock);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 0a512aa81bf7..24445275d330 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -91,21 +91,22 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
91 submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); 91 submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
92 wait_on_buffer(*bh); 92 wait_on_buffer(*bh);
93 if (!buffer_uptodate(*bh)) { 93 if (!buffer_uptodate(*bh)) {
94 brelse(*bh);
95 *bh = NULL;
96 ret = -EIO; 94 ret = -EIO;
97 goto warn_exit; 95 goto warn_exit;
98 } 96 }
99
100 mmp = (struct mmp_struct *)((*bh)->b_data); 97 mmp = (struct mmp_struct *)((*bh)->b_data);
101 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) 98 if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) {
102 ret = -EFSCORRUPTED; 99 ret = -EFSCORRUPTED;
103 else if (!ext4_mmp_csum_verify(sb, mmp)) 100 goto warn_exit;
101 }
102 if (!ext4_mmp_csum_verify(sb, mmp)) {
104 ret = -EFSBADCRC; 103 ret = -EFSBADCRC;
105 else 104 goto warn_exit;
106 return 0; 105 }
107 106 return 0;
108warn_exit: 107warn_exit:
108 brelse(*bh);
109 *bh = NULL;
109 ext4_warning(sb, "Error %d while reading MMP block %llu", 110 ext4_warning(sb, "Error %d while reading MMP block %llu",
110 ret, mmp_block); 111 ret, mmp_block);
111 return ret; 112 return ret;
@@ -181,15 +182,13 @@ static int kmmpd(void *data)
181 EXT4_FEATURE_INCOMPAT_MMP)) { 182 EXT4_FEATURE_INCOMPAT_MMP)) {
182 ext4_warning(sb, "kmmpd being stopped since MMP feature" 183 ext4_warning(sb, "kmmpd being stopped since MMP feature"
183 " has been disabled."); 184 " has been disabled.");
184 EXT4_SB(sb)->s_mmp_tsk = NULL; 185 goto exit_thread;
185 goto failed;
186 } 186 }
187 187
188 if (sb->s_flags & MS_RDONLY) { 188 if (sb->s_flags & MS_RDONLY) {
189 ext4_warning(sb, "kmmpd being stopped since filesystem " 189 ext4_warning(sb, "kmmpd being stopped since filesystem "
190 "has been remounted as readonly."); 190 "has been remounted as readonly.");
191 EXT4_SB(sb)->s_mmp_tsk = NULL; 191 goto exit_thread;
192 goto failed;
193 } 192 }
194 193
195 diff = jiffies - last_update_time; 194 diff = jiffies - last_update_time;
@@ -211,9 +210,7 @@ static int kmmpd(void *data)
211 if (retval) { 210 if (retval) {
212 ext4_error(sb, "error reading MMP data: %d", 211 ext4_error(sb, "error reading MMP data: %d",
213 retval); 212 retval);
214 213 goto exit_thread;
215 EXT4_SB(sb)->s_mmp_tsk = NULL;
216 goto failed;
217 } 214 }
218 215
219 mmp_check = (struct mmp_struct *)(bh_check->b_data); 216 mmp_check = (struct mmp_struct *)(bh_check->b_data);
@@ -225,7 +222,9 @@ static int kmmpd(void *data)
225 "The filesystem seems to have been" 222 "The filesystem seems to have been"
226 " multiply mounted."); 223 " multiply mounted.");
227 ext4_error(sb, "abort"); 224 ext4_error(sb, "abort");
228 goto failed; 225 put_bh(bh_check);
226 retval = -EBUSY;
227 goto exit_thread;
229 } 228 }
230 put_bh(bh_check); 229 put_bh(bh_check);
231 } 230 }
@@ -248,7 +247,8 @@ static int kmmpd(void *data)
248 247
249 retval = write_mmp_block(sb, bh); 248 retval = write_mmp_block(sb, bh);
250 249
251failed: 250exit_thread:
251 EXT4_SB(sb)->s_mmp_tsk = NULL;
252 kfree(data); 252 kfree(data);
253 brelse(bh); 253 brelse(bh);
254 return retval; 254 return retval;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 090b3498638e..349d7aa04fe7 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -128,9 +128,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
128 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); 128 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
129 WARN_ON(io_end->handle); 129 WARN_ON(io_end->handle);
130 130
131 if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
132 wake_up_all(ext4_ioend_wq(io_end->inode));
133
134 for (bio = io_end->bio; bio; bio = next_bio) { 131 for (bio = io_end->bio; bio; bio = next_bio) {
135 next_bio = bio->bi_private; 132 next_bio = bio->bi_private;
136 ext4_finish_bio(bio); 133 ext4_finish_bio(bio);
@@ -265,7 +262,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
265{ 262{
266 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); 263 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags);
267 if (io) { 264 if (io) {
268 atomic_inc(&EXT4_I(inode)->i_ioend_count);
269 io->inode = inode; 265 io->inode = inode;
270 INIT_LIST_HEAD(&io->list); 266 INIT_LIST_HEAD(&io->list);
271 atomic_set(&io->count, 1); 267 atomic_set(&io->count, 1);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3ed01ec011d7..99996e9a8f57 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -55,7 +55,6 @@
55 55
56static struct ext4_lazy_init *ext4_li_info; 56static struct ext4_lazy_init *ext4_li_info;
57static struct mutex ext4_li_mtx; 57static struct mutex ext4_li_mtx;
58static int ext4_mballoc_ready;
59static struct ratelimit_state ext4_mount_msg_ratelimit; 58static struct ratelimit_state ext4_mount_msg_ratelimit;
60 59
61static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 60static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
@@ -844,7 +843,6 @@ static void ext4_put_super(struct super_block *sb)
844 ext4_release_system_zone(sb); 843 ext4_release_system_zone(sb);
845 ext4_mb_release(sb); 844 ext4_mb_release(sb);
846 ext4_ext_release(sb); 845 ext4_ext_release(sb);
847 ext4_xattr_put_super(sb);
848 846
849 if (!(sb->s_flags & MS_RDONLY)) { 847 if (!(sb->s_flags & MS_RDONLY)) {
850 ext4_clear_feature_journal_needs_recovery(sb); 848 ext4_clear_feature_journal_needs_recovery(sb);
@@ -944,7 +942,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
944 spin_lock_init(&ei->i_completed_io_lock); 942 spin_lock_init(&ei->i_completed_io_lock);
945 ei->i_sync_tid = 0; 943 ei->i_sync_tid = 0;
946 ei->i_datasync_tid = 0; 944 ei->i_datasync_tid = 0;
947 atomic_set(&ei->i_ioend_count, 0);
948 atomic_set(&ei->i_unwritten, 0); 945 atomic_set(&ei->i_unwritten, 0);
949 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); 946 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
950#ifdef CONFIG_EXT4_FS_ENCRYPTION 947#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -1425,9 +1422,9 @@ static const struct mount_opts {
1425 {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, 1422 {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
1426 {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, 1423 {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
1427 {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, 1424 {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
1428 MOPT_NO_EXT2 | MOPT_SET}, 1425 MOPT_NO_EXT2},
1429 {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, 1426 {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
1430 MOPT_NO_EXT2 | MOPT_CLEAR}, 1427 MOPT_NO_EXT2},
1431 {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, 1428 {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
1432 {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, 1429 {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
1433 {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, 1430 {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
@@ -1705,6 +1702,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1705 ext4_msg(sb, KERN_INFO, "dax option not supported"); 1702 ext4_msg(sb, KERN_INFO, "dax option not supported");
1706 return -1; 1703 return -1;
1707#endif 1704#endif
1705 } else if (token == Opt_data_err_abort) {
1706 sbi->s_mount_opt |= m->mount_opt;
1707 } else if (token == Opt_data_err_ignore) {
1708 sbi->s_mount_opt &= ~m->mount_opt;
1708 } else { 1709 } else {
1709 if (!args->from) 1710 if (!args->from)
1710 arg = 1; 1711 arg = 1;
@@ -1914,6 +1915,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
1914 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); 1915 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
1915 if (nodefs || sbi->s_max_dir_size_kb) 1916 if (nodefs || sbi->s_max_dir_size_kb)
1916 SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); 1917 SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
1918 if (test_opt(sb, DATA_ERR_ABORT))
1919 SEQ_OPTS_PUTS("data_err=abort");
1917 1920
1918 ext4_show_quota_options(seq, sb); 1921 ext4_show_quota_options(seq, sb);
1919 return 0; 1922 return 0;
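
The two hunks above make data_err=abort and data_err=ignore flip the mount flag explicitly and report the non-default setting in /proc/mounts. A hypothetical way to exercise the option from C (device and mount point are made up):

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* data_err=abort: abort the journal if writeback of ordered-mode
             * file data fails, rather than only logging the error. */
            if (mount("/dev/vdb1", "/mnt/scratch", "ext4", 0, "data_err=abort") < 0)
                    perror("mount");
            return 0;
    }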
@@ -3796,12 +3799,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3796 sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; 3799 sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
3797 3800
3798no_journal: 3801no_journal:
3799 if (ext4_mballoc_ready) { 3802 sbi->s_mb_cache = ext4_xattr_create_cache();
3800 sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); 3803 if (!sbi->s_mb_cache) {
3801 if (!sbi->s_mb_cache) { 3804 ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
3802 ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); 3805 goto failed_mount_wq;
3803 goto failed_mount_wq;
3804 }
3805 } 3806 }
3806 3807
3807 if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && 3808 if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
@@ -4027,6 +4028,10 @@ failed_mount4:
4027 if (EXT4_SB(sb)->rsv_conversion_wq) 4028 if (EXT4_SB(sb)->rsv_conversion_wq)
4028 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); 4029 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4029failed_mount_wq: 4030failed_mount_wq:
4031 if (sbi->s_mb_cache) {
4032 ext4_xattr_destroy_cache(sbi->s_mb_cache);
4033 sbi->s_mb_cache = NULL;
4034 }
4030 if (sbi->s_journal) { 4035 if (sbi->s_journal) {
4031 jbd2_journal_destroy(sbi->s_journal); 4036 jbd2_journal_destroy(sbi->s_journal);
4032 sbi->s_journal = NULL; 4037 sbi->s_journal = NULL;
@@ -5321,7 +5326,6 @@ MODULE_ALIAS_FS("ext4");
5321 5326
5322/* Shared across all ext4 file systems */ 5327/* Shared across all ext4 file systems */
5323wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; 5328wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
5324struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
5325 5329
5326static int __init ext4_init_fs(void) 5330static int __init ext4_init_fs(void)
5327{ 5331{
@@ -5334,10 +5338,8 @@ static int __init ext4_init_fs(void)
5334 /* Build-time check for flags consistency */ 5338 /* Build-time check for flags consistency */
5335 ext4_check_flag_values(); 5339 ext4_check_flag_values();
5336 5340
5337 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { 5341 for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
5338 mutex_init(&ext4__aio_mutex[i]);
5339 init_waitqueue_head(&ext4__ioend_wq[i]); 5342 init_waitqueue_head(&ext4__ioend_wq[i]);
5340 }
5341 5343
5342 err = ext4_init_es(); 5344 err = ext4_init_es();
5343 if (err) 5345 if (err)
@@ -5358,8 +5360,6 @@ static int __init ext4_init_fs(void)
5358 err = ext4_init_mballoc(); 5360 err = ext4_init_mballoc();
5359 if (err) 5361 if (err)
5360 goto out2; 5362 goto out2;
5361 else
5362 ext4_mballoc_ready = 1;
5363 err = init_inodecache(); 5363 err = init_inodecache();
5364 if (err) 5364 if (err)
5365 goto out1; 5365 goto out1;
@@ -5375,7 +5375,6 @@ out:
5375 unregister_as_ext3(); 5375 unregister_as_ext3();
5376 destroy_inodecache(); 5376 destroy_inodecache();
5377out1: 5377out1:
5378 ext4_mballoc_ready = 0;
5379 ext4_exit_mballoc(); 5378 ext4_exit_mballoc();
5380out2: 5379out2:
5381 ext4_exit_sysfs(); 5380 ext4_exit_sysfs();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index a95151e875bd..0441e055c8e8 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -545,30 +545,44 @@ static void
545ext4_xattr_release_block(handle_t *handle, struct inode *inode, 545ext4_xattr_release_block(handle_t *handle, struct inode *inode,
546 struct buffer_head *bh) 546 struct buffer_head *bh)
547{ 547{
548 struct mb_cache_entry *ce = NULL;
549 int error = 0;
550 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); 548 struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
549 u32 hash, ref;
550 int error = 0;
551 551
552 ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
553 BUFFER_TRACE(bh, "get_write_access"); 552 BUFFER_TRACE(bh, "get_write_access");
554 error = ext4_journal_get_write_access(handle, bh); 553 error = ext4_journal_get_write_access(handle, bh);
555 if (error) 554 if (error)
556 goto out; 555 goto out;
557 556
558 lock_buffer(bh); 557 lock_buffer(bh);
559 if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { 558 hash = le32_to_cpu(BHDR(bh)->h_hash);
559 ref = le32_to_cpu(BHDR(bh)->h_refcount);
560 if (ref == 1) {
560 ea_bdebug(bh, "refcount now=0; freeing"); 561 ea_bdebug(bh, "refcount now=0; freeing");
561 if (ce) 562 /*
562 mb_cache_entry_free(ce); 563 * This must happen under buffer lock for
564 * ext4_xattr_block_set() to reliably detect freed block
565 */
566 mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr);
563 get_bh(bh); 567 get_bh(bh);
564 unlock_buffer(bh); 568 unlock_buffer(bh);
565 ext4_free_blocks(handle, inode, bh, 0, 1, 569 ext4_free_blocks(handle, inode, bh, 0, 1,
566 EXT4_FREE_BLOCKS_METADATA | 570 EXT4_FREE_BLOCKS_METADATA |
567 EXT4_FREE_BLOCKS_FORGET); 571 EXT4_FREE_BLOCKS_FORGET);
568 } else { 572 } else {
569 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 573 ref--;
570 if (ce) 574 BHDR(bh)->h_refcount = cpu_to_le32(ref);
571 mb_cache_entry_release(ce); 575 if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) {
576 struct mb_cache_entry *ce;
577
578 ce = mb_cache_entry_get(ext4_mb_cache, hash,
579 bh->b_blocknr);
580 if (ce) {
581 ce->e_reusable = 1;
582 mb_cache_entry_put(ext4_mb_cache, ce);
583 }
584 }
585
572 /* 586 /*
573 * Beware of this ugliness: Releasing of xattr block references 587 * Beware of this ugliness: Releasing of xattr block references
574 * from different inodes can race and so we have to protect 588 * from different inodes can race and so we have to protect
@@ -790,8 +804,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
790 if (i->value && i->value_len > sb->s_blocksize) 804 if (i->value && i->value_len > sb->s_blocksize)
791 return -ENOSPC; 805 return -ENOSPC;
792 if (s->base) { 806 if (s->base) {
793 ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
794 bs->bh->b_blocknr);
795 BUFFER_TRACE(bs->bh, "get_write_access"); 807 BUFFER_TRACE(bs->bh, "get_write_access");
796 error = ext4_journal_get_write_access(handle, bs->bh); 808 error = ext4_journal_get_write_access(handle, bs->bh);
797 if (error) 809 if (error)
@@ -799,10 +811,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
799 lock_buffer(bs->bh); 811 lock_buffer(bs->bh);
800 812
801 if (header(s->base)->h_refcount == cpu_to_le32(1)) { 813 if (header(s->base)->h_refcount == cpu_to_le32(1)) {
802 if (ce) { 814 __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash);
803 mb_cache_entry_free(ce); 815
804 ce = NULL; 816 /*
805 } 817 * This must happen under buffer lock for
818 * ext4_xattr_block_set() to reliably detect modified
819 * block
820 */
821 mb_cache_entry_delete_block(ext4_mb_cache, hash,
822 bs->bh->b_blocknr);
806 ea_bdebug(bs->bh, "modifying in-place"); 823 ea_bdebug(bs->bh, "modifying in-place");
807 error = ext4_xattr_set_entry(i, s); 824 error = ext4_xattr_set_entry(i, s);
808 if (!error) { 825 if (!error) {
@@ -826,10 +843,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
826 int offset = (char *)s->here - bs->bh->b_data; 843 int offset = (char *)s->here - bs->bh->b_data;
827 844
828 unlock_buffer(bs->bh); 845 unlock_buffer(bs->bh);
829 if (ce) {
830 mb_cache_entry_release(ce);
831 ce = NULL;
832 }
833 ea_bdebug(bs->bh, "cloning"); 846 ea_bdebug(bs->bh, "cloning");
834 s->base = kmalloc(bs->bh->b_size, GFP_NOFS); 847 s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
835 error = -ENOMEM; 848 error = -ENOMEM;
@@ -872,6 +885,8 @@ inserted:
872 if (new_bh == bs->bh) 885 if (new_bh == bs->bh)
873 ea_bdebug(new_bh, "keeping"); 886 ea_bdebug(new_bh, "keeping");
874 else { 887 else {
888 u32 ref;
889
875 /* The old block is released after updating 890 /* The old block is released after updating
876 the inode. */ 891 the inode. */
877 error = dquot_alloc_block(inode, 892 error = dquot_alloc_block(inode,
@@ -884,9 +899,40 @@ inserted:
884 if (error) 899 if (error)
885 goto cleanup_dquot; 900 goto cleanup_dquot;
886 lock_buffer(new_bh); 901 lock_buffer(new_bh);
887 le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); 902 /*
903 * We have to be careful about races with
904 * freeing, rehashing or adding references to
905 * xattr block. Once we hold buffer lock xattr
906 * block's state is stable so we can check
907 * whether the block got freed / rehashed or
908 * not. Since we unhash mbcache entry under
909 * buffer lock when freeing / rehashing xattr
910 * block, checking whether entry is still
911 * hashed is reliable. Same rules hold for
912 * e_reusable handling.
913 */
914 if (hlist_bl_unhashed(&ce->e_hash_list) ||
915 !ce->e_reusable) {
916 /*
917 * Undo everything and check mbcache
918 * again.
919 */
920 unlock_buffer(new_bh);
921 dquot_free_block(inode,
922 EXT4_C2B(EXT4_SB(sb),
923 1));
924 brelse(new_bh);
925 mb_cache_entry_put(ext4_mb_cache, ce);
926 ce = NULL;
927 new_bh = NULL;
928 goto inserted;
929 }
930 ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
931 BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
932 if (ref >= EXT4_XATTR_REFCOUNT_MAX)
933 ce->e_reusable = 0;
888 ea_bdebug(new_bh, "reusing; refcount now=%d", 934 ea_bdebug(new_bh, "reusing; refcount now=%d",
889 le32_to_cpu(BHDR(new_bh)->h_refcount)); 935 ref);
890 unlock_buffer(new_bh); 936 unlock_buffer(new_bh);
891 error = ext4_handle_dirty_xattr_block(handle, 937 error = ext4_handle_dirty_xattr_block(handle,
892 inode, 938 inode,
@@ -894,7 +940,8 @@ inserted:
894 if (error) 940 if (error)
895 goto cleanup_dquot; 941 goto cleanup_dquot;
896 } 942 }
897 mb_cache_entry_release(ce); 943 mb_cache_entry_touch(ext4_mb_cache, ce);
944 mb_cache_entry_put(ext4_mb_cache, ce);
898 ce = NULL; 945 ce = NULL;
899 } else if (bs->bh && s->base == bs->bh->b_data) { 946 } else if (bs->bh && s->base == bs->bh->b_data) {
900 /* We were modifying this block in-place. */ 947 /* We were modifying this block in-place. */
@@ -959,7 +1006,7 @@ getblk_failed:
959 1006
960cleanup: 1007cleanup:
961 if (ce) 1008 if (ce)
962 mb_cache_entry_release(ce); 1009 mb_cache_entry_put(ext4_mb_cache, ce);
963 brelse(new_bh); 1010 brelse(new_bh);
964 if (!(bs->bh && s->base == bs->bh->b_data)) 1011 if (!(bs->bh && s->base == bs->bh->b_data))
965 kfree(s->base); 1012 kfree(s->base);
@@ -1070,6 +1117,17 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
1070 return 0; 1117 return 0;
1071} 1118}
1072 1119
1120static int ext4_xattr_value_same(struct ext4_xattr_search *s,
1121 struct ext4_xattr_info *i)
1122{
1123 void *value;
1124
1125 if (le32_to_cpu(s->here->e_value_size) != i->value_len)
1126 return 0;
1127 value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs);
1128 return !memcmp(value, i->value, i->value_len);
1129}
1130
1073/* 1131/*
1074 * ext4_xattr_set_handle() 1132 * ext4_xattr_set_handle()
1075 * 1133 *
@@ -1146,6 +1204,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1146 else if (!bs.s.not_found) 1204 else if (!bs.s.not_found)
1147 error = ext4_xattr_block_set(handle, inode, &i, &bs); 1205 error = ext4_xattr_block_set(handle, inode, &i, &bs);
1148 } else { 1206 } else {
1207 error = 0;
1208 /* Xattr value did not change? Save us some work and bail out */
1209 if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i))
1210 goto cleanup;
1211 if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i))
1212 goto cleanup;
1213
1149 error = ext4_xattr_ibody_set(handle, inode, &i, &is); 1214 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
1150 if (!error && !bs.s.not_found) { 1215 if (!error && !bs.s.not_found) {
1151 i.value = NULL; 1216 i.value = NULL;
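
The ext4_xattr_value_same() shortcut above lets ext4_xattr_set_handle() bail out before touching any xattr storage when the new value is byte-for-byte identical to the current one, so a repeated setxattr(2) like the following (path and attribute name are illustrative) no longer dirties the inode:

    #include <string.h>
    #include <sys/xattr.h>

    int tag_file(const char *path)
    {
            const char *val = "backed-up";

            /* Re-setting an identical user.* value now hits the "value did
             * not change" shortcut and returns without modifying the inode. */
            return setxattr(path, "user.backup_state", val, strlen(val), 0);
    }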
@@ -1512,17 +1577,6 @@ cleanup:
1512} 1577}
1513 1578
1514/* 1579/*
1515 * ext4_xattr_put_super()
1516 *
1517 * This is called when a file system is unmounted.
1518 */
1519void
1520ext4_xattr_put_super(struct super_block *sb)
1521{
1522 mb_cache_shrink(sb->s_bdev);
1523}
1524
1525/*
1526 * ext4_xattr_cache_insert() 1580 * ext4_xattr_cache_insert()
1527 * 1581 *
1528 * Create a new entry in the extended attribute cache, and insert 1582 * Create a new entry in the extended attribute cache, and insert
@@ -1533,26 +1587,19 @@ ext4_xattr_put_super(struct super_block *sb)
1533static void 1587static void
1534ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) 1588ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
1535{ 1589{
1536 __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); 1590 struct ext4_xattr_header *header = BHDR(bh);
1537 struct mb_cache_entry *ce; 1591 __u32 hash = le32_to_cpu(header->h_hash);
1592 int reusable = le32_to_cpu(header->h_refcount) <
1593 EXT4_XATTR_REFCOUNT_MAX;
1538 int error; 1594 int error;
1539 1595
1540 ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); 1596 error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash,
1541 if (!ce) { 1597 bh->b_blocknr, reusable);
1542 ea_bdebug(bh, "out of memory");
1543 return;
1544 }
1545 error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
1546 if (error) { 1598 if (error) {
1547 mb_cache_entry_free(ce); 1599 if (error == -EBUSY)
1548 if (error == -EBUSY) {
1549 ea_bdebug(bh, "already in cache"); 1600 ea_bdebug(bh, "already in cache");
1550 error = 0; 1601 } else
1551 }
1552 } else {
1553 ea_bdebug(bh, "inserting [%x]", (int)hash); 1602 ea_bdebug(bh, "inserting [%x]", (int)hash);
1554 mb_cache_entry_release(ce);
1555 }
1556} 1603}
1557 1604
1558/* 1605/*
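
The rewritten helper above is the insertion half of the new mbcache API: one mb_cache_entry_create() call replaces the old alloc/insert/release triple, and the refcount comparison that used to sit in the lookup loop is folded into the "reusable" flag computed at insertion time. A hedged sketch of the same pattern for a generic caller follows; the cache, buffer and hash are whatever the filesystem already has.

#include <linux/mbcache.h>
#include <linux/buffer_head.h>

/* Sketch only. On success the caller gets no reference back; the hash
 * table and the cache's LRU list hold theirs, so there is nothing to
 * release here, unlike with the old mb_cache_entry_insert(). */
static void example_cache_insert(struct mb_cache *cache,
				 struct buffer_head *bh, u32 hash,
				 bool reusable)
{
	int error = mb_cache_entry_create(cache, GFP_NOFS, hash,
					  bh->b_blocknr, reusable);

	if (error && error != -EBUSY)	/* -EBUSY: same key+block cached */
		pr_debug("mbcache insert failed: %d\n", error);
}
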
@@ -1614,33 +1661,20 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
1614 if (!header->h_hash) 1661 if (!header->h_hash)
1615 return NULL; /* never share */ 1662 return NULL; /* never share */
1616 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); 1663 ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
1617again: 1664 ce = mb_cache_entry_find_first(ext4_mb_cache, hash);
1618 ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
1619 hash);
1620 while (ce) { 1665 while (ce) {
1621 struct buffer_head *bh; 1666 struct buffer_head *bh;
1622 1667
1623 if (IS_ERR(ce)) {
1624 if (PTR_ERR(ce) == -EAGAIN)
1625 goto again;
1626 break;
1627 }
1628 bh = sb_bread(inode->i_sb, ce->e_block); 1668 bh = sb_bread(inode->i_sb, ce->e_block);
1629 if (!bh) { 1669 if (!bh) {
1630 EXT4_ERROR_INODE(inode, "block %lu read error", 1670 EXT4_ERROR_INODE(inode, "block %lu read error",
1631 (unsigned long) ce->e_block); 1671 (unsigned long) ce->e_block);
1632 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1633 EXT4_XATTR_REFCOUNT_MAX) {
1634 ea_idebug(inode, "block %lu refcount %d>=%d",
1635 (unsigned long) ce->e_block,
1636 le32_to_cpu(BHDR(bh)->h_refcount),
1637 EXT4_XATTR_REFCOUNT_MAX);
1638 } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { 1672 } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
1639 *pce = ce; 1673 *pce = ce;
1640 return bh; 1674 return bh;
1641 } 1675 }
1642 brelse(bh); 1676 brelse(bh);
1643 ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); 1677 ce = mb_cache_entry_find_next(ext4_mb_cache, ce);
1644 } 1678 }
1645 return NULL; 1679 return NULL;
1646} 1680}
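
The lookup side above loses the block_device argument and the -EAGAIN retry loop: caches are per filesystem now, and over-shared blocks are filtered out by the "reusable" flag instead of an in-loop refcount check. Below is a minimal sketch of the new iteration idiom; the names are illustrative, and unlike the real ext4_xattr_cache_find() it drops the entry reference as soon as a block is read.

#include <linux/mbcache.h>
#include <linux/buffer_head.h>

/* mb_cache_entry_find_first()/_find_next() return entries with an
 * elevated refcount; _find_next() drops the reference on the entry it
 * was handed, so only the final hit needs an explicit put. */
static struct buffer_head *
example_cache_find(struct super_block *sb, struct mb_cache *cache, u32 hash)
{
	struct mb_cache_entry *ce;

	ce = mb_cache_entry_find_first(cache, hash);
	while (ce) {
		struct buffer_head *bh = sb_bread(sb, ce->e_block);

		if (bh) {
			/* A real caller compares block contents here and
			 * may keep the reference until it is done. */
			mb_cache_entry_touch(cache, ce);
			mb_cache_entry_put(cache, ce);
			return bh;
		}
		ce = mb_cache_entry_find_next(cache, ce);
	}
	return NULL;
}
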
@@ -1716,9 +1750,9 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
1716#define HASH_BUCKET_BITS 10 1750#define HASH_BUCKET_BITS 10
1717 1751
1718struct mb_cache * 1752struct mb_cache *
1719ext4_xattr_create_cache(char *name) 1753ext4_xattr_create_cache(void)
1720{ 1754{
1721 return mb_cache_create(name, HASH_BUCKET_BITS); 1755 return mb_cache_create(HASH_BUCKET_BITS);
1722} 1756}
1723 1757
1724void ext4_xattr_destroy_cache(struct mb_cache *cache) 1758void ext4_xattr_destroy_cache(struct mb_cache *cache)
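
mb_cache_create() no longer takes a name, because the entry slab is shared by all caches, so ext4_xattr_create_cache() shrinks to a zero-argument wrapper. The following is a hedged sketch of the cache lifecycle built on these two helpers; the static pointer and the mount/unmount hooks are placeholders for wherever a filesystem keeps its per-superblock cache.

#include "xattr.h"	/* sketch assumes fs/ext4 internal headers */

static struct mb_cache *example_xattr_cache;

static int example_fill_super(void)
{
	example_xattr_cache = ext4_xattr_create_cache();
	return example_xattr_cache ? 0 : -ENOMEM;
}

static void example_put_super(void)
{
	/* No ext4_xattr_put_super()/mb_cache_shrink() step remains:
	 * destroying the cache drops every entry it still tracks. */
	if (example_xattr_cache) {
		ext4_xattr_destroy_cache(example_xattr_cache);
		example_xattr_cache = NULL;
	}
}
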
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index ddc0957760ba..69dd3e6566e0 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_
108extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); 108extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
109 109
110extern void ext4_xattr_delete_inode(handle_t *, struct inode *); 110extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
111extern void ext4_xattr_put_super(struct super_block *);
112 111
113extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, 112extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
114 struct ext4_inode *raw_inode, handle_t *handle); 113 struct ext4_inode *raw_inode, handle_t *handle);
@@ -124,7 +123,7 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
124 struct ext4_xattr_info *i, 123 struct ext4_xattr_info *i,
125 struct ext4_xattr_ibody_find *is); 124 struct ext4_xattr_ibody_find *is);
126 125
127extern struct mb_cache *ext4_xattr_create_cache(char *name); 126extern struct mb_cache *ext4_xattr_create_cache(void);
128extern void ext4_xattr_destroy_cache(struct mb_cache *); 127extern void ext4_xattr_destroy_cache(struct mb_cache *);
129 128
130#ifdef CONFIG_EXT4_FS_SECURITY 129#ifdef CONFIG_EXT4_FS_SECURITY
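
Besides the find_first/find_next iteration, the reworked mbcache (fs/mbcache.c below) exposes an exact key-plus-block lookup and a targeted delete; these back the paths where an existing xattr block is re-read or freed. A hedged sketch of that pair, with placeholder names:

#include <linux/mbcache.h>

/* Sketch only: look up the entry for a known key/block pair, then make
 * the cache forget about the block entirely. */
static void example_forget_block(struct mb_cache *cache, u32 hash,
				 sector_t block)
{
	struct mb_cache_entry *ce;

	ce = mb_cache_entry_get(cache, hash, block);
	if (ce) {
		/* A caller that still holds the block could adjust the
		 * entry here before dropping its reference. */
		mb_cache_entry_put(cache, ce);
	}
	mb_cache_entry_delete_block(cache, hash, block);
}
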
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 36345fefa3ff..517f2de784cf 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -131,14 +131,12 @@ static int journal_submit_commit_record(journal_t *journal,
131 if (is_journal_aborted(journal)) 131 if (is_journal_aborted(journal))
132 return 0; 132 return 0;
133 133
134 bh = jbd2_journal_get_descriptor_buffer(journal); 134 bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
135 JBD2_COMMIT_BLOCK);
135 if (!bh) 136 if (!bh)
136 return 1; 137 return 1;
137 138
138 tmp = (struct commit_header *)bh->b_data; 139 tmp = (struct commit_header *)bh->b_data;
139 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
140 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
141 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
142 tmp->h_commit_sec = cpu_to_be64(now.tv_sec); 140 tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
143 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); 141 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
144 142
@@ -222,7 +220,7 @@ static int journal_submit_data_buffers(journal_t *journal,
222 spin_lock(&journal->j_list_lock); 220 spin_lock(&journal->j_list_lock);
223 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 221 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
224 mapping = jinode->i_vfs_inode->i_mapping; 222 mapping = jinode->i_vfs_inode->i_mapping;
225 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 223 jinode->i_flags |= JI_COMMIT_RUNNING;
226 spin_unlock(&journal->j_list_lock); 224 spin_unlock(&journal->j_list_lock);
227 /* 225 /*
228 * submit the inode data buffers. We use writepage 226 * submit the inode data buffers. We use writepage
@@ -236,8 +234,8 @@ static int journal_submit_data_buffers(journal_t *journal,
236 ret = err; 234 ret = err;
237 spin_lock(&journal->j_list_lock); 235 spin_lock(&journal->j_list_lock);
238 J_ASSERT(jinode->i_transaction == commit_transaction); 236 J_ASSERT(jinode->i_transaction == commit_transaction);
239 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 237 jinode->i_flags &= ~JI_COMMIT_RUNNING;
240 smp_mb__after_atomic(); 238 smp_mb();
241 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 239 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
242 } 240 }
243 spin_unlock(&journal->j_list_lock); 241 spin_unlock(&journal->j_list_lock);
@@ -258,7 +256,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
258 /* For locking, see the comment in journal_submit_data_buffers() */ 256 /* For locking, see the comment in journal_submit_data_buffers() */
259 spin_lock(&journal->j_list_lock); 257 spin_lock(&journal->j_list_lock);
260 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 258 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
261 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 259 jinode->i_flags |= JI_COMMIT_RUNNING;
262 spin_unlock(&journal->j_list_lock); 260 spin_unlock(&journal->j_list_lock);
263 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); 261 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
264 if (err) { 262 if (err) {
@@ -274,8 +272,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
274 ret = err; 272 ret = err;
275 } 273 }
276 spin_lock(&journal->j_list_lock); 274 spin_lock(&journal->j_list_lock);
277 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); 275 jinode->i_flags &= ~JI_COMMIT_RUNNING;
278 smp_mb__after_atomic(); 276 smp_mb();
279 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 277 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
280 } 278 }
281 279
@@ -319,22 +317,6 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
319 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); 317 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
320} 318}
321 319
322static void jbd2_descr_block_csum_set(journal_t *j,
323 struct buffer_head *bh)
324{
325 struct jbd2_journal_block_tail *tail;
326 __u32 csum;
327
328 if (!jbd2_journal_has_csum_v2or3(j))
329 return;
330
331 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
332 sizeof(struct jbd2_journal_block_tail));
333 tail->t_checksum = 0;
334 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
335 tail->t_checksum = cpu_to_be32(csum);
336}
337
338static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, 320static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
339 struct buffer_head *bh, __u32 sequence) 321 struct buffer_head *bh, __u32 sequence)
340{ 322{
@@ -379,7 +361,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
379 ktime_t start_time; 361 ktime_t start_time;
380 u64 commit_time; 362 u64 commit_time;
381 char *tagp = NULL; 363 char *tagp = NULL;
382 journal_header_t *header;
383 journal_block_tag_t *tag = NULL; 364 journal_block_tag_t *tag = NULL;
384 int space_left = 0; 365 int space_left = 0;
385 int first_tag = 0; 366 int first_tag = 0;
@@ -554,8 +535,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
554 jbd2_journal_abort(journal, err); 535 jbd2_journal_abort(journal, err);
555 536
556 blk_start_plug(&plug); 537 blk_start_plug(&plug);
557 jbd2_journal_write_revoke_records(journal, commit_transaction, 538 jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
558 &log_bufs, WRITE_SYNC);
559 539
560 jbd_debug(3, "JBD2: commit phase 2b\n"); 540 jbd_debug(3, "JBD2: commit phase 2b\n");
561 541
@@ -616,7 +596,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
616 596
617 jbd_debug(4, "JBD2: get descriptor\n"); 597 jbd_debug(4, "JBD2: get descriptor\n");
618 598
619 descriptor = jbd2_journal_get_descriptor_buffer(journal); 599 descriptor = jbd2_journal_get_descriptor_buffer(
600 commit_transaction,
601 JBD2_DESCRIPTOR_BLOCK);
620 if (!descriptor) { 602 if (!descriptor) {
621 jbd2_journal_abort(journal, -EIO); 603 jbd2_journal_abort(journal, -EIO);
622 continue; 604 continue;
@@ -625,11 +607,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
625 jbd_debug(4, "JBD2: got buffer %llu (%p)\n", 607 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
626 (unsigned long long)descriptor->b_blocknr, 608 (unsigned long long)descriptor->b_blocknr,
627 descriptor->b_data); 609 descriptor->b_data);
628 header = (journal_header_t *)descriptor->b_data;
629 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
630 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
631 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
632
633 tagp = &descriptor->b_data[sizeof(journal_header_t)]; 610 tagp = &descriptor->b_data[sizeof(journal_header_t)];
634 space_left = descriptor->b_size - 611 space_left = descriptor->b_size -
635 sizeof(journal_header_t); 612 sizeof(journal_header_t);
@@ -721,7 +698,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
721 698
722 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); 699 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
723 700
724 jbd2_descr_block_csum_set(journal, descriptor); 701 jbd2_descriptor_block_csum_set(journal, descriptor);
725start_journal_io: 702start_journal_io:
726 for (i = 0; i < bufs; i++) { 703 for (i = 0; i < bufs; i++) {
727 struct buffer_head *bh = wbuf[i]; 704 struct buffer_head *bh = wbuf[i];
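
jbd2_journal_get_descriptor_buffer() now takes the transaction and the block type and stamps the journal_header_t (magic, blocktype, sequence) itself, and the old per-caller checksum helpers collapse into the shared jbd2_descriptor_block_csum_set(). A hedged sketch of the resulting caller pattern, with the tag payload elided:

#include <linux/jbd2.h>

/* Sketch only: obtain a pre-stamped descriptor block, add the payload,
 * then seal it with the common checksum helper. */
static struct buffer_head *
example_descriptor(transaction_t *commit_transaction)
{
	journal_t *journal = commit_transaction->t_journal;
	struct buffer_head *bh;

	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
						JBD2_DESCRIPTOR_BLOCK);
	if (!bh)
		return NULL;	/* caller aborts the journal */
	/* ... fill in journal_block_tag_t entries after the header ... */
	jbd2_descriptor_block_csum_set(journal, bh);
	return bh;
}
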
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 81e622681c82..de73a9516a54 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -805,10 +805,13 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
805 * But we don't bother doing that, so there will be coherency problems with 805 * But we don't bother doing that, so there will be coherency problems with
806 * mmaps of blockdevs which hold live JBD-controlled filesystems. 806 * mmaps of blockdevs which hold live JBD-controlled filesystems.
807 */ 807 */
808struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) 808struct buffer_head *
809jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
809{ 810{
811 journal_t *journal = transaction->t_journal;
810 struct buffer_head *bh; 812 struct buffer_head *bh;
811 unsigned long long blocknr; 813 unsigned long long blocknr;
814 journal_header_t *header;
812 int err; 815 int err;
813 816
814 err = jbd2_journal_next_log_block(journal, &blocknr); 817 err = jbd2_journal_next_log_block(journal, &blocknr);
@@ -821,12 +824,31 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
821 return NULL; 824 return NULL;
822 lock_buffer(bh); 825 lock_buffer(bh);
823 memset(bh->b_data, 0, journal->j_blocksize); 826 memset(bh->b_data, 0, journal->j_blocksize);
827 header = (journal_header_t *)bh->b_data;
828 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
829 header->h_blocktype = cpu_to_be32(type);
830 header->h_sequence = cpu_to_be32(transaction->t_tid);
824 set_buffer_uptodate(bh); 831 set_buffer_uptodate(bh);
825 unlock_buffer(bh); 832 unlock_buffer(bh);
826 BUFFER_TRACE(bh, "return this buffer"); 833 BUFFER_TRACE(bh, "return this buffer");
827 return bh; 834 return bh;
828} 835}
829 836
837void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
838{
839 struct jbd2_journal_block_tail *tail;
840 __u32 csum;
841
842 if (!jbd2_journal_has_csum_v2or3(j))
843 return;
844
845 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
846 sizeof(struct jbd2_journal_block_tail));
847 tail->t_checksum = 0;
848 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
849 tail->t_checksum = cpu_to_be32(csum);
850}
851
830/* 852/*
831 * Return tid of the oldest transaction in the journal and block in the journal 853 * Return tid of the oldest transaction in the journal and block in the journal
832 * where the transaction starts. 854 * where the transaction starts.
@@ -1408,11 +1430,12 @@ out:
1408/** 1430/**
1409 * jbd2_mark_journal_empty() - Mark on disk journal as empty. 1431 * jbd2_mark_journal_empty() - Mark on disk journal as empty.
1410 * @journal: The journal to update. 1432 * @journal: The journal to update.
1433 * @write_op: With which operation should we write the journal sb
1411 * 1434 *
1412 * Update a journal's dynamic superblock fields to show that journal is empty. 1435 * Update a journal's dynamic superblock fields to show that journal is empty.
1413 * Write updated superblock to disk waiting for IO to complete. 1436 * Write updated superblock to disk waiting for IO to complete.
1414 */ 1437 */
1415static void jbd2_mark_journal_empty(journal_t *journal) 1438static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
1416{ 1439{
1417 journal_superblock_t *sb = journal->j_superblock; 1440 journal_superblock_t *sb = journal->j_superblock;
1418 1441
@@ -1430,7 +1453,7 @@ static void jbd2_mark_journal_empty(journal_t *journal)
1430 sb->s_start = cpu_to_be32(0); 1453 sb->s_start = cpu_to_be32(0);
1431 read_unlock(&journal->j_state_lock); 1454 read_unlock(&journal->j_state_lock);
1432 1455
1433 jbd2_write_superblock(journal, WRITE_FUA); 1456 jbd2_write_superblock(journal, write_op);
1434 1457
1435 /* Log is no longer empty */ 1458 /* Log is no longer empty */
1436 write_lock(&journal->j_state_lock); 1459 write_lock(&journal->j_state_lock);
@@ -1716,7 +1739,13 @@ int jbd2_journal_destroy(journal_t *journal)
1716 if (journal->j_sb_buffer) { 1739 if (journal->j_sb_buffer) {
1717 if (!is_journal_aborted(journal)) { 1740 if (!is_journal_aborted(journal)) {
1718 mutex_lock(&journal->j_checkpoint_mutex); 1741 mutex_lock(&journal->j_checkpoint_mutex);
1719 jbd2_mark_journal_empty(journal); 1742
1743 write_lock(&journal->j_state_lock);
1744 journal->j_tail_sequence =
1745 ++journal->j_transaction_sequence;
1746 write_unlock(&journal->j_state_lock);
1747
1748 jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA);
1720 mutex_unlock(&journal->j_checkpoint_mutex); 1749 mutex_unlock(&journal->j_checkpoint_mutex);
1721 } else 1750 } else
1722 err = -EIO; 1751 err = -EIO;
@@ -1975,7 +2004,7 @@ int jbd2_journal_flush(journal_t *journal)
1975 * the magic code for a fully-recovered superblock. Any future 2004 * the magic code for a fully-recovered superblock. Any future
1976 * commits of data to the journal will restore the current 2005 * commits of data to the journal will restore the current
1977 * s_start value. */ 2006 * s_start value. */
1978 jbd2_mark_journal_empty(journal); 2007 jbd2_mark_journal_empty(journal, WRITE_FUA);
1979 mutex_unlock(&journal->j_checkpoint_mutex); 2008 mutex_unlock(&journal->j_checkpoint_mutex);
1980 write_lock(&journal->j_state_lock); 2009 write_lock(&journal->j_state_lock);
1981 J_ASSERT(!journal->j_running_transaction); 2010 J_ASSERT(!journal->j_running_transaction);
@@ -2021,7 +2050,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
2021 if (write) { 2050 if (write) {
2022 /* Lock to make assertions happy... */ 2051 /* Lock to make assertions happy... */
2023 mutex_lock(&journal->j_checkpoint_mutex); 2052 mutex_lock(&journal->j_checkpoint_mutex);
2024 jbd2_mark_journal_empty(journal); 2053 jbd2_mark_journal_empty(journal, WRITE_FUA);
2025 mutex_unlock(&journal->j_checkpoint_mutex); 2054 mutex_unlock(&journal->j_checkpoint_mutex);
2026 } 2055 }
2027 2056
@@ -2565,7 +2594,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal,
2565restart: 2594restart:
2566 spin_lock(&journal->j_list_lock); 2595 spin_lock(&journal->j_list_lock);
2567 /* Is commit writing out inode - we have to wait */ 2596 /* Is commit writing out inode - we have to wait */
2568 if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) { 2597 if (jinode->i_flags & JI_COMMIT_RUNNING) {
2569 wait_queue_head_t *wq; 2598 wait_queue_head_t *wq;
2570 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); 2599 DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
2571 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); 2600 wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
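
The journal inode flag handling above trades the atomic set_bit()/clear_bit() pair for plain bit operations done under j_list_lock, with an explicit smp_mb() before the wake-up; waiters still sleep on the __JI_COMMIT_RUNNING bit waitqueue, as the jbd2_journal_release_jbd_inode() hunk shows. A hedged sketch of the writer side of that protocol:

#include <linux/jbd2.h>
#include <linux/wait.h>

/* Sketch only: clear JI_COMMIT_RUNNING under j_list_lock, publish the
 * change with smp_mb(), then wake anyone sleeping on the bit. */
static void example_commit_done(journal_t *journal,
				struct jbd2_inode *jinode)
{
	spin_lock(&journal->j_list_lock);
	jinode->i_flags &= ~JI_COMMIT_RUNNING;
	smp_mb();
	wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	spin_unlock(&journal->j_list_lock);
}
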
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 7f277e49fe88..08a456b96e4e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -174,8 +174,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
174 return 0; 174 return 0;
175} 175}
176 176
177static int jbd2_descr_block_csum_verify(journal_t *j, 177static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf)
178 void *buf)
179{ 178{
180 struct jbd2_journal_block_tail *tail; 179 struct jbd2_journal_block_tail *tail;
181 __be32 provided; 180 __be32 provided;
@@ -522,8 +521,8 @@ static int do_one_pass(journal_t *journal,
522 descr_csum_size = 521 descr_csum_size =
523 sizeof(struct jbd2_journal_block_tail); 522 sizeof(struct jbd2_journal_block_tail);
524 if (descr_csum_size > 0 && 523 if (descr_csum_size > 0 &&
525 !jbd2_descr_block_csum_verify(journal, 524 !jbd2_descriptor_block_csum_verify(journal,
526 bh->b_data)) { 525 bh->b_data)) {
527 printk(KERN_ERR "JBD2: Invalid checksum " 526 printk(KERN_ERR "JBD2: Invalid checksum "
528 "recovering block %lu in log\n", 527 "recovering block %lu in log\n",
529 next_log_block); 528 next_log_block);
@@ -811,26 +810,6 @@ static int do_one_pass(journal_t *journal,
811 return err; 810 return err;
812} 811}
813 812
814static int jbd2_revoke_block_csum_verify(journal_t *j,
815 void *buf)
816{
817 struct jbd2_journal_revoke_tail *tail;
818 __be32 provided;
819 __u32 calculated;
820
821 if (!jbd2_journal_has_csum_v2or3(j))
822 return 1;
823
824 tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
825 sizeof(struct jbd2_journal_revoke_tail));
826 provided = tail->r_checksum;
827 tail->r_checksum = 0;
828 calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
829 tail->r_checksum = provided;
830
831 return provided == cpu_to_be32(calculated);
832}
833
834/* Scan a revoke record, marking all blocks mentioned as revoked. */ 813/* Scan a revoke record, marking all blocks mentioned as revoked. */
835 814
836static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, 815static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
@@ -846,11 +825,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
846 offset = sizeof(jbd2_journal_revoke_header_t); 825 offset = sizeof(jbd2_journal_revoke_header_t);
847 rcount = be32_to_cpu(header->r_count); 826 rcount = be32_to_cpu(header->r_count);
848 827
849 if (!jbd2_revoke_block_csum_verify(journal, header)) 828 if (!jbd2_descriptor_block_csum_verify(journal, header))
850 return -EFSBADCRC; 829 return -EFSBADCRC;
851 830
852 if (jbd2_journal_has_csum_v2or3(journal)) 831 if (jbd2_journal_has_csum_v2or3(journal))
853 csum_size = sizeof(struct jbd2_journal_revoke_tail); 832 csum_size = sizeof(struct jbd2_journal_block_tail);
854 if (rcount > journal->j_blocksize - csum_size) 833 if (rcount > journal->j_blocksize - csum_size)
855 return -EINVAL; 834 return -EINVAL;
856 max = rcount; 835 max = rcount;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 705ae577882b..91171dc352cb 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -122,11 +122,11 @@ struct jbd2_revoke_table_s
122 122
123 123
124#ifdef __KERNEL__ 124#ifdef __KERNEL__
125static void write_one_revoke_record(journal_t *, transaction_t *, 125static void write_one_revoke_record(transaction_t *,
126 struct list_head *, 126 struct list_head *,
127 struct buffer_head **, int *, 127 struct buffer_head **, int *,
128 struct jbd2_revoke_record_s *, int); 128 struct jbd2_revoke_record_s *);
129static void flush_descriptor(journal_t *, struct buffer_head *, int, int); 129static void flush_descriptor(journal_t *, struct buffer_head *, int);
130#endif 130#endif
131 131
132/* Utility functions to maintain the revoke table */ 132/* Utility functions to maintain the revoke table */
@@ -519,11 +519,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
519 * Write revoke records to the journal for all entries in the current 519 * Write revoke records to the journal for all entries in the current
520 * revoke hash, deleting the entries as we go. 520 * revoke hash, deleting the entries as we go.
521 */ 521 */
522void jbd2_journal_write_revoke_records(journal_t *journal, 522void jbd2_journal_write_revoke_records(transaction_t *transaction,
523 transaction_t *transaction, 523 struct list_head *log_bufs)
524 struct list_head *log_bufs,
525 int write_op)
526{ 524{
525 journal_t *journal = transaction->t_journal;
527 struct buffer_head *descriptor; 526 struct buffer_head *descriptor;
528 struct jbd2_revoke_record_s *record; 527 struct jbd2_revoke_record_s *record;
529 struct jbd2_revoke_table_s *revoke; 528 struct jbd2_revoke_table_s *revoke;
@@ -544,16 +543,15 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
544 while (!list_empty(hash_list)) { 543 while (!list_empty(hash_list)) {
545 record = (struct jbd2_revoke_record_s *) 544 record = (struct jbd2_revoke_record_s *)
546 hash_list->next; 545 hash_list->next;
547 write_one_revoke_record(journal, transaction, log_bufs, 546 write_one_revoke_record(transaction, log_bufs,
548 &descriptor, &offset, 547 &descriptor, &offset, record);
549 record, write_op);
550 count++; 548 count++;
551 list_del(&record->hash); 549 list_del(&record->hash);
552 kmem_cache_free(jbd2_revoke_record_cache, record); 550 kmem_cache_free(jbd2_revoke_record_cache, record);
553 } 551 }
554 } 552 }
555 if (descriptor) 553 if (descriptor)
556 flush_descriptor(journal, descriptor, offset, write_op); 554 flush_descriptor(journal, descriptor, offset);
557 jbd_debug(1, "Wrote %d revoke records\n", count); 555 jbd_debug(1, "Wrote %d revoke records\n", count);
558} 556}
559 557
@@ -562,18 +560,16 @@ void jbd2_journal_write_revoke_records(journal_t *journal,
562 * block if the old one is full or if we have not already created one. 560 * block if the old one is full or if we have not already created one.
563 */ 561 */
564 562
565static void write_one_revoke_record(journal_t *journal, 563static void write_one_revoke_record(transaction_t *transaction,
566 transaction_t *transaction,
567 struct list_head *log_bufs, 564 struct list_head *log_bufs,
568 struct buffer_head **descriptorp, 565 struct buffer_head **descriptorp,
569 int *offsetp, 566 int *offsetp,
570 struct jbd2_revoke_record_s *record, 567 struct jbd2_revoke_record_s *record)
571 int write_op)
572{ 568{
569 journal_t *journal = transaction->t_journal;
573 int csum_size = 0; 570 int csum_size = 0;
574 struct buffer_head *descriptor; 571 struct buffer_head *descriptor;
575 int sz, offset; 572 int sz, offset;
576 journal_header_t *header;
577 573
578 /* If we are already aborting, this all becomes a noop. We 574 /* If we are already aborting, this all becomes a noop. We
579 still need to go round the loop in 575 still need to go round the loop in
@@ -587,7 +583,7 @@ static void write_one_revoke_record(journal_t *journal,
587 583
588 /* Do we need to leave space at the end for a checksum? */ 584 /* Do we need to leave space at the end for a checksum? */
589 if (jbd2_journal_has_csum_v2or3(journal)) 585 if (jbd2_journal_has_csum_v2or3(journal))
590 csum_size = sizeof(struct jbd2_journal_revoke_tail); 586 csum_size = sizeof(struct jbd2_journal_block_tail);
591 587
592 if (jbd2_has_feature_64bit(journal)) 588 if (jbd2_has_feature_64bit(journal))
593 sz = 8; 589 sz = 8;
@@ -597,19 +593,16 @@ static void write_one_revoke_record(journal_t *journal,
597 /* Make sure we have a descriptor with space left for the record */ 593 /* Make sure we have a descriptor with space left for the record */
598 if (descriptor) { 594 if (descriptor) {
599 if (offset + sz > journal->j_blocksize - csum_size) { 595 if (offset + sz > journal->j_blocksize - csum_size) {
600 flush_descriptor(journal, descriptor, offset, write_op); 596 flush_descriptor(journal, descriptor, offset);
601 descriptor = NULL; 597 descriptor = NULL;
602 } 598 }
603 } 599 }
604 600
605 if (!descriptor) { 601 if (!descriptor) {
606 descriptor = jbd2_journal_get_descriptor_buffer(journal); 602 descriptor = jbd2_journal_get_descriptor_buffer(transaction,
603 JBD2_REVOKE_BLOCK);
607 if (!descriptor) 604 if (!descriptor)
608 return; 605 return;
609 header = (journal_header_t *)descriptor->b_data;
610 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
611 header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK);
612 header->h_sequence = cpu_to_be32(transaction->t_tid);
613 606
614 /* Record it so that we can wait for IO completion later */ 607 /* Record it so that we can wait for IO completion later */
615 BUFFER_TRACE(descriptor, "file in log_bufs"); 608 BUFFER_TRACE(descriptor, "file in log_bufs");
@@ -630,21 +623,6 @@ static void write_one_revoke_record(journal_t *journal,
630 *offsetp = offset; 623 *offsetp = offset;
631} 624}
632 625
633static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
634{
635 struct jbd2_journal_revoke_tail *tail;
636 __u32 csum;
637
638 if (!jbd2_journal_has_csum_v2or3(j))
639 return;
640
641 tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
642 sizeof(struct jbd2_journal_revoke_tail));
643 tail->r_checksum = 0;
644 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
645 tail->r_checksum = cpu_to_be32(csum);
646}
647
648/* 626/*
649 * Flush a revoke descriptor out to the journal. If we are aborting, 627 * Flush a revoke descriptor out to the journal. If we are aborting,
650 * this is a noop; otherwise we are generating a buffer which needs to 628 * this is a noop; otherwise we are generating a buffer which needs to
@@ -654,7 +632,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
654 632
655static void flush_descriptor(journal_t *journal, 633static void flush_descriptor(journal_t *journal,
656 struct buffer_head *descriptor, 634 struct buffer_head *descriptor,
657 int offset, int write_op) 635 int offset)
658{ 636{
659 jbd2_journal_revoke_header_t *header; 637 jbd2_journal_revoke_header_t *header;
660 638
@@ -665,12 +643,12 @@ static void flush_descriptor(journal_t *journal,
665 643
666 header = (jbd2_journal_revoke_header_t *)descriptor->b_data; 644 header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
667 header->r_count = cpu_to_be32(offset); 645 header->r_count = cpu_to_be32(offset);
668 jbd2_revoke_csum_set(journal, descriptor); 646 jbd2_descriptor_block_csum_set(journal, descriptor);
669 647
670 set_buffer_jwrite(descriptor); 648 set_buffer_jwrite(descriptor);
671 BUFFER_TRACE(descriptor, "write"); 649 BUFFER_TRACE(descriptor, "write");
672 set_buffer_dirty(descriptor); 650 set_buffer_dirty(descriptor);
673 write_dirty_buffer(descriptor, write_op); 651 write_dirty_buffer(descriptor, WRITE_SYNC);
674} 652}
675#endif 653#endif
676 654
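
With the journal pointer taken from transaction->t_journal and the write operation fixed to WRITE_SYNC inside flush_descriptor(), the revoke-writing entry point shrinks to two arguments, as the commit.c hunk earlier in this diff already showed. A tiny sketch of the call site, with plugging kept to mirror the commit path:

#include <linux/jbd2.h>
#include <linux/blkdev.h>

/* Sketch only: the commit path now just hands over the transaction and
 * the list that collects descriptor buffers for the later wait. */
static void example_write_revokes(transaction_t *commit_transaction,
				  struct list_head *log_bufs)
{
	struct blk_plug plug;

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(commit_transaction, log_bufs);
	blk_finish_plug(&plug);
}
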
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 081dff087fc0..01e4652d88f6 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -966,14 +966,8 @@ repeat:
966 if (!frozen_buffer) { 966 if (!frozen_buffer) {
967 JBUFFER_TRACE(jh, "allocate memory for buffer"); 967 JBUFFER_TRACE(jh, "allocate memory for buffer");
968 jbd_unlock_bh_state(bh); 968 jbd_unlock_bh_state(bh);
969 frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); 969 frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
970 if (!frozen_buffer) { 970 GFP_NOFS | __GFP_NOFAIL);
971 printk(KERN_ERR "%s: OOM for frozen_buffer\n",
972 __func__);
973 JBUFFER_TRACE(jh, "oom!");
974 error = -ENOMEM;
975 goto out;
976 }
977 goto repeat; 971 goto repeat;
978 } 972 }
979 jh->b_frozen_data = frozen_buffer; 973 jh->b_frozen_data = frozen_buffer;
@@ -1226,15 +1220,9 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
1226 goto out; 1220 goto out;
1227 1221
1228repeat: 1222repeat:
1229 if (!jh->b_committed_data) { 1223 if (!jh->b_committed_data)
1230 committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); 1224 committed_data = jbd2_alloc(jh2bh(jh)->b_size,
1231 if (!committed_data) { 1225 GFP_NOFS|__GFP_NOFAIL);
1232 printk(KERN_ERR "%s: No memory for committed data\n",
1233 __func__);
1234 err = -ENOMEM;
1235 goto out;
1236 }
1237 }
1238 1226
1239 jbd_lock_bh_state(bh); 1227 jbd_lock_bh_state(bh);
1240 if (!jh->b_committed_data) { 1228 if (!jh->b_committed_data) {
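
Both allocation sites above switch to GFP_NOFS | __GFP_NOFAIL, so jbd2_alloc() retries internally instead of returning NULL and the -ENOMEM unwinding disappears from the handle code. A hedged sketch of the resulting idiom; the real callers re-take the buffer state lock and re-check for a racing allocation before using the memory.

#include <linux/jbd2.h>

/* Sketch only: with __GFP_NOFAIL the allocation cannot fail, so there
 * is no error path; locking and the re-check against a concurrent
 * allocator are elided here. */
static void example_alloc_frozen_data(struct journal_head *jh)
{
	char *frozen_buffer;

	frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
				   GFP_NOFS | __GFP_NOFAIL);
	jh->b_frozen_data = frozen_buffer;
}
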
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 187477ded6b3..eccda3a02de6 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -1,858 +1,433 @@
1/* 1#include <linux/spinlock.h>
2 * linux/fs/mbcache.c
3 * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
4 */
5
6/*
7 * Filesystem Meta Information Block Cache (mbcache)
8 *
9 * The mbcache caches blocks of block devices that need to be located
10 * by their device/block number, as well as by other criteria (such
11 * as the block's contents).
12 *
13 * There can only be one cache entry in a cache per device and block number.
14 * Additional indexes need not be unique in this sense. The number of
15 * additional indexes (=other criteria) can be hardwired at compile time
16 * or specified at cache create time.
17 *
18 * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
19 * in the cache. A valid entry is in the main hash tables of the cache,
20 * and may also be in the lru list. An invalid entry is not in any hashes
21 * or lists.
22 *
23 * A valid cache entry is only in the lru list if no handles refer to it.
24 * Invalid cache entries will be freed when the last handle to the cache
25 * entry is released. Entries that cannot be freed immediately are put
26 * back on the lru list.
27 */
28
29/*
30 * Lock descriptions and usage:
31 *
32 * Each hash chain of both the block and index hash tables now contains
33 * a built-in lock used to serialize accesses to the hash chain.
34 *
35 * Accesses to global data structures mb_cache_list and mb_cache_lru_list
36 * are serialized via the global spinlock mb_cache_spinlock.
37 *
38 * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
39 * accesses to its local data, such as e_used and e_queued.
40 *
41 * Lock ordering:
42 *
43 * Each block hash chain's lock has the highest lock order, followed by an
44 * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
45 * lock), and mb_cache_spinlock, with the lowest order. While holding
46 * either a block or index hash chain lock, a thread can acquire an
47 * mb_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
48 *
49 * Synchronization:
50 *
51 * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
52 * index hash chain, it needs to lock the corresponding hash chain. For each
53 * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to
54 * prevent either any simultaneous release or free on the entry and also
55 * to serialize accesses to either the e_used or e_queued member of the entry.
56 *
57 * To avoid having a dangling reference to an already freed
58 * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
59 * block hash chain and also no longer being referenced, both e_used,
60 * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is
61 * first removed from a block hash chain.
62 */
63
64#include <linux/kernel.h>
65#include <linux/module.h>
66
67#include <linux/hash.h>
68#include <linux/fs.h>
69#include <linux/mm.h>
70#include <linux/slab.h> 2#include <linux/slab.h>
71#include <linux/sched.h> 3#include <linux/list.h>
72#include <linux/list_bl.h> 4#include <linux/list_bl.h>
5#include <linux/module.h>
6#include <linux/sched.h>
7#include <linux/workqueue.h>
73#include <linux/mbcache.h> 8#include <linux/mbcache.h>
74#include <linux/init.h>
75#include <linux/blockgroup_lock.h>
76#include <linux/log2.h>
77
78#ifdef MB_CACHE_DEBUG
79# define mb_debug(f...) do { \
80 printk(KERN_DEBUG f); \
81 printk("\n"); \
82 } while (0)
83#define mb_assert(c) do { if (!(c)) \
84 printk(KERN_ERR "assertion " #c " failed\n"); \
85 } while(0)
86#else
87# define mb_debug(f...) do { } while(0)
88# define mb_assert(c) do { } while(0)
89#endif
90#define mb_error(f...) do { \
91 printk(KERN_ERR f); \
92 printk("\n"); \
93 } while(0)
94
95#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
96
97#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS)
98#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \
99 (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
100
101static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
102static struct blockgroup_lock *mb_cache_bg_lock;
103static struct kmem_cache *mb_cache_kmem_cache;
104
105MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
106MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
107MODULE_LICENSE("GPL");
108
109EXPORT_SYMBOL(mb_cache_create);
110EXPORT_SYMBOL(mb_cache_shrink);
111EXPORT_SYMBOL(mb_cache_destroy);
112EXPORT_SYMBOL(mb_cache_entry_alloc);
113EXPORT_SYMBOL(mb_cache_entry_insert);
114EXPORT_SYMBOL(mb_cache_entry_release);
115EXPORT_SYMBOL(mb_cache_entry_free);
116EXPORT_SYMBOL(mb_cache_entry_get);
117#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
118EXPORT_SYMBOL(mb_cache_entry_find_first);
119EXPORT_SYMBOL(mb_cache_entry_find_next);
120#endif
121 9
122/* 10/*
123 * Global data: list of all mbcache's, lru list, and a spinlock for 11 * Mbcache is a simple key-value store. Keys need not be unique, however
124 * accessing cache data structures on SMP machines. The lru list is 12 * key-value pairs are expected to be unique (we use this fact in
125 * global across all mbcaches. 13 * mb_cache_entry_delete_block()).
14 *
15 * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
16 * They use hash of a block contents as a key and block number as a value.
17 * That's why keys need not be unique (different xattr blocks may end up having
18 * the same hash). However block number always uniquely identifies a cache
19 * entry.
20 *
21 * We provide functions for creation and removal of entries, search by key,
22 * and a special "delete entry with given key-value pair" operation. Fixed
23 * size hash table is used for fast key lookups.
126 */ 24 */
127 25
128static LIST_HEAD(mb_cache_list); 26struct mb_cache {
129static LIST_HEAD(mb_cache_lru_list); 27 /* Hash table of entries */
130static DEFINE_SPINLOCK(mb_cache_spinlock); 28 struct hlist_bl_head *c_hash;
131 29 /* log2 of hash table size */
132static inline void 30 int c_bucket_bits;
133__spin_lock_mb_cache_entry(struct mb_cache_entry *ce) 31 /* Maximum entries in cache to avoid degrading hash too much */
134{ 32 int c_max_entries;
135 spin_lock(bgl_lock_ptr(mb_cache_bg_lock, 33 /* Protects c_list, c_entry_count */
136 MB_CACHE_ENTRY_LOCK_INDEX(ce))); 34 spinlock_t c_list_lock;
137} 35 struct list_head c_list;
138 36 /* Number of entries in cache */
139static inline void 37 unsigned long c_entry_count;
140__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) 38 struct shrinker c_shrink;
141{ 39 /* Work for shrinking when the cache has too many entries */
142 spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, 40 struct work_struct c_shrink_work;
143 MB_CACHE_ENTRY_LOCK_INDEX(ce))); 41};
144}
145
146static inline int
147__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
148{
149 return !hlist_bl_unhashed(&ce->e_block_list);
150}
151 42
43static struct kmem_cache *mb_entry_cache;
152 44
153static inline void 45static unsigned long mb_cache_shrink(struct mb_cache *cache,
154__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) 46 unsigned int nr_to_scan);
155{
156 if (__mb_cache_entry_is_block_hashed(ce))
157 hlist_bl_del_init(&ce->e_block_list);
158}
159 47
160static inline int 48static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
161__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) 49 u32 key)
162{ 50{
163 return !hlist_bl_unhashed(&ce->e_index.o_list); 51 return &cache->c_hash[hash_32(key, cache->c_bucket_bits)];
164} 52}
165 53
166static inline void 54/*
167__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) 55 * Number of entries to reclaim synchronously when there are too many entries
168{ 56 * in cache
169 if (__mb_cache_entry_is_index_hashed(ce)) 57 */
170 hlist_bl_del_init(&ce->e_index.o_list); 58#define SYNC_SHRINK_BATCH 64
171}
172 59
173/* 60/*
174 * __mb_cache_entry_unhash_unlock() 61 * mb_cache_entry_create - create entry in cache
175 * 62 * @cache - cache where the entry should be created
176 * This function is called to unhash both the block and index hash 63 * @mask - gfp mask with which the entry should be allocated
177 * chain. 64 * @key - key of the entry
178 * It assumes both the block and index hash chain is locked upon entry. 65 * @block - block that contains data
179 * It also unlock both hash chains both exit 66 * @reusable - is the block reusable by other inodes?
67 *
68 * Creates entry in @cache with key @key and records that data is stored in
69 * block @block. The function returns -EBUSY if entry with the same key
70 * and for the same block already exists in cache. Otherwise 0 is returned.
180 */ 71 */
181static inline void 72int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
182__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) 73 sector_t block, bool reusable)
183{ 74{
184 __mb_cache_entry_unhash_index(ce); 75 struct mb_cache_entry *entry, *dup;
185 hlist_bl_unlock(ce->e_index_hash_p); 76 struct hlist_bl_node *dup_node;
186 __mb_cache_entry_unhash_block(ce); 77 struct hlist_bl_head *head;
187 hlist_bl_unlock(ce->e_block_hash_p); 78
79 /* Schedule background reclaim if there are too many entries */
80 if (cache->c_entry_count >= cache->c_max_entries)
81 schedule_work(&cache->c_shrink_work);
82 /* Do some sync reclaim if background reclaim cannot keep up */
83 if (cache->c_entry_count >= 2*cache->c_max_entries)
84 mb_cache_shrink(cache, SYNC_SHRINK_BATCH);
85
86 entry = kmem_cache_alloc(mb_entry_cache, mask);
87 if (!entry)
88 return -ENOMEM;
89
90 INIT_LIST_HEAD(&entry->e_list);
91 /* One ref for hash, one ref returned */
92 atomic_set(&entry->e_refcnt, 1);
93 entry->e_key = key;
94 entry->e_block = block;
95 entry->e_reusable = reusable;
96 head = mb_cache_entry_head(cache, key);
97 hlist_bl_lock(head);
98 hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
99 if (dup->e_key == key && dup->e_block == block) {
100 hlist_bl_unlock(head);
101 kmem_cache_free(mb_entry_cache, entry);
102 return -EBUSY;
103 }
104 }
105 hlist_bl_add_head(&entry->e_hash_list, head);
106 hlist_bl_unlock(head);
107
108 spin_lock(&cache->c_list_lock);
109 list_add_tail(&entry->e_list, &cache->c_list);
110 /* Grab ref for LRU list */
111 atomic_inc(&entry->e_refcnt);
112 cache->c_entry_count++;
113 spin_unlock(&cache->c_list_lock);
114
115 return 0;
188} 116}
117EXPORT_SYMBOL(mb_cache_entry_create);
189 118
190static void 119void __mb_cache_entry_free(struct mb_cache_entry *entry)
191__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
192{ 120{
193 struct mb_cache *cache = ce->e_cache; 121 kmem_cache_free(mb_entry_cache, entry);
194
195 mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
196 kmem_cache_free(cache->c_entry_cache, ce);
197 atomic_dec(&cache->c_entry_count);
198} 122}
123EXPORT_SYMBOL(__mb_cache_entry_free);
199 124
200static void 125static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
201__mb_cache_entry_release(struct mb_cache_entry *ce) 126 struct mb_cache_entry *entry,
127 u32 key)
202{ 128{
203 /* First lock the entry to serialize access to its local data. */ 129 struct mb_cache_entry *old_entry = entry;
204 __spin_lock_mb_cache_entry(ce); 130 struct hlist_bl_node *node;
205 /* Wake up all processes queuing for this cache entry. */ 131 struct hlist_bl_head *head;
206 if (ce->e_queued) 132
207 wake_up_all(&mb_cache_queue); 133 head = mb_cache_entry_head(cache, key);
208 if (ce->e_used >= MB_CACHE_WRITER) 134 hlist_bl_lock(head);
209 ce->e_used -= MB_CACHE_WRITER; 135 if (entry && !hlist_bl_unhashed(&entry->e_hash_list))
210 /* 136 node = entry->e_hash_list.next;
211 * Make sure that all cache entries on lru_list have 137 else
212 * both e_used and e_qued of 0s. 138 node = hlist_bl_first(head);
213 */ 139 while (node) {
214 ce->e_used--; 140 entry = hlist_bl_entry(node, struct mb_cache_entry,
215 if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { 141 e_hash_list);
216 if (!__mb_cache_entry_is_block_hashed(ce)) { 142 if (entry->e_key == key && entry->e_reusable) {
217 __spin_unlock_mb_cache_entry(ce); 143 atomic_inc(&entry->e_refcnt);
218 goto forget; 144 goto out;
219 } 145 }
220 /* 146 node = node->next;
221 * Need access to lru list, first drop entry lock,
222 * then reacquire the lock in the proper order.
223 */
224 spin_lock(&mb_cache_spinlock);
225 if (list_empty(&ce->e_lru_list))
226 list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
227 spin_unlock(&mb_cache_spinlock);
228 } 147 }
229 __spin_unlock_mb_cache_entry(ce); 148 entry = NULL;
230 return; 149out:
231forget: 150 hlist_bl_unlock(head);
232 mb_assert(list_empty(&ce->e_lru_list)); 151 if (old_entry)
233 __mb_cache_entry_forget(ce, GFP_KERNEL); 152 mb_cache_entry_put(cache, old_entry);
153
154 return entry;
234} 155}
235 156
236/* 157/*
237 * mb_cache_shrink_scan() memory pressure callback 158 * mb_cache_entry_find_first - find the first entry in cache with given key
238 * 159 * @cache: cache where we should search
239 * This function is called by the kernel memory management when memory 160 * @key: key to look for
240 * gets low.
241 * 161 *
242 * @shrink: (ignored) 162 * Search in @cache for entry with key @key. Grabs reference to the first
243 * @sc: shrink_control passed from reclaim 163 * entry found and returns the entry.
244 *
245 * Returns the number of objects freed.
246 */ 164 */
247static unsigned long 165struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
248mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) 166 u32 key)
249{ 167{
250 LIST_HEAD(free_list); 168 return __entry_find(cache, NULL, key);
251 struct mb_cache_entry *entry, *tmp;
252 int nr_to_scan = sc->nr_to_scan;
253 gfp_t gfp_mask = sc->gfp_mask;
254 unsigned long freed = 0;
255
256 mb_debug("trying to free %d entries", nr_to_scan);
257 spin_lock(&mb_cache_spinlock);
258 while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
259 struct mb_cache_entry *ce =
260 list_entry(mb_cache_lru_list.next,
261 struct mb_cache_entry, e_lru_list);
262 list_del_init(&ce->e_lru_list);
263 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
264 continue;
265 spin_unlock(&mb_cache_spinlock);
266 /* Prevent any find or get operation on the entry */
267 hlist_bl_lock(ce->e_block_hash_p);
268 hlist_bl_lock(ce->e_index_hash_p);
269 /* Ignore if it is touched by a find/get */
270 if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
271 !list_empty(&ce->e_lru_list)) {
272 hlist_bl_unlock(ce->e_index_hash_p);
273 hlist_bl_unlock(ce->e_block_hash_p);
274 spin_lock(&mb_cache_spinlock);
275 continue;
276 }
277 __mb_cache_entry_unhash_unlock(ce);
278 list_add_tail(&ce->e_lru_list, &free_list);
279 spin_lock(&mb_cache_spinlock);
280 }
281 spin_unlock(&mb_cache_spinlock);
282
283 list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
284 __mb_cache_entry_forget(entry, gfp_mask);
285 freed++;
286 }
287 return freed;
288} 169}
170EXPORT_SYMBOL(mb_cache_entry_find_first);
289 171
290static unsigned long 172/*
291mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) 173 * mb_cache_entry_find_next - find next entry in cache with the same
174 * @cache: cache where we should search
175 * @entry: entry to start search from
176 *
177 * Finds next entry in the hash chain which has the same key as @entry.
178 * If @entry is unhashed (which can happen when deletion of entry races
179 * with the search), finds the first entry in the hash chain. The function
180 * drops reference to @entry and returns with a reference to the found entry.
181 */
182struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
183 struct mb_cache_entry *entry)
292{ 184{
293 struct mb_cache *cache; 185 return __entry_find(cache, entry, entry->e_key);
294 unsigned long count = 0;
295
296 spin_lock(&mb_cache_spinlock);
297 list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
298 mb_debug("cache %s (%d)", cache->c_name,
299 atomic_read(&cache->c_entry_count));
300 count += atomic_read(&cache->c_entry_count);
301 }
302 spin_unlock(&mb_cache_spinlock);
303
304 return vfs_pressure_ratio(count);
305} 186}
306 187EXPORT_SYMBOL(mb_cache_entry_find_next);
307static struct shrinker mb_cache_shrinker = {
308 .count_objects = mb_cache_shrink_count,
309 .scan_objects = mb_cache_shrink_scan,
310 .seeks = DEFAULT_SEEKS,
311};
312 188
313/* 189/*
314 * mb_cache_create() create a new cache 190 * mb_cache_entry_get - get a cache entry by block number (and key)
315 * 191 * @cache - cache we work with
316 * All entries in one cache are equal size. Cache entries may be from 192 * @key - key of block number @block
317 * multiple devices. If this is the first mbcache created, registers 193 * @block - block number
318 * the cache with kernel memory management. Returns NULL if no more
319 * memory was available.
320 *
321 * @name: name of the cache (informal)
322 * @bucket_bits: log2(number of hash buckets)
323 */ 194 */
324struct mb_cache * 195struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
325mb_cache_create(const char *name, int bucket_bits) 196 sector_t block)
326{ 197{
327 int n, bucket_count = 1 << bucket_bits; 198 struct hlist_bl_node *node;
328 struct mb_cache *cache = NULL; 199 struct hlist_bl_head *head;
329 200 struct mb_cache_entry *entry;
330 if (!mb_cache_bg_lock) { 201
331 mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), 202 head = mb_cache_entry_head(cache, key);
332 GFP_KERNEL); 203 hlist_bl_lock(head);
333 if (!mb_cache_bg_lock) 204 hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
334 return NULL; 205 if (entry->e_key == key && entry->e_block == block) {
335 bgl_lock_init(mb_cache_bg_lock); 206 atomic_inc(&entry->e_refcnt);
336 } 207 goto out;
337 208 }
338 cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
339 if (!cache)
340 return NULL;
341 cache->c_name = name;
342 atomic_set(&cache->c_entry_count, 0);
343 cache->c_bucket_bits = bucket_bits;
344 cache->c_block_hash = kmalloc(bucket_count *
345 sizeof(struct hlist_bl_head), GFP_KERNEL);
346 if (!cache->c_block_hash)
347 goto fail;
348 for (n=0; n<bucket_count; n++)
349 INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
350 cache->c_index_hash = kmalloc(bucket_count *
351 sizeof(struct hlist_bl_head), GFP_KERNEL);
352 if (!cache->c_index_hash)
353 goto fail;
354 for (n=0; n<bucket_count; n++)
355 INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
356 if (!mb_cache_kmem_cache) {
357 mb_cache_kmem_cache = kmem_cache_create(name,
358 sizeof(struct mb_cache_entry), 0,
359 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
360 if (!mb_cache_kmem_cache)
361 goto fail2;
362 } 209 }
363 cache->c_entry_cache = mb_cache_kmem_cache; 210 entry = NULL;
364 211out:
365 /* 212 hlist_bl_unlock(head);
366 * Set an upper limit on the number of cache entries so that the hash 213 return entry;
367 * chains won't grow too long.
368 */
369 cache->c_max_entries = bucket_count << 4;
370
371 spin_lock(&mb_cache_spinlock);
372 list_add(&cache->c_cache_list, &mb_cache_list);
373 spin_unlock(&mb_cache_spinlock);
374 return cache;
375
376fail2:
377 kfree(cache->c_index_hash);
378
379fail:
380 kfree(cache->c_block_hash);
381 kfree(cache);
382 return NULL;
383} 214}
215EXPORT_SYMBOL(mb_cache_entry_get);
384 216
385 217/* mb_cache_entry_delete_block - remove information about block from cache
386/* 218 * @cache - cache we work with
387 * mb_cache_shrink() 219 * @key - key of block @block
388 * 220 * @block - block number
389 * Removes all cache entries of a device from the cache. All cache entries
390 * currently in use cannot be freed, and thus remain in the cache. All others
391 * are freed.
392 * 221 *
393 * @bdev: which device's cache entries to shrink 222 * Remove entry from cache @cache with key @key with data stored in @block.
394 */ 223 */
395void 224void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
396mb_cache_shrink(struct block_device *bdev) 225 sector_t block)
397{ 226{
398 LIST_HEAD(free_list); 227 struct hlist_bl_node *node;
399 struct list_head *l; 228 struct hlist_bl_head *head;
400 struct mb_cache_entry *ce, *tmp; 229 struct mb_cache_entry *entry;
401 230
402 l = &mb_cache_lru_list; 231 head = mb_cache_entry_head(cache, key);
403 spin_lock(&mb_cache_spinlock); 232 hlist_bl_lock(head);
404 while (!list_is_last(l, &mb_cache_lru_list)) { 233 hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
405 l = l->next; 234 if (entry->e_key == key && entry->e_block == block) {
406 ce = list_entry(l, struct mb_cache_entry, e_lru_list); 235 /* We keep hash list reference to keep entry alive */
407 if (ce->e_bdev == bdev) { 236 hlist_bl_del_init(&entry->e_hash_list);
408 list_del_init(&ce->e_lru_list); 237 hlist_bl_unlock(head);
409 if (ce->e_used || ce->e_queued || 238 spin_lock(&cache->c_list_lock);
410 atomic_read(&ce->e_refcnt)) 239 if (!list_empty(&entry->e_list)) {
411 continue; 240 list_del_init(&entry->e_list);
412 spin_unlock(&mb_cache_spinlock); 241 cache->c_entry_count--;
413 /* 242 atomic_dec(&entry->e_refcnt);
414 * Prevent any find or get operation on the entry.
415 */
416 hlist_bl_lock(ce->e_block_hash_p);
417 hlist_bl_lock(ce->e_index_hash_p);
418 /* Ignore if it is touched by a find/get */
419 if (ce->e_used || ce->e_queued ||
420 atomic_read(&ce->e_refcnt) ||
421 !list_empty(&ce->e_lru_list)) {
422 hlist_bl_unlock(ce->e_index_hash_p);
423 hlist_bl_unlock(ce->e_block_hash_p);
424 l = &mb_cache_lru_list;
425 spin_lock(&mb_cache_spinlock);
426 continue;
427 } 243 }
428 __mb_cache_entry_unhash_unlock(ce); 244 spin_unlock(&cache->c_list_lock);
429 mb_assert(!(ce->e_used || ce->e_queued || 245 mb_cache_entry_put(cache, entry);
430 atomic_read(&ce->e_refcnt))); 246 return;
431 list_add_tail(&ce->e_lru_list, &free_list);
432 l = &mb_cache_lru_list;
433 spin_lock(&mb_cache_spinlock);
434 } 247 }
435 } 248 }
436 spin_unlock(&mb_cache_spinlock); 249 hlist_bl_unlock(head);
437
438 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
439 __mb_cache_entry_forget(ce, GFP_KERNEL);
440 }
441} 250}
251EXPORT_SYMBOL(mb_cache_entry_delete_block);
442 252
443 253/* mb_cache_entry_touch - cache entry got used
444/* 254 * @cache - cache the entry belongs to
445 * mb_cache_destroy() 255 * @entry - entry that got used
446 * 256 *
447 * Shrinks the cache to its minimum possible size (hopefully 0 entries), 257 * Marks entry as used to give it higher chances of surviving in cache.
448 * and then destroys it. If this was the last mbcache, un-registers the
449 * mbcache from kernel memory management.
450 */ 258 */
451void 259void mb_cache_entry_touch(struct mb_cache *cache,
452mb_cache_destroy(struct mb_cache *cache) 260 struct mb_cache_entry *entry)
453{ 261{
454 LIST_HEAD(free_list); 262 entry->e_referenced = 1;
455 struct mb_cache_entry *ce, *tmp;
456
457 spin_lock(&mb_cache_spinlock);
458 list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
459 if (ce->e_cache == cache)
460 list_move_tail(&ce->e_lru_list, &free_list);
461 }
462 list_del(&cache->c_cache_list);
463 spin_unlock(&mb_cache_spinlock);
464
465 list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
466 list_del_init(&ce->e_lru_list);
467 /*
468 * Prevent any find or get operation on the entry.
469 */
470 hlist_bl_lock(ce->e_block_hash_p);
471 hlist_bl_lock(ce->e_index_hash_p);
472 mb_assert(!(ce->e_used || ce->e_queued ||
473 atomic_read(&ce->e_refcnt)));
474 __mb_cache_entry_unhash_unlock(ce);
475 __mb_cache_entry_forget(ce, GFP_KERNEL);
476 }
477
478 if (atomic_read(&cache->c_entry_count) > 0) {
479 mb_error("cache %s: %d orphaned entries",
480 cache->c_name,
481 atomic_read(&cache->c_entry_count));
482 }
483
484 if (list_empty(&mb_cache_list)) {
485 kmem_cache_destroy(mb_cache_kmem_cache);
486 mb_cache_kmem_cache = NULL;
487 }
488 kfree(cache->c_index_hash);
489 kfree(cache->c_block_hash);
490 kfree(cache);
491} 263}
264EXPORT_SYMBOL(mb_cache_entry_touch);
492 265
493/* 266static unsigned long mb_cache_count(struct shrinker *shrink,
494 * mb_cache_entry_alloc() 267 struct shrink_control *sc)
495 *
496 * Allocates a new cache entry. The new entry will not be valid initially,
497 * and thus cannot be looked up yet. It should be filled with data, and
498 * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
499 * if no more memory was available.
500 */
501struct mb_cache_entry *
502mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
503{ 268{
504 struct mb_cache_entry *ce; 269 struct mb_cache *cache = container_of(shrink, struct mb_cache,
505 270 c_shrink);
506 if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
507 struct list_head *l;
508
509 l = &mb_cache_lru_list;
510 spin_lock(&mb_cache_spinlock);
511 while (!list_is_last(l, &mb_cache_lru_list)) {
512 l = l->next;
513 ce = list_entry(l, struct mb_cache_entry, e_lru_list);
514 if (ce->e_cache == cache) {
515 list_del_init(&ce->e_lru_list);
516 if (ce->e_used || ce->e_queued ||
517 atomic_read(&ce->e_refcnt))
518 continue;
519 spin_unlock(&mb_cache_spinlock);
520 /*
521 * Prevent any find or get operation on the
522 * entry.
523 */
524 hlist_bl_lock(ce->e_block_hash_p);
525 hlist_bl_lock(ce->e_index_hash_p);
526 /* Ignore if it is touched by a find/get */
527 if (ce->e_used || ce->e_queued ||
528 atomic_read(&ce->e_refcnt) ||
529 !list_empty(&ce->e_lru_list)) {
530 hlist_bl_unlock(ce->e_index_hash_p);
531 hlist_bl_unlock(ce->e_block_hash_p);
532 l = &mb_cache_lru_list;
533 spin_lock(&mb_cache_spinlock);
534 continue;
535 }
536 mb_assert(list_empty(&ce->e_lru_list));
537 mb_assert(!(ce->e_used || ce->e_queued ||
538 atomic_read(&ce->e_refcnt)));
539 __mb_cache_entry_unhash_unlock(ce);
540 goto found;
541 }
542 }
543 spin_unlock(&mb_cache_spinlock);
544 }
545 271
546 ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); 272 return cache->c_entry_count;
547 if (!ce)
548 return NULL;
549 atomic_inc(&cache->c_entry_count);
550 INIT_LIST_HEAD(&ce->e_lru_list);
551 INIT_HLIST_BL_NODE(&ce->e_block_list);
552 INIT_HLIST_BL_NODE(&ce->e_index.o_list);
553 ce->e_cache = cache;
554 ce->e_queued = 0;
555 atomic_set(&ce->e_refcnt, 0);
556found:
557 ce->e_block_hash_p = &cache->c_block_hash[0];
558 ce->e_index_hash_p = &cache->c_index_hash[0];
559 ce->e_used = 1 + MB_CACHE_WRITER;
560 return ce;
561} 273}
562 274
563 275/* Shrink number of entries in cache */
564/* 276static unsigned long mb_cache_shrink(struct mb_cache *cache,
565 * mb_cache_entry_insert() 277 unsigned int nr_to_scan)
566 *
567 * Inserts an entry that was allocated using mb_cache_entry_alloc() into
568 * the cache. After this, the cache entry can be looked up, but is not yet
569 * in the lru list as the caller still holds a handle to it. Returns 0 on
570 * success, or -EBUSY if a cache entry for that device + inode exists
571 * already (this may happen after a failed lookup, but when another process
572 * has inserted the same cache entry in the meantime).
573 *
574 * @bdev: device the cache entry belongs to
575 * @block: block number
576 * @key: lookup key
577 */
578int
579mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
580 sector_t block, unsigned int key)
581{ 278{
582 struct mb_cache *cache = ce->e_cache; 279 struct mb_cache_entry *entry;
583 unsigned int bucket; 280 struct hlist_bl_head *head;
584 struct hlist_bl_node *l; 281 unsigned int shrunk = 0;
585 struct hlist_bl_head *block_hash_p; 282
586 struct hlist_bl_head *index_hash_p; 283 spin_lock(&cache->c_list_lock);
587 struct mb_cache_entry *lce; 284 while (nr_to_scan-- && !list_empty(&cache->c_list)) {
588 285 entry = list_first_entry(&cache->c_list,
589 mb_assert(ce); 286 struct mb_cache_entry, e_list);
590 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), 287 if (entry->e_referenced) {
591 cache->c_bucket_bits); 288 entry->e_referenced = 0;
592 block_hash_p = &cache->c_block_hash[bucket]; 289 list_move_tail(&entry->e_list, &cache->c_list);
593 hlist_bl_lock(block_hash_p); 290 continue;
594 hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
595 if (lce->e_bdev == bdev && lce->e_block == block) {
596 hlist_bl_unlock(block_hash_p);
597 return -EBUSY;
598 } 291 }
292 list_del_init(&entry->e_list);
293 cache->c_entry_count--;
294 /*
295 * We keep LRU list reference so that entry doesn't go away
296 * from under us.
297 */
298 spin_unlock(&cache->c_list_lock);
299 head = mb_cache_entry_head(cache, entry->e_key);
300 hlist_bl_lock(head);
301 if (!hlist_bl_unhashed(&entry->e_hash_list)) {
302 hlist_bl_del_init(&entry->e_hash_list);
303 atomic_dec(&entry->e_refcnt);
304 }
305 hlist_bl_unlock(head);
306 if (mb_cache_entry_put(cache, entry))
307 shrunk++;
308 cond_resched();
309 spin_lock(&cache->c_list_lock);
599 } 310 }
600 mb_assert(!__mb_cache_entry_is_block_hashed(ce)); 311 spin_unlock(&cache->c_list_lock);
601 __mb_cache_entry_unhash_block(ce);
602 __mb_cache_entry_unhash_index(ce);
603 ce->e_bdev = bdev;
604 ce->e_block = block;
605 ce->e_block_hash_p = block_hash_p;
606 ce->e_index.o_key = key;
607 hlist_bl_add_head(&ce->e_block_list, block_hash_p);
608 hlist_bl_unlock(block_hash_p);
609 bucket = hash_long(key, cache->c_bucket_bits);
610 index_hash_p = &cache->c_index_hash[bucket];
611 hlist_bl_lock(index_hash_p);
612 ce->e_index_hash_p = index_hash_p;
613 hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
614 hlist_bl_unlock(index_hash_p);
615 return 0;
616}
617 312
313 return shrunk;
314}
618 315
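The old mb_cache_entry_insert() path above (allocate, check for duplicates, hash into two tables) is what the single mb_cache_entry_create() call from the rewritten include/linux/mbcache.h replaces. A hedged sketch of the new caller side, assuming the duplicate case keeps the old -EBUSY convention; fs_mb_cache, bh and hash are again placeholders:

/* New-style insertion: one call; the cache owns the entry afterwards. */
int err;

err = mb_cache_entry_create(fs_mb_cache, GFP_NOFS, hash,
			    bh->b_blocknr, true /* reusable */);
if (err == -EBUSY)
	err = 0;	/* another task already cached this block */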
619/* 316static unsigned long mb_cache_scan(struct shrinker *shrink,
620 * mb_cache_entry_release() 317 struct shrink_control *sc)
621 *
622 * Release a handle to a cache entry. When the last handle to a cache entry
623 * is released it is either freed (if it is invalid) or otherwise inserted
624 * in to the lru list.
625 */
626void
627mb_cache_entry_release(struct mb_cache_entry *ce)
628{ 318{
629 __mb_cache_entry_release(ce); 319 int nr_to_scan = sc->nr_to_scan;
320 struct mb_cache *cache = container_of(shrink, struct mb_cache,
321 c_shrink);
322 return mb_cache_shrink(cache, nr_to_scan);
630} 323}
631 324
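mb_cache_scan() on the right is the other half of the shrinker pair, and it relies on plain reference counting instead of the e_used/e_queued wait queues being deleted on the left. The lookup side that takes those references is not in the quoted hunks; a sketch consistent with the new entry fields (e_key, e_block, e_hash_list, e_refcnt) and the mb_cache_entry_head() helper used above would be:

/* Refcounted lookup by (key, block); illustrative, not the verbatim body. */
struct hlist_bl_head *head = mb_cache_entry_head(cache, key);
struct mb_cache_entry *entry;
struct hlist_bl_node *node;

hlist_bl_lock(head);
hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
	if (entry->e_key == key && entry->e_block == block) {
		atomic_inc(&entry->e_refcnt);	/* pin against freeing */
		goto out;
	}
}
entry = NULL;
out:
	hlist_bl_unlock(head);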
325/* We shrink 1/X of the cache when we have too many entries in it */
326#define SHRINK_DIVISOR 16
632 327
633/* 328static void mb_cache_shrink_worker(struct work_struct *work)
634 * mb_cache_entry_free()
635 *
636 */
637void
638mb_cache_entry_free(struct mb_cache_entry *ce)
639{ 329{
640 mb_assert(ce); 330 struct mb_cache *cache = container_of(work, struct mb_cache,
641 mb_assert(list_empty(&ce->e_lru_list)); 331 c_shrink_work);
642 hlist_bl_lock(ce->e_index_hash_p); 332 mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR);
643 __mb_cache_entry_unhash_index(ce);
644 hlist_bl_unlock(ce->e_index_hash_p);
645 hlist_bl_lock(ce->e_block_hash_p);
646 __mb_cache_entry_unhash_block(ce);
647 hlist_bl_unlock(ce->e_block_hash_p);
648 __mb_cache_entry_release(ce);
649} 333}
650 334
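mb_cache_shrink_worker() gives the cache a second, asynchronous reclaim path besides the memory-pressure shrinker. The call site that queues it is outside the quoted hunks; presumably the creation path kicks it once the entry count passes the soft limit set up in mb_cache_create() below, along these lines (a sketch, not quoted code):

/* Kick background reclaim of c_max_entries / SHRINK_DIVISOR entries
 * once the cache grows past its soft limit. */
if (cache->c_entry_count >= cache->c_max_entries)
	schedule_work(&cache->c_shrink_work);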
651
652/* 335/*
653 * mb_cache_entry_get() 336 * mb_cache_create - create cache
337 * @bucket_bits: log2 of the hash table size
654 * 338 *
655 * Get a cache entry by device / block number. (There can only be one entry 339 * Create cache for keys with 2^bucket_bits hash entries.
656 * in the cache per device and block.) Returns NULL if no such cache entry
657 * exists. The returned cache entry is locked for exclusive access ("single
658 * writer").
659 */ 340 */
660struct mb_cache_entry * 341struct mb_cache *mb_cache_create(int bucket_bits)
661mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
662 sector_t block)
663{ 342{
664 unsigned int bucket; 343 struct mb_cache *cache;
665 struct hlist_bl_node *l; 344 int bucket_count = 1 << bucket_bits;
666 struct mb_cache_entry *ce; 345 int i;
667 struct hlist_bl_head *block_hash_p;
668
669 bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
670 cache->c_bucket_bits);
671 block_hash_p = &cache->c_block_hash[bucket];
672 /* First serialize access to the hash chain corresponding to the block. */
673 hlist_bl_lock(block_hash_p);
674 hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
675 mb_assert(ce->e_block_hash_p == block_hash_p);
676 if (ce->e_bdev == bdev && ce->e_block == block) {
677 /*
678 * Prevent a free from removing the entry.
679 */
680 atomic_inc(&ce->e_refcnt);
681 hlist_bl_unlock(block_hash_p);
682 __spin_lock_mb_cache_entry(ce);
683 atomic_dec(&ce->e_refcnt);
684 if (ce->e_used > 0) {
685 DEFINE_WAIT(wait);
686 while (ce->e_used > 0) {
687 ce->e_queued++;
688 prepare_to_wait(&mb_cache_queue, &wait,
689 TASK_UNINTERRUPTIBLE);
690 __spin_unlock_mb_cache_entry(ce);
691 schedule();
692 __spin_lock_mb_cache_entry(ce);
693 ce->e_queued--;
694 }
695 finish_wait(&mb_cache_queue, &wait);
696 }
697 ce->e_used += 1 + MB_CACHE_WRITER;
698 __spin_unlock_mb_cache_entry(ce);
699 346
700 if (!list_empty(&ce->e_lru_list)) { 347 if (!try_module_get(THIS_MODULE))
701 spin_lock(&mb_cache_spinlock); 348 return NULL;
702 list_del_init(&ce->e_lru_list); 349
703 spin_unlock(&mb_cache_spinlock); 350 cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
704 } 351 if (!cache)
705 if (!__mb_cache_entry_is_block_hashed(ce)) { 352 goto err_out;
706 __mb_cache_entry_release(ce); 353 cache->c_bucket_bits = bucket_bits;
707 return NULL; 354 cache->c_max_entries = bucket_count << 4;
708 } 355 INIT_LIST_HEAD(&cache->c_list);
709 return ce; 356 spin_lock_init(&cache->c_list_lock);
710 } 357 cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head),
358 GFP_KERNEL);
359 if (!cache->c_hash) {
360 kfree(cache);
361 goto err_out;
711 } 362 }
712 hlist_bl_unlock(block_hash_p); 363 for (i = 0; i < bucket_count; i++)
713 return NULL; 364 INIT_HLIST_BL_HEAD(&cache->c_hash[i]);
714}
715 365
716#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) 366 cache->c_shrink.count_objects = mb_cache_count;
367 cache->c_shrink.scan_objects = mb_cache_scan;
368 cache->c_shrink.seeks = DEFAULT_SEEKS;
369 register_shrinker(&cache->c_shrink);
717 370
718static struct mb_cache_entry * 371 INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker);
719__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
720 struct block_device *bdev, unsigned int key)
721{
722 372
723 /* The index hash chain is already acquired by the caller. */ 373 return cache;
724 while (l != NULL) { 374
725 struct mb_cache_entry *ce = 375err_out:
726 hlist_bl_entry(l, struct mb_cache_entry, 376 module_put(THIS_MODULE);
727 e_index.o_list);
728 mb_assert(ce->e_index_hash_p == head);
729 if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
730 /*
731 * Prevent a free from removing the entry.
732 */
733 atomic_inc(&ce->e_refcnt);
734 hlist_bl_unlock(head);
735 __spin_lock_mb_cache_entry(ce);
736 atomic_dec(&ce->e_refcnt);
737 ce->e_used++;
738 /* Incrementing before holding the lock gives readers
739 priority over writers. */
740 if (ce->e_used >= MB_CACHE_WRITER) {
741 DEFINE_WAIT(wait);
742
743 while (ce->e_used >= MB_CACHE_WRITER) {
744 ce->e_queued++;
745 prepare_to_wait(&mb_cache_queue, &wait,
746 TASK_UNINTERRUPTIBLE);
747 __spin_unlock_mb_cache_entry(ce);
748 schedule();
749 __spin_lock_mb_cache_entry(ce);
750 ce->e_queued--;
751 }
752 finish_wait(&mb_cache_queue, &wait);
753 }
754 __spin_unlock_mb_cache_entry(ce);
755 if (!list_empty(&ce->e_lru_list)) {
756 spin_lock(&mb_cache_spinlock);
757 list_del_init(&ce->e_lru_list);
758 spin_unlock(&mb_cache_spinlock);
759 }
760 if (!__mb_cache_entry_is_block_hashed(ce)) {
761 __mb_cache_entry_release(ce);
762 return ERR_PTR(-EAGAIN);
763 }
764 return ce;
765 }
766 l = l->next;
767 }
768 hlist_bl_unlock(head);
769 return NULL; 377 return NULL;
770} 378}
771 379EXPORT_SYMBOL(mb_cache_create);
772 380
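mb_cache_create() now takes only the hash size, so per-filesystem setup shrinks to a single call and teardown to mb_cache_destroy() just below. A hedged sketch of how a filesystem would wire this into mount and unmount; the 10-bit table size and the s_mb_cache field name are illustrative:

/* Mount time: one cache per super block for xattr block sharing. */
sbi->s_mb_cache = mb_cache_create(10);	/* 1024 hash chains */
if (!sbi->s_mb_cache)
	return -ENOMEM;

/* Unmount time: the shrinker is unregistered and all entries dropped. */
mb_cache_destroy(sbi->s_mb_cache);
sbi->s_mb_cache = NULL;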
773/* 381/*
774 * mb_cache_entry_find_first() 382 * mb_cache_destroy - destroy cache
775 * 383 * @cache: the cache to destroy
776 * Find the first cache entry on a given device with a certain key in
777 * an additional index. Additional matches can be found with
778 * mb_cache_entry_find_next(). Returns NULL if no match was found. The
779 * returned cache entry is locked for shared access ("multiple readers").
780 * 384 *
781 * @cache: the cache to search 385 * Free all entries in cache and cache itself. Caller must make sure nobody
782 * @bdev: the device the cache entry should belong to 386 * (except shrinker) can reach @cache when calling this.
783 * @key: the key in the index
784 */ 387 */
785struct mb_cache_entry * 388void mb_cache_destroy(struct mb_cache *cache)
786mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
787 unsigned int key)
788{ 389{
789 unsigned int bucket = hash_long(key, cache->c_bucket_bits); 390 struct mb_cache_entry *entry, *next;
790 struct hlist_bl_node *l;
791 struct mb_cache_entry *ce = NULL;
792 struct hlist_bl_head *index_hash_p;
793
794 index_hash_p = &cache->c_index_hash[bucket];
795 hlist_bl_lock(index_hash_p);
796 if (!hlist_bl_empty(index_hash_p)) {
797 l = hlist_bl_first(index_hash_p);
798 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
799 } else
800 hlist_bl_unlock(index_hash_p);
801 return ce;
802}
803 391
392 unregister_shrinker(&cache->c_shrink);
804 393
805/* 394 /*
806 * mb_cache_entry_find_next() 395 * We don't bother with any locking. Cache must not be used at this
807 * 396 * point.
808 * Find the next cache entry on a given device with a certain key in an 397 */
809 * additional index. Returns NULL if no match could be found. The previous 398 list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
810 * entry is atomatically released, so that mb_cache_entry_find_next() can 399 if (!hlist_bl_unhashed(&entry->e_hash_list)) {
811 * entry is automatically released, so that mb_cache_entry_find_next() can 400 hlist_bl_del_init(&entry->e_hash_list);
812 * 401 atomic_dec(&entry->e_refcnt);
813 * entry = mb_cache_entry_find_first(); 402 } else
814 * while (entry) { 403 WARN_ON(1);
815 * ... 404 list_del(&entry->e_list);
816 * entry = mb_cache_entry_find_next(entry, ...); 405 WARN_ON(atomic_read(&entry->e_refcnt) != 1);
817 * } 406 mb_cache_entry_put(cache, entry);
818 * 407 }
819 * @prev: The previous match 408 kfree(cache->c_hash);
820 * @bdev: the device the cache entry should belong to 409 kfree(cache);
821 * @key: the key in the index 410 module_put(THIS_MODULE);
822 */
823struct mb_cache_entry *
824mb_cache_entry_find_next(struct mb_cache_entry *prev,
825 struct block_device *bdev, unsigned int key)
826{
827 struct mb_cache *cache = prev->e_cache;
828 unsigned int bucket = hash_long(key, cache->c_bucket_bits);
829 struct hlist_bl_node *l;
830 struct mb_cache_entry *ce;
831 struct hlist_bl_head *index_hash_p;
832
833 index_hash_p = &cache->c_index_hash[bucket];
834 mb_assert(prev->e_index_hash_p == index_hash_p);
835 hlist_bl_lock(index_hash_p);
836 mb_assert(!hlist_bl_empty(index_hash_p));
837 l = prev->e_index.o_list.next;
838 ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
839 __mb_cache_entry_release(prev);
840 return ce;
841} 411}
412EXPORT_SYMBOL(mb_cache_destroy);
842 413
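The removed comment above documented the old find_first()/find_next() iteration by (bdev, key); the new prototypes at the bottom of this patch keep the same shape but key only on the 32-bit hash. Assuming the new mb_cache_entry_find_next() keeps the old convention of releasing the entry it was handed, a caller walks duplicates like this (sketch; fs_mb_cache and hash are placeholders):

/* Walk every cached block whose xattr content hashes to 'hash'. */
struct mb_cache_entry *entry;

entry = mb_cache_entry_find_first(fs_mb_cache, hash);
while (entry) {
	/* ... read entry->e_block and compare the candidate block ... */
	/* (on an early break the caller would still owe a put)        */
	entry = mb_cache_entry_find_next(fs_mb_cache, entry);
}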
843#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ 414static int __init mbcache_init(void)
844
845static int __init init_mbcache(void)
846{ 415{
847 register_shrinker(&mb_cache_shrinker); 416 mb_entry_cache = kmem_cache_create("mbcache",
417 sizeof(struct mb_cache_entry), 0,
418 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
419 BUG_ON(!mb_entry_cache);
848 return 0; 420 return 0;
849} 421}
850 422
851static void __exit exit_mbcache(void) 423static void __exit mbcache_exit(void)
852{ 424{
853 unregister_shrinker(&mb_cache_shrinker); 425 kmem_cache_destroy(mb_entry_cache);
854} 426}
855 427
856module_init(init_mbcache) 428module_init(mbcache_init)
857module_exit(exit_mbcache) 429module_exit(mbcache_exit)
858 430
431MODULE_AUTHOR("Jan Kara <jack@suse.cz>");
432MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
433MODULE_LICENSE("GPL");
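One more consumer-facing change worth noting before the header diffs: the new API exposes mb_cache_entry_delete_block(), declared in the new include/linux/mbcache.h below, so a filesystem can drop a stale mapping when it frees or rewrites a block it had cached, instead of the old mb_cache_entry_free() handle dance. A sketch of the expected call site (placeholder names):

/* The xattr block at bh->b_blocknr is going away or being modified;
 * make sure later lookups cannot hand it out again. */
mb_cache_entry_delete_block(fs_mb_cache, hash, bh->b_blocknr);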
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 65407f6c9120..fd1083c46c61 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -200,7 +200,7 @@ typedef struct journal_block_tag_s
200 __be32 t_blocknr_high; /* most-significant high 32bits. */ 200 __be32 t_blocknr_high; /* most-significant high 32bits. */
201} journal_block_tag_t; 201} journal_block_tag_t;
202 202
203/* Tail of descriptor block, for checksumming */ 203/* Tail of descriptor or revoke block, for checksumming */
204struct jbd2_journal_block_tail { 204struct jbd2_journal_block_tail {
205 __be32 t_checksum; /* crc32c(uuid+descr_block) */ 205 __be32 t_checksum; /* crc32c(uuid+descr_block) */
206}; 206};
@@ -215,11 +215,6 @@ typedef struct jbd2_journal_revoke_header_s
215 __be32 r_count; /* Count of bytes used in the block */ 215 __be32 r_count; /* Count of bytes used in the block */
216} jbd2_journal_revoke_header_t; 216} jbd2_journal_revoke_header_t;
217 217
218/* Tail of revoke block, for checksumming */
219struct jbd2_journal_revoke_tail {
220 __be32 r_checksum; /* crc32c(uuid+revoke_block) */
221};
222
223/* Definitions for the journal tag flags word: */ 218/* Definitions for the journal tag flags word: */
224#define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ 219#define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */
225#define JBD2_FLAG_SAME_UUID 2 /* block has same uuid as previous */ 220#define JBD2_FLAG_SAME_UUID 2 /* block has same uuid as previous */
@@ -1137,7 +1132,8 @@ static inline void jbd2_unfile_log_bh(struct buffer_head *bh)
1137} 1132}
1138 1133
1139/* Log buffer allocation */ 1134/* Log buffer allocation */
1140struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal); 1135struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int);
1136void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *);
1141int jbd2_journal_next_log_block(journal_t *, unsigned long long *); 1137int jbd2_journal_next_log_block(journal_t *, unsigned long long *);
1142int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, 1138int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
1143 unsigned long *block); 1139 unsigned long *block);
@@ -1327,10 +1323,8 @@ extern int jbd2_journal_init_revoke_caches(void);
1327extern void jbd2_journal_destroy_revoke(journal_t *); 1323extern void jbd2_journal_destroy_revoke(journal_t *);
1328extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); 1324extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *);
1329extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); 1325extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *);
1330extern void jbd2_journal_write_revoke_records(journal_t *journal, 1326extern void jbd2_journal_write_revoke_records(transaction_t *transaction,
1331 transaction_t *transaction, 1327 struct list_head *log_bufs);
1332 struct list_head *log_bufs,
1333 int write_op);
1334 1328
1335/* Recovery revoke support */ 1329/* Recovery revoke support */
1336extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); 1330extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t);
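The jbd2.h hunk above folds the separate struct jbd2_journal_revoke_tail into the generic descriptor tail and adds jbd2_descriptor_block_csum_set(), so descriptor and revoke blocks share one checksumming path. The body lives in fs/jbd2/journal.c, outside this diff; a sketch consistent with the declaration and the existing jbd2 checksum helpers would be:

void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
{
	struct jbd2_journal_block_tail *tail;
	__u32 csum;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
			sizeof(struct jbd2_journal_block_tail));
	tail->t_checksum = 0;	/* checksum the block with the field zeroed */
	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
	tail->t_checksum = cpu_to_be32(csum);
}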
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 6a392e7a723a..86c9a8b480c5 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -1,55 +1,52 @@
1/* 1#ifndef _LINUX_MBCACHE_H
2 File: linux/mbcache.h 2#define _LINUX_MBCACHE_H
3 3
4 (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> 4#include <linux/hash.h>
5*/ 5#include <linux/list_bl.h>
6struct mb_cache_entry { 6#include <linux/list.h>
7 struct list_head e_lru_list; 7#include <linux/atomic.h>
8 struct mb_cache *e_cache; 8#include <linux/fs.h>
9 unsigned short e_used;
10 unsigned short e_queued;
11 atomic_t e_refcnt;
12 struct block_device *e_bdev;
13 sector_t e_block;
14 struct hlist_bl_node e_block_list;
15 struct {
16 struct hlist_bl_node o_list;
17 unsigned int o_key;
18 } e_index;
19 struct hlist_bl_head *e_block_hash_p;
20 struct hlist_bl_head *e_index_hash_p;
21};
22 9
23struct mb_cache { 10struct mb_cache;
24 struct list_head c_cache_list;
25 const char *c_name;
26 atomic_t c_entry_count;
27 int c_max_entries;
28 int c_bucket_bits;
29 struct kmem_cache *c_entry_cache;
30 struct hlist_bl_head *c_block_hash;
31 struct hlist_bl_head *c_index_hash;
32};
33 11
34/* Functions on caches */ 12struct mb_cache_entry {
13 /* List of entries in cache - protected by cache->c_list_lock */
14 struct list_head e_list;
15 /* Hash table list - protected by hash chain bitlock */
16 struct hlist_bl_node e_hash_list;
17 atomic_t e_refcnt;
18 /* Key in hash - stable during lifetime of the entry */
19 u32 e_key;
20 u32 e_referenced:1;
21 u32 e_reusable:1;
22 /* Block number of hashed block - stable during lifetime of the entry */
23 sector_t e_block;
24};
35 25
36struct mb_cache *mb_cache_create(const char *, int); 26struct mb_cache *mb_cache_create(int bucket_bits);
37void mb_cache_shrink(struct block_device *); 27void mb_cache_destroy(struct mb_cache *cache);
38void mb_cache_destroy(struct mb_cache *);
39 28
40/* Functions on cache entries */ 29int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
30 sector_t block, bool reusable);
31void __mb_cache_entry_free(struct mb_cache_entry *entry);
32static inline int mb_cache_entry_put(struct mb_cache *cache,
33 struct mb_cache_entry *entry)
34{
35 if (!atomic_dec_and_test(&entry->e_refcnt))
36 return 0;
37 __mb_cache_entry_free(entry);
38 return 1;
39}
41 40
42struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *, gfp_t); 41void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key,
43int mb_cache_entry_insert(struct mb_cache_entry *, struct block_device *, 42 sector_t block);
44 sector_t, unsigned int); 43struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
45void mb_cache_entry_release(struct mb_cache_entry *); 44 sector_t block);
46void mb_cache_entry_free(struct mb_cache_entry *);
47struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *,
48 struct block_device *,
49 sector_t);
50struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, 45struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
51 struct block_device *, 46 u32 key);
52 unsigned int); 47struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
53struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, 48 struct mb_cache_entry *entry);
54 struct block_device *, 49void mb_cache_entry_touch(struct mb_cache *cache,
55 unsigned int); 50 struct mb_cache_entry *entry);
51
52#endif /* _LINUX_MBCACHE_H */
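Taken together, the new header reduces a consumer to create / get / touch / put plus the two find helpers. A final hedged sketch of the reuse path, built only from the prototypes above (look a block up by key and block number, mark it referenced on a hit, and always drop the reference the lookup took; fs_mb_cache, hash and bh are placeholders):

struct mb_cache_entry *entry;

entry = mb_cache_entry_get(fs_mb_cache, hash, bh->b_blocknr);
if (entry) {
	mb_cache_entry_touch(fs_mb_cache, entry);	/* keep it warm */
	mb_cache_entry_put(fs_mb_cache, entry);		/* drop lookup ref */
}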