author     Linus Torvalds <torvalds@linux-foundation.org>   2016-03-17 19:31:18 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-03-17 19:31:18 -0400
commit     faeb20ecfa398b043c3224607f512c009c51653d
tree       ffd185ffb5e499a76f261c700de72241e6781ecf
parent     364e8dd9d636fea7def862919aac092b19b7c581
parent     0304688676bdfc8159e165313d71da19c118ba27
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:
"Performance improvements in SEEK_DATA and xattr scalability
improvements, plus a lot of clean ups and bug fixes"
* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (38 commits)
ext4: clean up error handling in the MMP support
jbd2: do not fail journal because of frozen_buffer allocation failure
ext4: use __GFP_NOFAIL in ext4_free_blocks()
ext4: fix compile error while opening the macro DOUBLE_CHECK
ext4: print ext4 mount option data_err=abort correctly
ext4: fix NULL pointer dereference in ext4_mark_inode_dirty()
ext4: drop unneeded BUFFER_TRACE in ext4_delete_inline_entry()
ext4: fix misspellings in comments.
jbd2: fix FS corruption possibility in jbd2_journal_destroy() on umount path
ext4: more efficient SEEK_DATA implementation
ext4: cleanup handling of bh->b_state in DAX mmap
ext4: return hole from ext4_map_blocks()
ext4: factor out determining of hole size
ext4: fix setting of referenced bit in ext4_es_lookup_extent()
ext4: remove i_ioend_count
ext4: simplify io_end handling for AIO DIO
ext4: move trans handling and completion deferal out of _ext4_get_block
ext4: rename and split get blocks functions
ext4: use i_mutex to serialize unaligned AIO DIO
ext4: pack ioend structure better
...
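Several commits in this pull ("more efficient SEEK_DATA implementation" and the ext4_seek_data()/ext4_seek_hole() rework in fs/ext4/file.c further down) change how ext4 answers lseek(2) SEEK_DATA/SEEK_HOLE queries. As a reminder of the user-visible interface those code paths serve, here is a small userspace sketch that walks the data extents of a sparse file; it is illustrative only and is not part of the series.

/* Walk a file's data/hole layout with lseek(2) SEEK_DATA / SEEK_HOLE. */
#define _GNU_SOURCE		/* SEEK_DATA / SEEK_HOLE from <unistd.h> on glibc */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	off_t data, hole = 0;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Each iteration finds the next data extent and the hole after it. */
	while ((data = lseek(fd, hole, SEEK_DATA)) != (off_t)-1) {
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole == (off_t)-1)
			break;
		printf("data: [%lld, %lld)\n", (long long)data, (long long)hole);
	}
	close(fd);
	return 0;
}

The loop terminates when SEEK_DATA past the last extent fails with ENXIO; end-of-file always counts as a hole, so SEEK_HOLE after the final data extent succeeds.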
-rw-r--r--   fs/ext2/ext2.h            |    3
-rw-r--r--   fs/ext2/super.c           |   25
-rw-r--r--   fs/ext2/xattr.c           |  139
-rw-r--r--   fs/ext2/xattr.h           |   21
-rw-r--r--   fs/ext4/ext4.h            |   45
-rw-r--r--   fs/ext4/ext4_extents.h    |    2
-rw-r--r--   fs/ext4/extents.c         |  128
-rw-r--r--   fs/ext4/extents_status.c  |    4
-rw-r--r--   fs/ext4/file.c            |  129
-rw-r--r--   fs/ext4/ialloc.c          |    2
-rw-r--r--   fs/ext4/indirect.c        |   29
-rw-r--r--   fs/ext4/inline.c          |    8
-rw-r--r--   fs/ext4/inode.c           |  388
-rw-r--r--   fs/ext4/mballoc.c         |   81
-rw-r--r--   fs/ext4/mballoc.h         |   12
-rw-r--r--   fs/ext4/migrate.c         |    2
-rw-r--r--   fs/ext4/mmp.c             |   34
-rw-r--r--   fs/ext4/page-io.c         |    4
-rw-r--r--   fs/ext4/super.c           |   35
-rw-r--r--   fs/ext4/xattr.c           |  166
-rw-r--r--   fs/ext4/xattr.h           |    3
-rw-r--r--   fs/jbd2/commit.c          |   49
-rw-r--r--   fs/jbd2/journal.c         |   43
-rw-r--r--   fs/jbd2/recovery.c        |   31
-rw-r--r--   fs/jbd2/revoke.c          |   60
-rw-r--r--   fs/jbd2/transaction.c     |   22
-rw-r--r--   fs/mbcache.c              | 1093
-rw-r--r--   include/linux/jbd2.h      |   16
-rw-r--r--   include/linux/mbcache.h   |   93
29 files changed, 1149 insertions, 1518 deletions
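Most of the ext2 hunks below (and the corresponding ext4 xattr changes) convert the xattr block cache from a single global mbcache to one cache per superblock, using the reworked mbcache API from this series. The following is only a minimal sketch of that per-mount lifecycle, assembled from the calls visible in this diff (mb_cache_create(), mb_cache_entry_create(), mb_cache_entry_find_first()/_find_next(), mb_cache_entry_touch()/_put(), mb_cache_entry_delete_block(), mb_cache_destroy()); the demo_* names and the surrounding glue are hypothetical and not part of the patch.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mbcache.h>

/* Hypothetical per-superblock info; real filesystems keep this in their sb_info. */
struct demo_sb_info {
	struct mb_cache *s_mb_cache;
};

/* At mount time: one cache per superblock instead of one global cache. */
static int demo_create_cache(struct demo_sb_info *sbi)
{
	sbi->s_mb_cache = mb_cache_create(10);	/* 2^10 hash buckets */
	return sbi->s_mb_cache ? 0 : -ENOMEM;
}

/* Remember a block under its content hash; -EBUSY means it is already cached. */
static int demo_cache_block(struct demo_sb_info *sbi, u32 hash, sector_t block)
{
	int err = mb_cache_entry_create(sbi->s_mb_cache, GFP_NOFS, hash, block, 1);

	return (err == -EBUSY) ? 0 : err;
}

/* Look up a candidate block by hash; 0 means "not found" in this sketch.
 * A real caller re-reads and revalidates the block, and can walk further
 * candidates with mb_cache_entry_find_next(). */
static sector_t demo_find_block(struct demo_sb_info *sbi, u32 hash)
{
	struct mb_cache_entry *ce;
	sector_t block;

	ce = mb_cache_entry_find_first(sbi->s_mb_cache, hash);
	if (!ce)
		return 0;
	block = ce->e_block;
	mb_cache_entry_touch(sbi->s_mb_cache, ce);	/* keep the entry hot in the LRU */
	mb_cache_entry_put(sbi->s_mb_cache, ce);	/* drop the lookup reference */
	return block;
}

/* When the on-disk block is freed or rehashed (done under its buffer lock
 * in the ext2/ext4 code below, so lookups can detect the change reliably). */
static void demo_forget_block(struct demo_sb_info *sbi, u32 hash, sector_t block)
{
	mb_cache_entry_delete_block(sbi->s_mb_cache, hash, block);
}

/* At unmount time. */
static void demo_destroy_cache(struct demo_sb_info *sbi)
{
	if (sbi->s_mb_cache)
		mb_cache_destroy(sbi->s_mb_cache);
	sbi->s_mb_cache = NULL;
}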
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 4c69c94cafd8..170939f379d7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -61,6 +61,8 @@ struct ext2_block_alloc_info {
 #define rsv_start rsv_window._rsv_start
 #define rsv_end rsv_window._rsv_end
 
+struct mb_cache;
+
 /*
  * second extended-fs super-block data in memory
  */
@@ -111,6 +113,7 @@ struct ext2_sb_info {
	 * of the mount options.
	 */
	spinlock_t s_lock;
+	struct mb_cache *s_mb_cache;
 };
 
 static inline spinlock_t *
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 2a188413a2b0..b78caf25f746 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -131,7 +131,10 @@ static void ext2_put_super (struct super_block * sb)
 
 	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
-	ext2_xattr_put_super(sb);
+	if (sbi->s_mb_cache) {
+		ext2_xattr_destroy_cache(sbi->s_mb_cache);
+		sbi->s_mb_cache = NULL;
+	}
 	if (!(sb->s_flags & MS_RDONLY)) {
 		struct ext2_super_block *es = sbi->s_es;
 
@@ -1104,6 +1107,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		ext2_msg(sb, KERN_ERR, "error: insufficient memory");
 		goto failed_mount3;
 	}
+
+#ifdef CONFIG_EXT2_FS_XATTR
+	sbi->s_mb_cache = ext2_xattr_create_cache();
+	if (!sbi->s_mb_cache) {
+		ext2_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+		goto failed_mount3;
+	}
+#endif
 	/*
 	 * set up enough so that it can read an inode
 	 */
@@ -1149,6 +1160,8 @@ cantfind_ext2:
 			sb->s_id);
 	goto failed_mount;
 failed_mount3:
+	if (sbi->s_mb_cache)
+		ext2_xattr_destroy_cache(sbi->s_mb_cache);
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -1555,20 +1568,17 @@ MODULE_ALIAS_FS("ext2");
 
 static int __init init_ext2_fs(void)
 {
-	int err = init_ext2_xattr();
-	if (err)
-		return err;
+	int err;
+
 	err = init_inodecache();
 	if (err)
-		goto out1;
+		return err;
 	err = register_filesystem(&ext2_fs_type);
 	if (err)
 		goto out;
 	return 0;
 out:
 	destroy_inodecache();
-out1:
-	exit_ext2_xattr();
 	return err;
 }
 
@@ -1576,7 +1586,6 @@ static void __exit exit_ext2_fs(void)
 {
 	unregister_filesystem(&ext2_fs_type);
 	destroy_inodecache();
-	exit_ext2_xattr();
 }
 
 MODULE_AUTHOR("Remy Card and others");
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index f57a7aba32eb..1a5e3bff0b63 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -90,14 +90,12 @@
 static int ext2_xattr_set2(struct inode *, struct buffer_head *,
			   struct ext2_xattr_header *);
 
-static int ext2_xattr_cache_insert(struct buffer_head *);
+static int ext2_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
 static struct buffer_head *ext2_xattr_cache_find(struct inode *,
						 struct ext2_xattr_header *);
 static void ext2_xattr_rehash(struct ext2_xattr_header *,
			      struct ext2_xattr_entry *);
 
-static struct mb_cache *ext2_xattr_cache;
-
 static const struct xattr_handler *ext2_xattr_handler_map[] = {
	[EXT2_XATTR_INDEX_USER]		   = &ext2_xattr_user_handler,
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -152,6 +150,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name,
	size_t name_len, size;
	char *end;
	int error;
+	struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
 
	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
		  name_index, name, buffer, (long)buffer_size);
@@ -196,7 +195,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
			goto found;
		entry = next;
	}
-	if (ext2_xattr_cache_insert(bh))
+	if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
		ea_idebug(inode, "cache insert failed");
	error = -ENODATA;
	goto cleanup;
@@ -209,7 +208,7 @@ found:
	    le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
		goto bad_block;
 
-	if (ext2_xattr_cache_insert(bh))
+	if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
		ea_idebug(inode, "cache insert failed");
	if (buffer) {
		error = -ERANGE;
@@ -247,6 +246,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
	char *end;
	size_t rest = buffer_size;
	int error;
+	struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
 
	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
		  buffer, (long)buffer_size);
@@ -281,7 +281,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
			goto bad_block;
		entry = next;
	}
-	if (ext2_xattr_cache_insert(bh))
+	if (ext2_xattr_cache_insert(ext2_mb_cache, bh))
		ea_idebug(inode, "cache insert failed");
 
	/* list the attribute names */
@@ -483,22 +483,23 @@ bad_block: ext2_error(sb, "ext2_xattr_set",
	/* Here we know that we can set the new attribute. */
 
	if (header) {
-		struct mb_cache_entry *ce;
-
		/* assert(header == HDR(bh)); */
-		ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev,
-					bh->b_blocknr);
		lock_buffer(bh);
		if (header->h_refcount == cpu_to_le32(1)) {
+			__u32 hash = le32_to_cpu(header->h_hash);
+
			ea_bdebug(bh, "modifying in-place");
-			if (ce)
-				mb_cache_entry_free(ce);
+			/*
+			 * This must happen under buffer lock for
+			 * ext2_xattr_set2() to reliably detect modified block
+			 */
+			mb_cache_entry_delete_block(EXT2_SB(sb)->s_mb_cache,
+						    hash, bh->b_blocknr);
+
			/* keep the buffer locked while modifying it. */
		} else {
			int offset;
 
-			if (ce)
-				mb_cache_entry_release(ce);
			unlock_buffer(bh);
			ea_bdebug(bh, "cloning");
			header = kmalloc(bh->b_size, GFP_KERNEL);
@@ -626,6 +627,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
	struct super_block *sb = inode->i_sb;
	struct buffer_head *new_bh = NULL;
	int error;
+	struct mb_cache *ext2_mb_cache = EXT2_SB(sb)->s_mb_cache;
 
	if (header) {
		new_bh = ext2_xattr_cache_find(inode, header);
@@ -653,7 +655,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
			   don't need to change the reference count. */
			new_bh = old_bh;
			get_bh(new_bh);
-			ext2_xattr_cache_insert(new_bh);
+			ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
		} else {
			/* We need to allocate a new block */
			ext2_fsblk_t goal = ext2_group_first_block_no(sb,
@@ -674,7 +676,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
		memcpy(new_bh->b_data, header, new_bh->b_size);
		set_buffer_uptodate(new_bh);
		unlock_buffer(new_bh);
-		ext2_xattr_cache_insert(new_bh);
+		ext2_xattr_cache_insert(ext2_mb_cache, new_bh);
 
		ext2_xattr_update_super_block(sb);
	}
@@ -707,19 +709,21 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 
	error = 0;
	if (old_bh && old_bh != new_bh) {
-		struct mb_cache_entry *ce;
-
		/*
		 * If there was an old block and we are no longer using it,
		 * release the old block.
		 */
-		ce = mb_cache_entry_get(ext2_xattr_cache, old_bh->b_bdev,
-					old_bh->b_blocknr);
		lock_buffer(old_bh);
		if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
+			__u32 hash = le32_to_cpu(HDR(old_bh)->h_hash);
+
+			/*
+			 * This must happen under buffer lock for
+			 * ext2_xattr_set2() to reliably detect freed block
+			 */
+			mb_cache_entry_delete_block(ext2_mb_cache,
+						    hash, old_bh->b_blocknr);
			/* Free the old block. */
-			if (ce)
-				mb_cache_entry_free(ce);
			ea_bdebug(old_bh, "freeing");
			ext2_free_blocks(inode, old_bh->b_blocknr, 1);
			mark_inode_dirty(inode);
@@ -730,8 +734,6 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
		} else {
			/* Decrement the refcount only. */
			le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
-			if (ce)
-				mb_cache_entry_release(ce);
			dquot_free_block_nodirty(inode, 1);
			mark_inode_dirty(inode);
			mark_buffer_dirty(old_bh);
@@ -757,7 +759,6 @@ void
 ext2_xattr_delete_inode(struct inode *inode)
 {
	struct buffer_head *bh = NULL;
-	struct mb_cache_entry *ce;
 
	down_write(&EXT2_I(inode)->xattr_sem);
	if (!EXT2_I(inode)->i_file_acl)
@@ -777,19 +778,22 @@ ext2_xattr_delete_inode(struct inode *inode)
			   EXT2_I(inode)->i_file_acl);
		goto cleanup;
	}
-	ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_bdev, bh->b_blocknr);
	lock_buffer(bh);
	if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
-		if (ce)
-			mb_cache_entry_free(ce);
+		__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
+
+		/*
+		 * This must happen under buffer lock for ext2_xattr_set2() to
+		 * reliably detect freed block
+		 */
+		mb_cache_entry_delete_block(EXT2_SB(inode->i_sb)->s_mb_cache,
+					    hash, bh->b_blocknr);
		ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
		get_bh(bh);
		bforget(bh);
		unlock_buffer(bh);
	} else {
		le32_add_cpu(&HDR(bh)->h_refcount, -1);
-		if (ce)
-			mb_cache_entry_release(ce);
		ea_bdebug(bh, "refcount now=%d",
			  le32_to_cpu(HDR(bh)->h_refcount));
		unlock_buffer(bh);
@@ -806,18 +810,6 @@ cleanup:
 }
 
 /*
- * ext2_xattr_put_super()
- *
- * This is called when a file system is unmounted.
- */
-void
-ext2_xattr_put_super(struct super_block *sb)
-{
-	mb_cache_shrink(sb->s_bdev);
-}
-
-
-/*
  * ext2_xattr_cache_insert()
  *
  * Create a new entry in the extended attribute cache, and insert
@@ -826,28 +818,20 @@ ext2_xattr_put_super(struct super_block *sb)
  * Returns 0, or a negative error number on failure.
  */
 static int
-ext2_xattr_cache_insert(struct buffer_head *bh)
+ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
 {
	__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
-	struct mb_cache_entry *ce;
	int error;
 
-	ce = mb_cache_entry_alloc(ext2_xattr_cache, GFP_NOFS);
-	if (!ce)
-		return -ENOMEM;
-	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
+	error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr, 1);
	if (error) {
-		mb_cache_entry_free(ce);
		if (error == -EBUSY) {
			ea_bdebug(bh, "already in cache (%d cache entries)",
				atomic_read(&ext2_xattr_cache->c_entry_count));
			error = 0;
		}
-	} else {
-		ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
-			atomic_read(&ext2_xattr_cache->c_entry_count));
-		mb_cache_entry_release(ce);
-	}
+	} else
+		ea_bdebug(bh, "inserting [%x]", (int)hash);
	return error;
 }
 
@@ -904,22 +888,16 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
 {
	__u32 hash = le32_to_cpu(header->h_hash);
	struct mb_cache_entry *ce;
+	struct mb_cache *ext2_mb_cache = EXT2_SB(inode->i_sb)->s_mb_cache;
 
	if (!header->h_hash)
		return NULL;  /* never share */
	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
 again:
-	ce = mb_cache_entry_find_first(ext2_xattr_cache, inode->i_sb->s_bdev,
-				       hash);
+	ce = mb_cache_entry_find_first(ext2_mb_cache, hash);
	while (ce) {
		struct buffer_head *bh;
 
-		if (IS_ERR(ce)) {
-			if (PTR_ERR(ce) == -EAGAIN)
-				goto again;
-			break;
-		}
-
		bh = sb_bread(inode->i_sb, ce->e_block);
		if (!bh) {
			ext2_error(inode->i_sb, "ext2_xattr_cache_find",
@@ -927,7 +905,21 @@ again:
				inode->i_ino, (unsigned long) ce->e_block);
		} else {
			lock_buffer(bh);
-			if (le32_to_cpu(HDR(bh)->h_refcount) >
+			/*
+			 * We have to be careful about races with freeing or
+			 * rehashing of xattr block. Once we hold buffer lock
+			 * xattr block's state is stable so we can check
+			 * whether the block got freed / rehashed or not.
+			 * Since we unhash mbcache entry under buffer lock when
+			 * freeing / rehashing xattr block, checking whether
+			 * entry is still hashed is reliable.
+			 */
+			if (hlist_bl_unhashed(&ce->e_hash_list)) {
+				mb_cache_entry_put(ext2_mb_cache, ce);
+				unlock_buffer(bh);
+				brelse(bh);
+				goto again;
+			} else if (le32_to_cpu(HDR(bh)->h_refcount) >
				   EXT2_XATTR_REFCOUNT_MAX) {
				ea_idebug(inode, "block %ld refcount %d>%d",
					  (unsigned long) ce->e_block,
@@ -936,13 +928,14 @@ again:
			} else if (!ext2_xattr_cmp(header, HDR(bh))) {
				ea_bdebug(bh, "b_count=%d",
					  atomic_read(&(bh->b_count)));
-				mb_cache_entry_release(ce);
+				mb_cache_entry_touch(ext2_mb_cache, ce);
+				mb_cache_entry_put(ext2_mb_cache, ce);
				return bh;
			}
			unlock_buffer(bh);
			brelse(bh);
		}
-		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
+		ce = mb_cache_entry_find_next(ext2_mb_cache, ce);
	}
	return NULL;
 }
@@ -1015,17 +1008,15 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *header,
 
 #undef BLOCK_HASH_SHIFT
 
-int __init
-init_ext2_xattr(void)
+#define HASH_BUCKET_BITS 10
+
+struct mb_cache *ext2_xattr_create_cache(void)
 {
-	ext2_xattr_cache = mb_cache_create("ext2_xattr", 6);
-	if (!ext2_xattr_cache)
-		return -ENOMEM;
-	return 0;
+	return mb_cache_create(HASH_BUCKET_BITS);
 }
 
-void
-exit_ext2_xattr(void)
+void ext2_xattr_destroy_cache(struct mb_cache *cache)
 {
-	mb_cache_destroy(ext2_xattr_cache);
+	if (cache)
+		mb_cache_destroy(cache);
 }
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 60edf298644e..6f82ab1b00ca 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -53,6 +53,8 @@ struct ext2_xattr_entry {
 #define EXT2_XATTR_SIZE(size) \
	(((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)
 
+struct mb_cache;
+
 # ifdef CONFIG_EXT2_FS_XATTR
 
 extern const struct xattr_handler ext2_xattr_user_handler;
@@ -65,10 +67,9 @@ extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
 extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
 
 extern void ext2_xattr_delete_inode(struct inode *);
-extern void ext2_xattr_put_super(struct super_block *);
 
-extern int init_ext2_xattr(void);
-extern void exit_ext2_xattr(void);
+extern struct mb_cache *ext2_xattr_create_cache(void);
+extern void ext2_xattr_destroy_cache(struct mb_cache *cache);
 
 extern const struct xattr_handler *ext2_xattr_handlers[];
 
@@ -93,19 +94,7 @@ ext2_xattr_delete_inode(struct inode *inode)
 {
 }
 
-static inline void
-ext2_xattr_put_super(struct super_block *sb)
-{
-}
-
-static inline int
-init_ext2_xattr(void)
-{
-	return 0;
-}
-
-static inline void
-exit_ext2_xattr(void)
+static inline void ext2_xattr_destroy_cache(struct mb_cache *cache)
 {
 }
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 157b458a69d4..393689dfa1af 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -42,6 +42,18 @@
  */
 
 /*
+ * with AGGRESSIVE_CHECK allocator runs consistency checks over
+ * structures. these checks slow things down a lot
+ */
+#define AGGRESSIVE_CHECK__
+
+/*
+ * with DOUBLE_CHECK defined mballoc creates persistent in-core
+ * bitmaps, maintains and uses them to check for double allocations
+ */
+#define DOUBLE_CHECK__
+
+/*
  * Define EXT4FS_DEBUG to produce debug messages
  */
 #undef EXT4FS_DEBUG
@@ -182,9 +194,9 @@ typedef struct ext4_io_end {
	struct bio		*bio;		/* Linked list of completed
						 * bios covering the extent */
	unsigned int		flag;		/* unwritten or not */
+	atomic_t		count;		/* reference counter */
	loff_t			offset;		/* offset in the file */
	ssize_t			size;		/* size of the extent */
-	atomic_t		count;		/* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -1024,13 +1036,8 @@ struct ext4_inode_info {
	 * transaction reserved
	 */
	struct list_head i_rsv_conversion_list;
-	/*
-	 * Completed IOs that need unwritten extents handling and don't have
-	 * transaction reserved
-	 */
-	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
-	atomic_t i_unwritten; /* Nr. of inflight conversions pending */
	struct work_struct i_rsv_conversion_work;
+	atomic_t i_unwritten; /* Nr. of inflight conversions pending */
 
	spinlock_t i_block_reservation_lock;
 
@@ -1513,16 +1520,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
	}
 }
 
-static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
-{
-	return inode->i_private;
-}
-
-static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
-{
-	inode->i_private = io;
-}
-
 /*
  * Inode dynamic state flags
  */
@@ -2506,12 +2503,14 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 int ext4_inode_is_fast_symlink(struct inode *inode);
 struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
-int ext4_get_block_write(struct inode *inode, sector_t iblock,
+int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
			 struct buffer_head *bh_result, int create);
 int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
			    struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
		   struct buffer_head *bh_result, int create);
+int ext4_dio_get_block(struct inode *inode, sector_t iblock,
+		       struct buffer_head *bh_result, int create);
 int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh, int create);
 int ext4_walk_page_buffers(handle_t *handle,
@@ -2559,6 +2558,9 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
					int used, int quota_claim);
 extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
			      ext4_fsblk_t pblk, ext4_lblk_t len);
+extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
+				unsigned int map_len,
+				struct extent_status *result);
 
 /* indirect.c */
 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -3285,10 +3287,7 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
 #define EXT4_WQ_HASH_SZ		37
 #define ext4_ioend_wq(v)   (&ext4__ioend_wq[((unsigned long)(v)) %\
					    EXT4_WQ_HASH_SZ])
-#define ext4_aio_mutex(v)  (&ext4__aio_mutex[((unsigned long)(v)) %\
-					     EXT4_WQ_HASH_SZ])
 extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
 
 #define EXT4_RESIZING	0
 extern int ext4_resize_begin(struct super_block *sb);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 3c9381547094..8ecf84b8f5a1 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -11,7 +11,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public Licens
+ * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
  */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3753ceb0b0dd..95bf4679ac54 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -15,7 +15,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public Licens
+ * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
  */
@@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
	 */
	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
		return 0;
+	/*
+	 * The check for IO to unwritten extent is somewhat racy as we
+	 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
+	 * dropping i_data_sem. But reserved blocks should save us in that
+	 * case.
+	 */
	if (ext4_ext_is_unwritten(ex1) &&
	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
	     atomic_read(&EXT4_I(inode)->i_unwritten) ||
@@ -2293,59 +2299,69 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 }
 
 /*
- * ext4_ext_put_gap_in_cache:
- * calculate boundaries of the gap that the requested block fits into
- * and cache this gap
+ * ext4_ext_determine_hole - determine hole around given block
+ * @inode:	inode we lookup in
+ * @path:	path in extent tree to @lblk
+ * @lblk:	pointer to logical block around which we want to determine hole
+ *
+ * Determine hole length (and start if easily possible) around given logical
+ * block. We don't try too hard to find the beginning of the hole but @path
+ * actually points to extent before @lblk, we provide it.
+ *
+ * The function returns the length of a hole starting at @lblk. We update @lblk
+ * to the beginning of the hole if we managed to find it.
  */
-static void
-ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
-			  ext4_lblk_t block)
+static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
+					   struct ext4_ext_path *path,
+					   ext4_lblk_t *lblk)
 {
	int depth = ext_depth(inode);
-	ext4_lblk_t len;
-	ext4_lblk_t lblock;
	struct ext4_extent *ex;
-	struct extent_status es;
+	ext4_lblk_t len;
 
	ex = path[depth].p_ext;
	if (ex == NULL) {
		/* there is no extent yet, so gap is [0;-] */
-		lblock = 0;
+		*lblk = 0;
		len = EXT_MAX_BLOCKS;
-		ext_debug("cache gap(whole file):");
-	} else if (block < le32_to_cpu(ex->ee_block)) {
-		lblock = block;
-		len = le32_to_cpu(ex->ee_block) - block;
-		ext_debug("cache gap(before): %u [%u:%u]",
-				block,
-				le32_to_cpu(ex->ee_block),
-				ext4_ext_get_actual_len(ex));
-	} else if (block >= le32_to_cpu(ex->ee_block)
+	} else if (*lblk < le32_to_cpu(ex->ee_block)) {
+		len = le32_to_cpu(ex->ee_block) - *lblk;
+	} else if (*lblk >= le32_to_cpu(ex->ee_block)
			+ ext4_ext_get_actual_len(ex)) {
		ext4_lblk_t next;
-		lblock = le32_to_cpu(ex->ee_block)
-			+ ext4_ext_get_actual_len(ex);
 
+		*lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
		next = ext4_ext_next_allocated_block(path);
-		ext_debug("cache gap(after): [%u:%u] %u",
-				le32_to_cpu(ex->ee_block),
-				ext4_ext_get_actual_len(ex),
-				block);
-		BUG_ON(next == lblock);
-		len = next - lblock;
+		BUG_ON(next == *lblk);
+		len = next - *lblk;
	} else {
		BUG();
	}
+	return len;
+}
 
-	ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es);
+/*
+ * ext4_ext_put_gap_in_cache:
+ * calculate boundaries of the gap that the requested block fits into
+ * and cache this gap
+ */
+static void
+ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
+			  ext4_lblk_t hole_len)
+{
+	struct extent_status es;
+
+	ext4_es_find_delayed_extent_range(inode, hole_start,
+					  hole_start + hole_len - 1, &es);
	if (es.es_len) {
		/* There's delayed extent containing lblock? */
-		if (es.es_lblk <= lblock)
+		if (es.es_lblk <= hole_start)
			return;
-		len = min(es.es_lblk - lblock, len);
+		hole_len = min(es.es_lblk - hole_start, hole_len);
	}
-	ext_debug(" -> %u:%u\n", lblock, len);
-	ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE);
+	ext_debug(" -> %u:%u\n", hole_start, hole_len);
+	ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
+			      EXTENT_STATUS_HOLE);
 }
 
 /*
@@ -3927,7 +3943,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 static int
 convert_initialized_extent(handle_t *handle, struct inode *inode,
			   struct ext4_map_blocks *map,
-			   struct ext4_ext_path **ppath, int flags,
+			   struct ext4_ext_path **ppath,
			   unsigned int allocated)
 {
	struct ext4_ext_path *path = *ppath;
@@ -4007,7 +4023,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
	struct ext4_ext_path *path = *ppath;
	int ret = 0;
	int err = 0;
-	ext4_io_end_t *io = ext4_inode_aio(inode);
 
	ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
		  "block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -4030,15 +4045,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
						 flags | EXT4_GET_BLOCKS_CONVERT);
		if (ret <= 0)
			goto out;
-		/*
-		 * Flag the inode(non aio case) or end_io struct (aio case)
-		 * that this IO needs to conversion to written when IO is
-		 * completed
-		 */
-		if (io)
-			ext4_set_io_unwritten_flag(inode, io);
-		else
-			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
		map->m_flags |= EXT4_MAP_UNWRITTEN;
		goto out;
	}
@@ -4283,9 +4289,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
	unsigned int allocated = 0, offset = 0;
	unsigned int allocated_clusters = 0;
	struct ext4_allocation_request ar;
-	ext4_io_end_t *io = ext4_inode_aio(inode);
	ext4_lblk_t cluster_offset;
-	int set_unwritten = 0;
	bool map_from_cluster = false;
 
	ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -4347,7 +4351,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
		    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
			allocated = convert_initialized_extent(
						handle, inode, map, &path,
-						flags, allocated);
+						allocated);
			goto out2;
		} else if (!ext4_ext_is_unwritten(ex))
			goto out;
@@ -4368,11 +4372,22 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
		 * we couldn't try to create block if create flag is zero
		 */
		if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+			ext4_lblk_t hole_start, hole_len;
+
+			hole_start = map->m_lblk;
+			hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
			/*
			 * put just found gap into cache to speed up
			 * subsequent requests
			 */
-			ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+			ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
+
+			/* Update hole_len to reflect hole size after map->m_lblk */
+			if (hole_start != map->m_lblk)
+				hole_len -= map->m_lblk - hole_start;
+			map->m_pblk = 0;
+			map->m_len = min_t(unsigned int, map->m_len, hole_len);
+
			goto out2;
		}
 
@@ -4482,15 +4497,6 @@ got_allocated_blocks:
	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
		ext4_ext_mark_unwritten(&newex);
		map->m_flags |= EXT4_MAP_UNWRITTEN;
-		/*
-		 * io_end structure was created for every IO write to an
-		 * unwritten extent. To avoid unnecessary conversion,
-		 * here we flag the IO that really needs the conversion.
-		 * For non asycn direct IO case, flag the inode state
-		 * that we need to perform conversion when IO is done.
-		 */
-		if (flags & EXT4_GET_BLOCKS_PRE_IO)
-			set_unwritten = 1;
	}
 
	err = 0;
@@ -4501,14 +4507,6 @@ got_allocated_blocks:
		err = ext4_ext_insert_extent(handle, inode, &path,
					     &newex, flags);
 
-	if (!err && set_unwritten) {
-		if (io)
-			ext4_set_io_unwritten_flag(inode, io);
-		else
-			ext4_set_inode_state(inode,
-					     EXT4_STATE_DIO_UNWRITTEN);
-	}
-
	if (err && free_on_err) {
		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ac748b3af1c1..e38b987ac7f5 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -823,8 +823,8 @@ out:
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
-		if (!ext4_es_is_referenced(es))
-			ext4_es_set_referenced(es);
+		if (!ext4_es_is_referenced(es1))
+			ext4_es_set_referenced(es1);
		stats->es_stats_cache_hits++;
	} else {
		stats->es_stats_cache_misses++;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4cd318f31cbe..6659e216385e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
@@ -93,31 +93,29 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
93 | { | 93 | { |
94 | struct file *file = iocb->ki_filp; | 94 | struct file *file = iocb->ki_filp; |
95 | struct inode *inode = file_inode(iocb->ki_filp); | 95 | struct inode *inode = file_inode(iocb->ki_filp); |
96 | struct mutex *aio_mutex = NULL; | ||
97 | struct blk_plug plug; | 96 | struct blk_plug plug; |
98 | int o_direct = iocb->ki_flags & IOCB_DIRECT; | 97 | int o_direct = iocb->ki_flags & IOCB_DIRECT; |
98 | int unaligned_aio = 0; | ||
99 | int overwrite = 0; | 99 | int overwrite = 0; |
100 | ssize_t ret; | 100 | ssize_t ret; |
101 | 101 | ||
102 | inode_lock(inode); | ||
103 | ret = generic_write_checks(iocb, from); | ||
104 | if (ret <= 0) | ||
105 | goto out; | ||
106 | |||
102 | /* | 107 | /* |
103 | * Unaligned direct AIO must be serialized; see comment above | 108 | * Unaligned direct AIO must be serialized among each other as zeroing |
104 | * In the case of O_APPEND, assume that we must always serialize | 109 | * of partial blocks of two competing unaligned AIOs can result in data |
110 | * corruption. | ||
105 | */ | 111 | */ |
106 | if (o_direct && | 112 | if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && |
107 | ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && | ||
108 | !is_sync_kiocb(iocb) && | 113 | !is_sync_kiocb(iocb) && |
109 | (iocb->ki_flags & IOCB_APPEND || | 114 | ext4_unaligned_aio(inode, from, iocb->ki_pos)) { |
110 | ext4_unaligned_aio(inode, from, iocb->ki_pos))) { | 115 | unaligned_aio = 1; |
111 | aio_mutex = ext4_aio_mutex(inode); | ||
112 | mutex_lock(aio_mutex); | ||
113 | ext4_unwritten_wait(inode); | 116 | ext4_unwritten_wait(inode); |
114 | } | 117 | } |
115 | 118 | ||
116 | inode_lock(inode); | ||
117 | ret = generic_write_checks(iocb, from); | ||
118 | if (ret <= 0) | ||
119 | goto out; | ||
120 | |||
121 | /* | 119 | /* |
122 | * If we have encountered a bitmap-format file, the size limit | 120 | * If we have encountered a bitmap-format file, the size limit |
123 | * is smaller than s_maxbytes, which is for extent-mapped files. | 121 | * is smaller than s_maxbytes, which is for extent-mapped files. |
@@ -139,7 +137,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
139 | blk_start_plug(&plug); | 137 | blk_start_plug(&plug); |
140 | 138 | ||
141 | /* check whether we do a DIO overwrite or not */ | 139 | /* check whether we do a DIO overwrite or not */ |
142 | if (ext4_should_dioread_nolock(inode) && !aio_mutex && | 140 | if (ext4_should_dioread_nolock(inode) && !unaligned_aio && |
143 | !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { | 141 | !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { |
144 | struct ext4_map_blocks map; | 142 | struct ext4_map_blocks map; |
145 | unsigned int blkbits = inode->i_blkbits; | 143 | unsigned int blkbits = inode->i_blkbits; |
@@ -181,14 +179,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
181 | if (o_direct) | 179 | if (o_direct) |
182 | blk_finish_plug(&plug); | 180 | blk_finish_plug(&plug); |
183 | 181 | ||
184 | if (aio_mutex) | ||
185 | mutex_unlock(aio_mutex); | ||
186 | return ret; | 182 | return ret; |
187 | 183 | ||
188 | out: | 184 | out: |
189 | inode_unlock(inode); | 185 | inode_unlock(inode); |
190 | if (aio_mutex) | ||
191 | mutex_unlock(aio_mutex); | ||
192 | return ret; | 186 | return ret; |
193 | } | 187 | } |
194 | 188 | ||
@@ -417,7 +411,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) | |||
417 | */ | 411 | */ |
418 | static int ext4_find_unwritten_pgoff(struct inode *inode, | 412 | static int ext4_find_unwritten_pgoff(struct inode *inode, |
419 | int whence, | 413 | int whence, |
420 | struct ext4_map_blocks *map, | 414 | ext4_lblk_t end_blk, |
421 | loff_t *offset) | 415 | loff_t *offset) |
422 | { | 416 | { |
423 | struct pagevec pvec; | 417 | struct pagevec pvec; |
@@ -432,7 +426,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, | |||
432 | blkbits = inode->i_sb->s_blocksize_bits; | 426 | blkbits = inode->i_sb->s_blocksize_bits; |
433 | startoff = *offset; | 427 | startoff = *offset; |
434 | lastoff = startoff; | 428 | lastoff = startoff; |
435 | endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits; | 429 | endoff = (loff_t)end_blk << blkbits; |
436 | 430 | ||
437 | index = startoff >> PAGE_CACHE_SHIFT; | 431 | index = startoff >> PAGE_CACHE_SHIFT; |
438 | end = endoff >> PAGE_CACHE_SHIFT; | 432 | end = endoff >> PAGE_CACHE_SHIFT; |
@@ -550,12 +544,11 @@ out: | |||
550 | static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) | 544 | static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) |
551 | { | 545 | { |
552 | struct inode *inode = file->f_mapping->host; | 546 | struct inode *inode = file->f_mapping->host; |
553 | struct ext4_map_blocks map; | ||
554 | struct extent_status es; | 547 | struct extent_status es; |
555 | ext4_lblk_t start, last, end; | 548 | ext4_lblk_t start, last, end; |
556 | loff_t dataoff, isize; | 549 | loff_t dataoff, isize; |
557 | int blkbits; | 550 | int blkbits; |
558 | int ret = 0; | 551 | int ret; |
559 | 552 | ||
560 | inode_lock(inode); | 553 | inode_lock(inode); |
561 | 554 | ||
@@ -572,41 +565,32 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) | |||
572 | dataoff = offset; | 565 | dataoff = offset; |
573 | 566 | ||
574 | do { | 567 | do { |
575 | map.m_lblk = last; | 568 | ret = ext4_get_next_extent(inode, last, end - last + 1, &es); |
576 | map.m_len = end - last + 1; | 569 | if (ret <= 0) { |
577 | ret = ext4_map_blocks(NULL, inode, &map, 0); | 570 | /* No extent found -> no data */ |
578 | if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { | 571 | if (ret == 0) |
579 | if (last != start) | 572 | ret = -ENXIO; |
580 | dataoff = (loff_t)last << blkbits; | 573 | inode_unlock(inode); |
581 | break; | 574 | return ret; |
582 | } | 575 | } |
583 | 576 | ||
584 | /* | 577 | last = es.es_lblk; |
585 | * If there is a delay extent at this offset, | 578 | if (last != start) |
586 | * it will be as a data. | 579 | dataoff = (loff_t)last << blkbits; |
587 | */ | 580 | if (!ext4_es_is_unwritten(&es)) |
588 | ext4_es_find_delayed_extent_range(inode, last, last, &es); | ||
589 | if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { | ||
590 | if (last != start) | ||
591 | dataoff = (loff_t)last << blkbits; | ||
592 | break; | 581 | break; |
593 | } | ||
594 | 582 | ||
595 | /* | 583 | /* |
596 | * If there is a unwritten extent at this offset, | 584 | * If there is a unwritten extent at this offset, |
597 | * it will be as a data or a hole according to page | 585 | * it will be as a data or a hole according to page |
598 | * cache that has data or not. | 586 | * cache that has data or not. |
599 | */ | 587 | */ |
600 | if (map.m_flags & EXT4_MAP_UNWRITTEN) { | 588 | if (ext4_find_unwritten_pgoff(inode, SEEK_DATA, |
601 | int unwritten; | 589 | es.es_lblk + es.es_len, &dataoff)) |
602 | unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA, | 590 | break; |
603 | &map, &dataoff); | 591 | last += es.es_len; |
604 | if (unwritten) | ||
605 | break; | ||
606 | } | ||
607 | |||
608 | last++; | ||
609 | dataoff = (loff_t)last << blkbits; | 592 | dataoff = (loff_t)last << blkbits; |
593 | cond_resched(); | ||
610 | } while (last <= end); | 594 | } while (last <= end); |
611 | 595 | ||
612 | inode_unlock(inode); | 596 | inode_unlock(inode); |
@@ -623,12 +607,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) | |||
623 | static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) | 607 | static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) |
624 | { | 608 | { |
625 | struct inode *inode = file->f_mapping->host; | 609 | struct inode *inode = file->f_mapping->host; |
626 | struct ext4_map_blocks map; | ||
627 | struct extent_status es; | 610 | struct extent_status es; |
628 | ext4_lblk_t start, last, end; | 611 | ext4_lblk_t start, last, end; |
629 | loff_t holeoff, isize; | 612 | loff_t holeoff, isize; |
630 | int blkbits; | 613 | int blkbits; |
631 | int ret = 0; | 614 | int ret; |
632 | 615 | ||
633 | inode_lock(inode); | 616 | inode_lock(inode); |
634 | 617 | ||
@@ -645,44 +628,30 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) | |||
645 | holeoff = offset; | 628 | holeoff = offset; |
646 | 629 | ||
647 | do { | 630 | do { |
648 | map.m_lblk = last; | 631 | ret = ext4_get_next_extent(inode, last, end - last + 1, &es); |
649 | map.m_len = end - last + 1; | 632 | if (ret < 0) { |
650 | ret = ext4_map_blocks(NULL, inode, &map, 0); | 633 | inode_unlock(inode); |
651 | if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { | 634 | return ret; |
652 | last += ret; | ||
653 | holeoff = (loff_t)last << blkbits; | ||
654 | continue; | ||
655 | } | 635 | } |
656 | 636 | /* Found a hole? */ | |
657 | /* | 637 | if (ret == 0 || es.es_lblk > last) { |
658 | * If there is a delay extent at this offset, | 638 | if (last != start) |
659 | * we will skip this extent. | 639 | holeoff = (loff_t)last << blkbits; |
660 | */ | 640 | break; |
661 | ext4_es_find_delayed_extent_range(inode, last, last, &es); | ||
662 | if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { | ||
663 | last = es.es_lblk + es.es_len; | ||
664 | holeoff = (loff_t)last << blkbits; | ||
665 | continue; | ||
666 | } | 641 | } |
667 | |||
668 | /* | 642 | /* |
669 | * If there is a unwritten extent at this offset, | 643 | * If there is a unwritten extent at this offset, |
670 | * it will be as a data or a hole according to page | 644 | * it will be as a data or a hole according to page |
671 | * cache that has data or not. | 645 | * cache that has data or not. |
672 | */ | 646 | */ |
673 | if (map.m_flags & EXT4_MAP_UNWRITTEN) { | 647 | if (ext4_es_is_unwritten(&es) && |
674 | int unwritten; | 648 | ext4_find_unwritten_pgoff(inode, SEEK_HOLE, |
675 | unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE, | 649 | last + es.es_len, &holeoff)) |
676 | &map, &holeoff); | 650 | break; |
677 | if (!unwritten) { | ||
678 | last += ret; | ||
679 | holeoff = (loff_t)last << blkbits; | ||
680 | continue; | ||
681 | } | ||
682 | } | ||
683 | 651 | ||
684 | /* find a hole */ | 652 | last += es.es_len; |
685 | break; | 653 | holeoff = (loff_t)last << blkbits; |
654 | cond_resched(); | ||
686 | } while (last <= end); | 655 | } while (last <= end); |
687 | 656 | ||
688 | inode_unlock(inode); | 657 | inode_unlock(inode); |
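The reworked ext4_seek_data()/ext4_seek_hole() loops above walk extents via ext4_get_next_extent() instead of calling ext4_map_blocks() and the delayed-extent lookup directly. Their user-visible behaviour is the lseek(2) SEEK_DATA/SEEK_HOLE semantics, which the following minimal user-space sketch exercises (it is not part of the patch, and the file path is an assumption):

/*
 * Minimal user-space sketch: probe the SEEK_DATA/SEEK_HOLE behaviour
 * that ext4_seek_data()/ext4_seek_hole() implement.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "testfile";	/* hypothetical path */
	int fd = open(path, O_RDONLY);
	off_t data, hole;

	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}
	/* First data at or after offset 0; ENXIO means "no data". */
	data = lseek(fd, 0, SEEK_DATA);
	if (data < 0)
		perror("SEEK_DATA");
	else
		printf("first data at %lld\n", (long long)data);
	/* First hole at or after offset 0 (at worst, i_size). */
	hole = lseek(fd, 0, SEEK_HOLE);
	if (hole < 0)
		perror("SEEK_HOLE");
	else
		printf("first hole at %lld\n", (long long)hole);
	close(fd);
	return 0;
}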
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index acc0ad56bf2f..237b877d316d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
@@ -787,7 +787,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, | |||
787 | sbi = EXT4_SB(sb); | 787 | sbi = EXT4_SB(sb); |
788 | 788 | ||
789 | /* | 789 | /* |
790 | * Initalize owners and quota early so that we don't have to account | 790 | * Initialize owners and quota early so that we don't have to account |
791 | * for quota initialization worst case in standard inode creating | 791 | * for quota initialization worst case in standard inode creating |
792 | * transaction | 792 | * transaction |
793 | */ | 793 | */ |
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 355ef9c36c87..3027fa681de5 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c | |||
@@ -555,8 +555,23 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | |||
555 | goto got_it; | 555 | goto got_it; |
556 | } | 556 | } |
557 | 557 | ||
558 | /* Next simple case - plain lookup or failed read of indirect block */ | 558 | /* Next simple case - plain lookup failed */ |
559 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) | 559 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { |
560 | unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); | ||
561 | int i; | ||
562 | |||
563 | /* Count number of blocks in a subtree under 'partial' */ | ||
564 | count = 1; | ||
565 | for (i = 0; partial + i != chain + depth - 1; i++) | ||
566 | count *= epb; | ||
567 | /* Fill in size of a hole we found */ | ||
568 | map->m_pblk = 0; | ||
569 | map->m_len = min_t(unsigned int, map->m_len, count); | ||
570 | goto cleanup; | ||
571 | } | ||
572 | |||
573 | /* Failed read of indirect block */ | ||
574 | if (err == -EIO) | ||
560 | goto cleanup; | 575 | goto cleanup; |
561 | 576 | ||
562 | /* | 577 | /* |
@@ -693,21 +708,21 @@ retry: | |||
693 | } | 708 | } |
694 | if (IS_DAX(inode)) | 709 | if (IS_DAX(inode)) |
695 | ret = dax_do_io(iocb, inode, iter, offset, | 710 | ret = dax_do_io(iocb, inode, iter, offset, |
696 | ext4_get_block, NULL, 0); | 711 | ext4_dio_get_block, NULL, 0); |
697 | else | 712 | else |
698 | ret = __blockdev_direct_IO(iocb, inode, | 713 | ret = __blockdev_direct_IO(iocb, inode, |
699 | inode->i_sb->s_bdev, iter, | 714 | inode->i_sb->s_bdev, iter, |
700 | offset, ext4_get_block, NULL, | 715 | offset, ext4_dio_get_block, |
701 | NULL, 0); | 716 | NULL, NULL, 0); |
702 | inode_dio_end(inode); | 717 | inode_dio_end(inode); |
703 | } else { | 718 | } else { |
704 | locked: | 719 | locked: |
705 | if (IS_DAX(inode)) | 720 | if (IS_DAX(inode)) |
706 | ret = dax_do_io(iocb, inode, iter, offset, | 721 | ret = dax_do_io(iocb, inode, iter, offset, |
707 | ext4_get_block, NULL, DIO_LOCKING); | 722 | ext4_dio_get_block, NULL, DIO_LOCKING); |
708 | else | 723 | else |
709 | ret = blockdev_direct_IO(iocb, inode, iter, offset, | 724 | ret = blockdev_direct_IO(iocb, inode, iter, offset, |
710 | ext4_get_block); | 725 | ext4_dio_get_block); |
711 | 726 | ||
712 | if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { | 727 | if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { |
713 | loff_t isize = i_size_read(inode); | 728 | loff_t isize = i_size_read(inode); |
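The new ext4_ind_map_blocks() hunk above reports the size of a hole by multiplying up "entries per block" (epb = blocksize / 4) once per missing level of indirection. A rough arithmetic sketch of that calculation, assuming a 4096-byte block size (an assumption, not part of the patch):

/*
 * Arithmetic sketch: each missing level of indirection covers
 * "entries per block" times more blocks than the level below it.
 */
#include <stdio.h>

int main(void)
{
	unsigned int blocksize = 4096;			/* assumed block size */
	unsigned int epb = blocksize / 4;		/* 4-byte block pointers */
	unsigned long long count = 1;
	int level;

	for (level = 0; level <= 3; level++) {
		printf("hole under a level-%d block: up to %llu blocks\n",
		       level, count);
		count *= epb;
	}
	return 0;
}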
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index dfe3b9bafc0d..7cbdd3752ba5 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c | |||
@@ -581,9 +581,10 @@ retry: | |||
581 | if (ret) | 581 | if (ret) |
582 | goto out; | 582 | goto out; |
583 | 583 | ||
584 | if (ext4_should_dioread_nolock(inode)) | 584 | if (ext4_should_dioread_nolock(inode)) { |
585 | ret = __block_write_begin(page, from, to, ext4_get_block_write); | 585 | ret = __block_write_begin(page, from, to, |
586 | else | 586 | ext4_get_block_unwritten); |
587 | } else | ||
587 | ret = __block_write_begin(page, from, to, ext4_get_block); | 588 | ret = __block_write_begin(page, from, to, ext4_get_block); |
588 | 589 | ||
589 | if (!ret && ext4_should_journal_data(inode)) { | 590 | if (!ret && ext4_should_journal_data(inode)) { |
@@ -1696,7 +1697,6 @@ int ext4_delete_inline_entry(handle_t *handle, | |||
1696 | if (err) | 1697 | if (err) |
1697 | goto out; | 1698 | goto out; |
1698 | 1699 | ||
1699 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
1700 | err = ext4_mark_inode_dirty(handle, dir); | 1700 | err = ext4_mark_inode_dirty(handle, dir); |
1701 | if (unlikely(err)) | 1701 | if (unlikely(err)) |
1702 | goto out; | 1702 | goto out; |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index aee960b1af34..b2e9576450eb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -216,7 +216,6 @@ void ext4_evict_inode(struct inode *inode) | |||
216 | } | 216 | } |
217 | truncate_inode_pages_final(&inode->i_data); | 217 | truncate_inode_pages_final(&inode->i_data); |
218 | 218 | ||
219 | WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); | ||
220 | goto no_delete; | 219 | goto no_delete; |
221 | } | 220 | } |
222 | 221 | ||
@@ -228,8 +227,6 @@ void ext4_evict_inode(struct inode *inode) | |||
228 | ext4_begin_ordered_truncate(inode, 0); | 227 | ext4_begin_ordered_truncate(inode, 0); |
229 | truncate_inode_pages_final(&inode->i_data); | 228 | truncate_inode_pages_final(&inode->i_data); |
230 | 229 | ||
231 | WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); | ||
232 | |||
233 | /* | 230 | /* |
234 | * Protect us against freezing - iput() caller didn't have to have any | 231 | * Protect us against freezing - iput() caller didn't have to have any |
235 | * protection against it | 232 | * protection against it |
@@ -458,13 +455,13 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, | |||
458 | * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping | 455 | * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping |
459 | * based files | 456 | * based files |
460 | * | 457 | * |
461 | * On success, it returns the number of blocks being mapped or allocated. | 458 | * On success, it returns the number of blocks being mapped or allocated. if |
462 | * if create==0 and the blocks are pre-allocated and unwritten block, | 459 | * create==0 and the blocks are pre-allocated and unwritten, the resulting @map |
463 | * the result buffer head is unmapped. If the create ==1, it will make sure | 460 | * is marked as unwritten. If the create == 1, it will mark @map as mapped. |
464 | * the buffer head is mapped. | ||
465 | * | 461 | * |
466 | * It returns 0 if plain look up failed (blocks have not been allocated), in | 462 | * It returns 0 if plain look up failed (blocks have not been allocated), in |
467 | * that case, buffer head is unmapped | 463 | * that case, @map is returned as unmapped but we still do fill map->m_len to |
464 | * indicate the length of a hole starting at map->m_lblk. | ||
468 | * | 465 | * |
469 | * It returns the error in case of allocation failure. | 466 | * It returns the error in case of allocation failure. |
470 | */ | 467 | */ |
@@ -507,6 +504,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, | |||
507 | retval = map->m_len; | 504 | retval = map->m_len; |
508 | map->m_len = retval; | 505 | map->m_len = retval; |
509 | } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { | 506 | } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { |
507 | map->m_pblk = 0; | ||
508 | retval = es.es_len - (map->m_lblk - es.es_lblk); | ||
509 | if (retval > map->m_len) | ||
510 | retval = map->m_len; | ||
511 | map->m_len = retval; | ||
510 | retval = 0; | 512 | retval = 0; |
511 | } else { | 513 | } else { |
512 | BUG_ON(1); | 514 | BUG_ON(1); |
@@ -714,16 +716,11 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) | |||
714 | cmpxchg(&bh->b_state, old_state, new_state) != old_state)); | 716 | cmpxchg(&bh->b_state, old_state, new_state) != old_state)); |
715 | } | 717 | } |
716 | 718 | ||
717 | /* Maximum number of blocks we map for direct IO at once. */ | ||
718 | #define DIO_MAX_BLOCKS 4096 | ||
719 | |||
720 | static int _ext4_get_block(struct inode *inode, sector_t iblock, | 719 | static int _ext4_get_block(struct inode *inode, sector_t iblock, |
721 | struct buffer_head *bh, int flags) | 720 | struct buffer_head *bh, int flags) |
722 | { | 721 | { |
723 | handle_t *handle = ext4_journal_current_handle(); | ||
724 | struct ext4_map_blocks map; | 722 | struct ext4_map_blocks map; |
725 | int ret = 0, started = 0; | 723 | int ret = 0; |
726 | int dio_credits; | ||
727 | 724 | ||
728 | if (ext4_has_inline_data(inode)) | 725 | if (ext4_has_inline_data(inode)) |
729 | return -ERANGE; | 726 | return -ERANGE; |
@@ -731,33 +728,14 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, | |||
731 | map.m_lblk = iblock; | 728 | map.m_lblk = iblock; |
732 | map.m_len = bh->b_size >> inode->i_blkbits; | 729 | map.m_len = bh->b_size >> inode->i_blkbits; |
733 | 730 | ||
734 | if (flags && !handle) { | 731 | ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, |
735 | /* Direct IO write... */ | 732 | flags); |
736 | if (map.m_len > DIO_MAX_BLOCKS) | ||
737 | map.m_len = DIO_MAX_BLOCKS; | ||
738 | dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); | ||
739 | handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, | ||
740 | dio_credits); | ||
741 | if (IS_ERR(handle)) { | ||
742 | ret = PTR_ERR(handle); | ||
743 | return ret; | ||
744 | } | ||
745 | started = 1; | ||
746 | } | ||
747 | |||
748 | ret = ext4_map_blocks(handle, inode, &map, flags); | ||
749 | if (ret > 0) { | 733 | if (ret > 0) { |
750 | ext4_io_end_t *io_end = ext4_inode_aio(inode); | ||
751 | |||
752 | map_bh(bh, inode->i_sb, map.m_pblk); | 734 | map_bh(bh, inode->i_sb, map.m_pblk); |
753 | ext4_update_bh_state(bh, map.m_flags); | 735 | ext4_update_bh_state(bh, map.m_flags); |
754 | if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) | ||
755 | set_buffer_defer_completion(bh); | ||
756 | bh->b_size = inode->i_sb->s_blocksize * map.m_len; | 736 | bh->b_size = inode->i_sb->s_blocksize * map.m_len; |
757 | ret = 0; | 737 | ret = 0; |
758 | } | 738 | } |
759 | if (started) | ||
760 | ext4_journal_stop(handle); | ||
761 | return ret; | 739 | return ret; |
762 | } | 740 | } |
763 | 741 | ||
@@ -769,6 +747,155 @@ int ext4_get_block(struct inode *inode, sector_t iblock, | |||
769 | } | 747 | } |
770 | 748 | ||
771 | /* | 749 | /* |
750 | * Get block function used when preparing for buffered write if we require | ||
751 | * creating an unwritten extent if blocks haven't been allocated. The extent | ||
752 | * will be converted to written after the IO is complete. | ||
753 | */ | ||
754 | int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, | ||
755 | struct buffer_head *bh_result, int create) | ||
756 | { | ||
757 | ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", | ||
758 | inode->i_ino, create); | ||
759 | return _ext4_get_block(inode, iblock, bh_result, | ||
760 | EXT4_GET_BLOCKS_IO_CREATE_EXT); | ||
761 | } | ||
762 | |||
763 | /* Maximum number of blocks we map for direct IO at once. */ | ||
764 | #define DIO_MAX_BLOCKS 4096 | ||
765 | |||
766 | static handle_t *start_dio_trans(struct inode *inode, | ||
767 | struct buffer_head *bh_result) | ||
768 | { | ||
769 | int dio_credits; | ||
770 | |||
771 | /* Trim mapping request to maximum we can map at once for DIO */ | ||
772 | if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS) | ||
773 | bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits; | ||
774 | dio_credits = ext4_chunk_trans_blocks(inode, | ||
775 | bh_result->b_size >> inode->i_blkbits); | ||
776 | return ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); | ||
777 | } | ||
778 | |||
779 | /* Get block function for DIO reads and writes to inodes without extents */ | ||
780 | int ext4_dio_get_block(struct inode *inode, sector_t iblock, | ||
781 | struct buffer_head *bh, int create) | ||
782 | { | ||
783 | handle_t *handle; | ||
784 | int ret; | ||
785 | |||
786 | /* We don't expect handle for direct IO */ | ||
787 | WARN_ON_ONCE(ext4_journal_current_handle()); | ||
788 | |||
789 | if (create) { | ||
790 | handle = start_dio_trans(inode, bh); | ||
791 | if (IS_ERR(handle)) | ||
792 | return PTR_ERR(handle); | ||
793 | } | ||
794 | ret = _ext4_get_block(inode, iblock, bh, | ||
795 | create ? EXT4_GET_BLOCKS_CREATE : 0); | ||
796 | if (create) | ||
797 | ext4_journal_stop(handle); | ||
798 | return ret; | ||
799 | } | ||
800 | |||
801 | /* | ||
802 | * Get block function for AIO DIO writes when we create unwritten extent if | ||
803 | * blocks are not allocated yet. The extent will be converted to written | ||
804 | * after IO is complete. | ||
805 | */ | ||
806 | static int ext4_dio_get_block_unwritten_async(struct inode *inode, | ||
807 | sector_t iblock, struct buffer_head *bh_result, int create) | ||
808 | { | ||
809 | handle_t *handle; | ||
810 | int ret; | ||
811 | |||
812 | /* We don't expect handle for direct IO */ | ||
813 | WARN_ON_ONCE(ext4_journal_current_handle()); | ||
814 | |||
815 | handle = start_dio_trans(inode, bh_result); | ||
816 | if (IS_ERR(handle)) | ||
817 | return PTR_ERR(handle); | ||
818 | ret = _ext4_get_block(inode, iblock, bh_result, | ||
819 | EXT4_GET_BLOCKS_IO_CREATE_EXT); | ||
820 | ext4_journal_stop(handle); | ||
821 | |||
822 | /* | ||
823 | * When doing DIO using unwritten extents, we need io_end to convert | ||
824 | * unwritten extents to written on IO completion. We allocate io_end | ||
825 | * once we spot unwritten extent and store it in b_private. Generic | ||
826 | * DIO code keeps b_private set and furthermore passes the value to | ||
827 | * our completion callback in 'private' argument. | ||
828 | */ | ||
829 | if (!ret && buffer_unwritten(bh_result)) { | ||
830 | if (!bh_result->b_private) { | ||
831 | ext4_io_end_t *io_end; | ||
832 | |||
833 | io_end = ext4_init_io_end(inode, GFP_KERNEL); | ||
834 | if (!io_end) | ||
835 | return -ENOMEM; | ||
836 | bh_result->b_private = io_end; | ||
837 | ext4_set_io_unwritten_flag(inode, io_end); | ||
838 | } | ||
839 | set_buffer_defer_completion(bh_result); | ||
840 | } | ||
841 | |||
842 | return ret; | ||
843 | } | ||
844 | |||
845 | /* | ||
846 | * Get block function for non-AIO DIO writes when we create unwritten extent if | ||
847 | * blocks are not allocated yet. The extent will be converted to written | ||
848 | * after IO is complete from ext4_ext_direct_IO() function. | ||
849 | */ | ||
850 | static int ext4_dio_get_block_unwritten_sync(struct inode *inode, | ||
851 | sector_t iblock, struct buffer_head *bh_result, int create) | ||
852 | { | ||
853 | handle_t *handle; | ||
854 | int ret; | ||
855 | |||
856 | /* We don't expect handle for direct IO */ | ||
857 | WARN_ON_ONCE(ext4_journal_current_handle()); | ||
858 | |||
859 | handle = start_dio_trans(inode, bh_result); | ||
860 | if (IS_ERR(handle)) | ||
861 | return PTR_ERR(handle); | ||
862 | ret = _ext4_get_block(inode, iblock, bh_result, | ||
863 | EXT4_GET_BLOCKS_IO_CREATE_EXT); | ||
864 | ext4_journal_stop(handle); | ||
865 | |||
866 | /* | ||
867 | * Mark inode as having pending DIO writes to unwritten extents. | ||
868 | * ext4_ext_direct_IO() checks this flag and converts extents to | ||
869 | * written. | ||
870 | */ | ||
871 | if (!ret && buffer_unwritten(bh_result)) | ||
872 | ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); | ||
873 | |||
874 | return ret; | ||
875 | } | ||
876 | |||
877 | static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock, | ||
878 | struct buffer_head *bh_result, int create) | ||
879 | { | ||
880 | int ret; | ||
881 | |||
882 | ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n", | ||
883 | inode->i_ino, create); | ||
884 | /* We don't expect handle for direct IO */ | ||
885 | WARN_ON_ONCE(ext4_journal_current_handle()); | ||
886 | |||
887 | ret = _ext4_get_block(inode, iblock, bh_result, 0); | ||
888 | /* | ||
889 | * Blocks should have been preallocated! ext4_file_write_iter() checks | ||
890 | * that. | ||
891 | */ | ||
892 | WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result)); | ||
893 | |||
894 | return ret; | ||
895 | } | ||
896 | |||
897 | |||
898 | /* | ||
772 | * `handle' can be NULL if create is zero | 899 | * `handle' can be NULL if create is zero |
773 | */ | 900 | */ |
774 | struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, | 901 | struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, |
@@ -1079,13 +1206,14 @@ retry_journal: | |||
1079 | #ifdef CONFIG_EXT4_FS_ENCRYPTION | 1206 | #ifdef CONFIG_EXT4_FS_ENCRYPTION |
1080 | if (ext4_should_dioread_nolock(inode)) | 1207 | if (ext4_should_dioread_nolock(inode)) |
1081 | ret = ext4_block_write_begin(page, pos, len, | 1208 | ret = ext4_block_write_begin(page, pos, len, |
1082 | ext4_get_block_write); | 1209 | ext4_get_block_unwritten); |
1083 | else | 1210 | else |
1084 | ret = ext4_block_write_begin(page, pos, len, | 1211 | ret = ext4_block_write_begin(page, pos, len, |
1085 | ext4_get_block); | 1212 | ext4_get_block); |
1086 | #else | 1213 | #else |
1087 | if (ext4_should_dioread_nolock(inode)) | 1214 | if (ext4_should_dioread_nolock(inode)) |
1088 | ret = __block_write_begin(page, pos, len, ext4_get_block_write); | 1215 | ret = __block_write_begin(page, pos, len, |
1216 | ext4_get_block_unwritten); | ||
1089 | else | 1217 | else |
1090 | ret = __block_write_begin(page, pos, len, ext4_get_block); | 1218 | ret = __block_write_begin(page, pos, len, ext4_get_block); |
1091 | #endif | 1219 | #endif |
@@ -3088,37 +3216,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) | |||
3088 | return try_to_free_buffers(page); | 3216 | return try_to_free_buffers(page); |
3089 | } | 3217 | } |
3090 | 3218 | ||
3091 | /* | ||
3092 | * ext4_get_block used when preparing for a DIO write or buffer write. | ||
3093 | * We allocate an uinitialized extent if blocks haven't been allocated. | ||
3094 | * The extent will be converted to initialized after the IO is complete. | ||
3095 | */ | ||
3096 | int ext4_get_block_write(struct inode *inode, sector_t iblock, | ||
3097 | struct buffer_head *bh_result, int create) | ||
3098 | { | ||
3099 | ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", | ||
3100 | inode->i_ino, create); | ||
3101 | return _ext4_get_block(inode, iblock, bh_result, | ||
3102 | EXT4_GET_BLOCKS_IO_CREATE_EXT); | ||
3103 | } | ||
3104 | |||
3105 | static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock, | ||
3106 | struct buffer_head *bh_result, int create) | ||
3107 | { | ||
3108 | int ret; | ||
3109 | |||
3110 | ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n", | ||
3111 | inode->i_ino, create); | ||
3112 | ret = _ext4_get_block(inode, iblock, bh_result, 0); | ||
3113 | /* | ||
3114 | * Blocks should have been preallocated! ext4_file_write_iter() checks | ||
3115 | * that. | ||
3116 | */ | ||
3117 | WARN_ON_ONCE(!buffer_mapped(bh_result)); | ||
3118 | |||
3119 | return ret; | ||
3120 | } | ||
3121 | |||
3122 | #ifdef CONFIG_FS_DAX | 3219 | #ifdef CONFIG_FS_DAX |
3123 | int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, | 3220 | int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, |
3124 | struct buffer_head *bh_result, int create) | 3221 | struct buffer_head *bh_result, int create) |
@@ -3179,13 +3276,12 @@ out: | |||
3179 | WARN_ON_ONCE(ret == 0 && create); | 3276 | WARN_ON_ONCE(ret == 0 && create); |
3180 | if (ret > 0) { | 3277 | if (ret > 0) { |
3181 | map_bh(bh_result, inode->i_sb, map.m_pblk); | 3278 | map_bh(bh_result, inode->i_sb, map.m_pblk); |
3182 | bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | | ||
3183 | map.m_flags; | ||
3184 | /* | 3279 | /* |
3185 | * At least for now we have to clear BH_New so that DAX code | 3280 | * At least for now we have to clear BH_New so that DAX code |
3186 | * doesn't attempt to zero blocks again in a racy way. | 3281 | * doesn't attempt to zero blocks again in a racy way. |
3187 | */ | 3282 | */ |
3188 | bh_result->b_state &= ~(1 << BH_New); | 3283 | map.m_flags &= ~EXT4_MAP_NEW; |
3284 | ext4_update_bh_state(bh_result, map.m_flags); | ||
3189 | bh_result->b_size = map.m_len << inode->i_blkbits; | 3285 | bh_result->b_size = map.m_len << inode->i_blkbits; |
3190 | ret = 0; | 3286 | ret = 0; |
3191 | } | 3287 | } |
@@ -3196,7 +3292,7 @@ out: | |||
3196 | static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | 3292 | static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, |
3197 | ssize_t size, void *private) | 3293 | ssize_t size, void *private) |
3198 | { | 3294 | { |
3199 | ext4_io_end_t *io_end = iocb->private; | 3295 | ext4_io_end_t *io_end = private; |
3200 | 3296 | ||
3201 | /* if not async direct IO just return */ | 3297 | /* if not async direct IO just return */ |
3202 | if (!io_end) | 3298 | if (!io_end) |
@@ -3204,10 +3300,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
3204 | 3300 | ||
3205 | ext_debug("ext4_end_io_dio(): io_end 0x%p " | 3301 | ext_debug("ext4_end_io_dio(): io_end 0x%p " |
3206 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", | 3302 | "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", |
3207 | iocb->private, io_end->inode->i_ino, iocb, offset, | 3303 | io_end, io_end->inode->i_ino, iocb, offset, size); |
3208 | size); | ||
3209 | 3304 | ||
3210 | iocb->private = NULL; | ||
3211 | io_end->offset = offset; | 3305 | io_end->offset = offset; |
3212 | io_end->size = size; | 3306 | io_end->size = size; |
3213 | ext4_put_io_end(io_end); | 3307 | ext4_put_io_end(io_end); |
@@ -3243,7 +3337,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
3243 | get_block_t *get_block_func = NULL; | 3337 | get_block_t *get_block_func = NULL; |
3244 | int dio_flags = 0; | 3338 | int dio_flags = 0; |
3245 | loff_t final_size = offset + count; | 3339 | loff_t final_size = offset + count; |
3246 | ext4_io_end_t *io_end = NULL; | ||
3247 | 3340 | ||
3248 | /* Use the old path for reads and writes beyond i_size. */ | 3341 | /* Use the old path for reads and writes beyond i_size. */ |
3249 | if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) | 3342 | if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) |
@@ -3268,16 +3361,17 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
3268 | /* | 3361 | /* |
3269 | * We could direct write to holes and fallocate. | 3362 | * We could direct write to holes and fallocate. |
3270 | * | 3363 | * |
3271 | * Allocated blocks to fill the hole are marked as | 3364 | * Allocated blocks to fill the hole are marked as unwritten to prevent |
3272 | * unwritten to prevent parallel buffered read to expose | 3365 | * parallel buffered read to expose the stale data before DIO complete |
3273 | * the stale data before DIO complete the data IO. | 3366 | * the data IO. |
3274 | * | 3367 | * |
3275 | * As to previously fallocated extents, ext4 get_block will | 3368 | * As to previously fallocated extents, ext4 get_block will just simply |
3276 | * just simply mark the buffer mapped but still keep the | 3369 | * mark the buffer mapped but still keep the extents unwritten. |
3277 | * extents unwritten. | ||
3278 | * | 3370 | * |
3279 | * For non AIO case, we will convert those unwritten extents | 3371 | * For non AIO case, we will convert those unwritten extents to written |
3280 | * to written after return back from blockdev_direct_IO. | 3372 | * after return back from blockdev_direct_IO. That way we save us from |
3373 | * allocating io_end structure and also the overhead of offloading | ||
3374 | * the extent conversion to a workqueue. | ||
3281 | * | 3375 | * |
3282 | * For async DIO, the conversion needs to be deferred when the | 3376 | * For async DIO, the conversion needs to be deferred when the |
3283 | * IO is completed. The ext4 end_io callback function will be | 3377 | * IO is completed. The ext4 end_io callback function will be |
@@ -3285,30 +3379,13 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
3285 | * case, we allocate an io_end structure to hook to the iocb. | 3379 | * case, we allocate an io_end structure to hook to the iocb. |
3286 | */ | 3380 | */ |
3287 | iocb->private = NULL; | 3381 | iocb->private = NULL; |
3288 | if (overwrite) { | 3382 | if (overwrite) |
3289 | get_block_func = ext4_get_block_overwrite; | 3383 | get_block_func = ext4_dio_get_block_overwrite; |
3384 | else if (is_sync_kiocb(iocb)) { | ||
3385 | get_block_func = ext4_dio_get_block_unwritten_sync; | ||
3386 | dio_flags = DIO_LOCKING; | ||
3290 | } else { | 3387 | } else { |
3291 | ext4_inode_aio_set(inode, NULL); | 3388 | get_block_func = ext4_dio_get_block_unwritten_async; |
3292 | if (!is_sync_kiocb(iocb)) { | ||
3293 | io_end = ext4_init_io_end(inode, GFP_NOFS); | ||
3294 | if (!io_end) { | ||
3295 | ret = -ENOMEM; | ||
3296 | goto retake_lock; | ||
3297 | } | ||
3298 | /* | ||
3299 | * Grab reference for DIO. Will be dropped in | ||
3300 | * ext4_end_io_dio() | ||
3301 | */ | ||
3302 | iocb->private = ext4_get_io_end(io_end); | ||
3303 | /* | ||
3304 | * we save the io structure for current async direct | ||
3305 | * IO, so that later ext4_map_blocks() could flag the | ||
3306 | * io structure whether there is a unwritten extents | ||
3307 | * needs to be converted when IO is completed. | ||
3308 | */ | ||
3309 | ext4_inode_aio_set(inode, io_end); | ||
3310 | } | ||
3311 | get_block_func = ext4_get_block_write; | ||
3312 | dio_flags = DIO_LOCKING; | 3389 | dio_flags = DIO_LOCKING; |
3313 | } | 3390 | } |
3314 | #ifdef CONFIG_EXT4_FS_ENCRYPTION | 3391 | #ifdef CONFIG_EXT4_FS_ENCRYPTION |
@@ -3323,27 +3400,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
3323 | get_block_func, | 3400 | get_block_func, |
3324 | ext4_end_io_dio, NULL, dio_flags); | 3401 | ext4_end_io_dio, NULL, dio_flags); |
3325 | 3402 | ||
3326 | /* | ||
3327 | * Put our reference to io_end. This can free the io_end structure e.g. | ||
3328 | * in sync IO case or in case of error. It can even perform extent | ||
3329 | * conversion if all bios we submitted finished before we got here. | ||
3330 | * Note that in that case iocb->private can be already set to NULL | ||
3331 | * here. | ||
3332 | */ | ||
3333 | if (io_end) { | ||
3334 | ext4_inode_aio_set(inode, NULL); | ||
3335 | ext4_put_io_end(io_end); | ||
3336 | /* | ||
3337 | * When no IO was submitted ext4_end_io_dio() was not | ||
3338 | * called so we have to put iocb's reference. | ||
3339 | */ | ||
3340 | if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) { | ||
3341 | WARN_ON(iocb->private != io_end); | ||
3342 | WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); | ||
3343 | ext4_put_io_end(io_end); | ||
3344 | iocb->private = NULL; | ||
3345 | } | ||
3346 | } | ||
3347 | if (ret > 0 && !overwrite && ext4_test_inode_state(inode, | 3403 | if (ret > 0 && !overwrite && ext4_test_inode_state(inode, |
3348 | EXT4_STATE_DIO_UNWRITTEN)) { | 3404 | EXT4_STATE_DIO_UNWRITTEN)) { |
3349 | int err; | 3405 | int err; |
@@ -3358,7 +3414,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, | |||
3358 | ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); | 3414 | ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); |
3359 | } | 3415 | } |
3360 | 3416 | ||
3361 | retake_lock: | ||
3362 | if (iov_iter_rw(iter) == WRITE) | 3417 | if (iov_iter_rw(iter) == WRITE) |
3363 | inode_dio_end(inode); | 3418 | inode_dio_end(inode); |
3364 | /* take i_mutex locking again if we do a ovewrite dio */ | 3419 | /* take i_mutex locking again if we do a ovewrite dio */ |
@@ -5261,6 +5316,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) | |||
5261 | might_sleep(); | 5316 | might_sleep(); |
5262 | trace_ext4_mark_inode_dirty(inode, _RET_IP_); | 5317 | trace_ext4_mark_inode_dirty(inode, _RET_IP_); |
5263 | err = ext4_reserve_inode_write(handle, inode, &iloc); | 5318 | err = ext4_reserve_inode_write(handle, inode, &iloc); |
5319 | if (err) | ||
5320 | return err; | ||
5264 | if (ext4_handle_valid(handle) && | 5321 | if (ext4_handle_valid(handle) && |
5265 | EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && | 5322 | EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && |
5266 | !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { | 5323 | !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { |
@@ -5291,9 +5348,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) | |||
5291 | } | 5348 | } |
5292 | } | 5349 | } |
5293 | } | 5350 | } |
5294 | if (!err) | 5351 | return ext4_mark_iloc_dirty(handle, inode, &iloc); |
5295 | err = ext4_mark_iloc_dirty(handle, inode, &iloc); | ||
5296 | return err; | ||
5297 | } | 5352 | } |
5298 | 5353 | ||
5299 | /* | 5354 | /* |
@@ -5502,7 +5557,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
5502 | unlock_page(page); | 5557 | unlock_page(page); |
5503 | /* OK, we need to fill the hole... */ | 5558 | /* OK, we need to fill the hole... */ |
5504 | if (ext4_should_dioread_nolock(inode)) | 5559 | if (ext4_should_dioread_nolock(inode)) |
5505 | get_block = ext4_get_block_write; | 5560 | get_block = ext4_get_block_unwritten; |
5506 | else | 5561 | else |
5507 | get_block = ext4_get_block; | 5562 | get_block = ext4_get_block; |
5508 | retry_alloc: | 5563 | retry_alloc: |
@@ -5545,3 +5600,70 @@ int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
5545 | 5600 | ||
5546 | return err; | 5601 | return err; |
5547 | } | 5602 | } |
5603 | |||
5604 | /* | ||
5605 | * Find the first extent at or after @lblk in an inode that is not a hole. | ||
5606 | * Search for @map_len blocks at most. The extent is returned in @result. | ||
5607 | * | ||
5608 | * The function returns 1 if we found an extent. The function returns 0 in | ||
5609 | * case there is no extent at or after @lblk and in that case also sets | ||
5610 | * @result->es_len to 0. In case of error, the error code is returned. | ||
5611 | */ | ||
5612 | int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, | ||
5613 | unsigned int map_len, struct extent_status *result) | ||
5614 | { | ||
5615 | struct ext4_map_blocks map; | ||
5616 | struct extent_status es = {}; | ||
5617 | int ret; | ||
5618 | |||
5619 | map.m_lblk = lblk; | ||
5620 | map.m_len = map_len; | ||
5621 | |||
5622 | /* | ||
5623 | * For non-extent based files this loop may iterate several times since | ||
5624 | * we do not determine full hole size. | ||
5625 | */ | ||
5626 | while (map.m_len > 0) { | ||
5627 | ret = ext4_map_blocks(NULL, inode, &map, 0); | ||
5628 | if (ret < 0) | ||
5629 | return ret; | ||
5630 | /* There's extent covering m_lblk? Just return it. */ | ||
5631 | if (ret > 0) { | ||
5632 | int status; | ||
5633 | |||
5634 | ext4_es_store_pblock(result, map.m_pblk); | ||
5635 | result->es_lblk = map.m_lblk; | ||
5636 | result->es_len = map.m_len; | ||
5637 | if (map.m_flags & EXT4_MAP_UNWRITTEN) | ||
5638 | status = EXTENT_STATUS_UNWRITTEN; | ||
5639 | else | ||
5640 | status = EXTENT_STATUS_WRITTEN; | ||
5641 | ext4_es_store_status(result, status); | ||
5642 | return 1; | ||
5643 | } | ||
5644 | ext4_es_find_delayed_extent_range(inode, map.m_lblk, | ||
5645 | map.m_lblk + map.m_len - 1, | ||
5646 | &es); | ||
5647 | /* Is delalloc data before next block in extent tree? */ | ||
5648 | if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) { | ||
5649 | ext4_lblk_t offset = 0; | ||
5650 | |||
5651 | if (es.es_lblk < lblk) | ||
5652 | offset = lblk - es.es_lblk; | ||
5653 | result->es_lblk = es.es_lblk + offset; | ||
5654 | ext4_es_store_pblock(result, | ||
5655 | ext4_es_pblock(&es) + offset); | ||
5656 | result->es_len = es.es_len - offset; | ||
5657 | ext4_es_store_status(result, ext4_es_status(&es)); | ||
5658 | |||
5659 | return 1; | ||
5660 | } | ||
5661 | /* There's a hole at m_lblk, advance us after it */ | ||
5662 | map.m_lblk += map.m_len; | ||
5663 | map_len -= map.m_len; | ||
5664 | map.m_len = map_len; | ||
5665 | cond_resched(); | ||
5666 | } | ||
5667 | result->es_len = 0; | ||
5668 | return 0; | ||
5669 | } | ||
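The newly added ext4_get_next_extent() returns 1 with the extent stored in @result, 0 when there is no extent at or after @lblk (with @result->es_len set to 0), or a negative error code. The stand-alone analogue below illustrates how the reworked seek loops consume that convention; it is not kernel code, and all names and the backing extent map are invented for illustration:

/*
 * Stand-alone analogue of the ext4_get_next_extent() return convention:
 * > 0 means an extent was stored in *result, 0 means no extent at or
 * after lblk, < 0 would be -errno.
 */
#include <stdio.h>

struct demo_extent {
	unsigned int lblk;	/* first logical block */
	unsigned int len;	/* length in blocks */
};

/* A made-up extent map: data at blocks [2,4) and [10,13). */
static const struct demo_extent map[] = { { 2, 2 }, { 10, 3 } };

static int demo_get_next_extent(unsigned int lblk, struct demo_extent *result)
{
	unsigned int i;

	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
		if (map[i].lblk + map[i].len > lblk) {
			*result = map[i];
			return 1;	/* extent found */
		}
	}
	result->len = 0;
	return 0;			/* no extent at or after lblk */
}

int main(void)
{
	struct demo_extent es;
	unsigned int last = 0;

	/* Same shape as the new ext4_seek_data()/ext4_seek_hole() loops. */
	while (demo_get_next_extent(last, &es) > 0) {
		printf("extent at block %u, %u blocks\n", es.lblk, es.len);
		last = es.lblk + es.len;
	}
	return 0;
}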
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4424b7bf8ac6..50e05df28f66 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -11,7 +11,7 @@ | |||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | * GNU General Public License for more details. | 12 | * GNU General Public License for more details. |
13 | * | 13 | * |
14 | * You should have received a copy of the GNU General Public Licens | 14 | * You should have received a copy of the GNU General Public License |
15 | * along with this program; if not, write to the Free Software | 15 | * along with this program; if not, write to the Free Software |
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- | 16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- |
17 | */ | 17 | */ |
@@ -815,7 +815,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b) | |||
815 | * for this page; do not hold this lock when calling this routine! | 815 | * for this page; do not hold this lock when calling this routine! |
816 | */ | 816 | */ |
817 | 817 | ||
818 | static int ext4_mb_init_cache(struct page *page, char *incore) | 818 | static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) |
819 | { | 819 | { |
820 | ext4_group_t ngroups; | 820 | ext4_group_t ngroups; |
821 | int blocksize; | 821 | int blocksize; |
@@ -848,7 +848,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
848 | /* allocate buffer_heads to read bitmaps */ | 848 | /* allocate buffer_heads to read bitmaps */ |
849 | if (groups_per_page > 1) { | 849 | if (groups_per_page > 1) { |
850 | i = sizeof(struct buffer_head *) * groups_per_page; | 850 | i = sizeof(struct buffer_head *) * groups_per_page; |
851 | bh = kzalloc(i, GFP_NOFS); | 851 | bh = kzalloc(i, gfp); |
852 | if (bh == NULL) { | 852 | if (bh == NULL) { |
853 | err = -ENOMEM; | 853 | err = -ENOMEM; |
854 | goto out; | 854 | goto out; |
@@ -983,7 +983,7 @@ out: | |||
983 | * are on the same page e4b->bd_buddy_page is NULL and return value is 0. | 983 | * are on the same page e4b->bd_buddy_page is NULL and return value is 0. |
984 | */ | 984 | */ |
985 | static int ext4_mb_get_buddy_page_lock(struct super_block *sb, | 985 | static int ext4_mb_get_buddy_page_lock(struct super_block *sb, |
986 | ext4_group_t group, struct ext4_buddy *e4b) | 986 | ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) |
987 | { | 987 | { |
988 | struct inode *inode = EXT4_SB(sb)->s_buddy_cache; | 988 | struct inode *inode = EXT4_SB(sb)->s_buddy_cache; |
989 | int block, pnum, poff; | 989 | int block, pnum, poff; |
@@ -1002,7 +1002,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, | |||
1002 | block = group * 2; | 1002 | block = group * 2; |
1003 | pnum = block / blocks_per_page; | 1003 | pnum = block / blocks_per_page; |
1004 | poff = block % blocks_per_page; | 1004 | poff = block % blocks_per_page; |
1005 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | 1005 | page = find_or_create_page(inode->i_mapping, pnum, gfp); |
1006 | if (!page) | 1006 | if (!page) |
1007 | return -ENOMEM; | 1007 | return -ENOMEM; |
1008 | BUG_ON(page->mapping != inode->i_mapping); | 1008 | BUG_ON(page->mapping != inode->i_mapping); |
@@ -1016,7 +1016,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, | |||
1016 | 1016 | ||
1017 | block++; | 1017 | block++; |
1018 | pnum = block / blocks_per_page; | 1018 | pnum = block / blocks_per_page; |
1019 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | 1019 | page = find_or_create_page(inode->i_mapping, pnum, gfp); |
1020 | if (!page) | 1020 | if (!page) |
1021 | return -ENOMEM; | 1021 | return -ENOMEM; |
1022 | BUG_ON(page->mapping != inode->i_mapping); | 1022 | BUG_ON(page->mapping != inode->i_mapping); |
@@ -1042,7 +1042,7 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) | |||
1042 | * calling this routine! | 1042 | * calling this routine! |
1043 | */ | 1043 | */ |
1044 | static noinline_for_stack | 1044 | static noinline_for_stack |
1045 | int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | 1045 | int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) |
1046 | { | 1046 | { |
1047 | 1047 | ||
1048 | struct ext4_group_info *this_grp; | 1048 | struct ext4_group_info *this_grp; |
@@ -1062,7 +1062,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | |||
1062 | * The call to ext4_mb_get_buddy_page_lock will mark the | 1062 | * The call to ext4_mb_get_buddy_page_lock will mark the |
1063 | * page accessed. | 1063 | * page accessed. |
1064 | */ | 1064 | */ |
1065 | ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); | 1065 | ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); |
1066 | if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { | 1066 | if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { |
1067 | /* | 1067 | /* |
1068 | * somebody initialized the group | 1068 | * somebody initialized the group |
@@ -1072,7 +1072,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | |||
1072 | } | 1072 | } |
1073 | 1073 | ||
1074 | page = e4b.bd_bitmap_page; | 1074 | page = e4b.bd_bitmap_page; |
1075 | ret = ext4_mb_init_cache(page, NULL); | 1075 | ret = ext4_mb_init_cache(page, NULL, gfp); |
1076 | if (ret) | 1076 | if (ret) |
1077 | goto err; | 1077 | goto err; |
1078 | if (!PageUptodate(page)) { | 1078 | if (!PageUptodate(page)) { |
@@ -1091,7 +1091,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | |||
1091 | } | 1091 | } |
1092 | /* init buddy cache */ | 1092 | /* init buddy cache */ |
1093 | page = e4b.bd_buddy_page; | 1093 | page = e4b.bd_buddy_page; |
1094 | ret = ext4_mb_init_cache(page, e4b.bd_bitmap); | 1094 | ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); |
1095 | if (ret) | 1095 | if (ret) |
1096 | goto err; | 1096 | goto err; |
1097 | if (!PageUptodate(page)) { | 1097 | if (!PageUptodate(page)) { |
@@ -1109,8 +1109,8 @@ err: | |||
1109 | * calling this routine! | 1109 | * calling this routine! |
1110 | */ | 1110 | */ |
1111 | static noinline_for_stack int | 1111 | static noinline_for_stack int |
1112 | ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | 1112 | ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, |
1113 | struct ext4_buddy *e4b) | 1113 | struct ext4_buddy *e4b, gfp_t gfp) |
1114 | { | 1114 | { |
1115 | int blocks_per_page; | 1115 | int blocks_per_page; |
1116 | int block; | 1116 | int block; |
@@ -1140,7 +1140,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
1140 | * we need full data about the group | 1140 | * we need full data about the group |
1141 | * to make a good selection | 1141 | * to make a good selection |
1142 | */ | 1142 | */ |
1143 | ret = ext4_mb_init_group(sb, group); | 1143 | ret = ext4_mb_init_group(sb, group, gfp); |
1144 | if (ret) | 1144 | if (ret) |
1145 | return ret; | 1145 | return ret; |
1146 | } | 1146 | } |
@@ -1168,11 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
1168 | * wait for it to initialize. | 1168 | * wait for it to initialize. |
1169 | */ | 1169 | */ |
1170 | page_cache_release(page); | 1170 | page_cache_release(page); |
1171 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | 1171 | page = find_or_create_page(inode->i_mapping, pnum, gfp); |
1172 | if (page) { | 1172 | if (page) { |
1173 | BUG_ON(page->mapping != inode->i_mapping); | 1173 | BUG_ON(page->mapping != inode->i_mapping); |
1174 | if (!PageUptodate(page)) { | 1174 | if (!PageUptodate(page)) { |
1175 | ret = ext4_mb_init_cache(page, NULL); | 1175 | ret = ext4_mb_init_cache(page, NULL, gfp); |
1176 | if (ret) { | 1176 | if (ret) { |
1177 | unlock_page(page); | 1177 | unlock_page(page); |
1178 | goto err; | 1178 | goto err; |
@@ -1204,11 +1204,12 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
1204 | if (page == NULL || !PageUptodate(page)) { | 1204 | if (page == NULL || !PageUptodate(page)) { |
1205 | if (page) | 1205 | if (page) |
1206 | page_cache_release(page); | 1206 | page_cache_release(page); |
1207 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | 1207 | page = find_or_create_page(inode->i_mapping, pnum, gfp); |
1208 | if (page) { | 1208 | if (page) { |
1209 | BUG_ON(page->mapping != inode->i_mapping); | 1209 | BUG_ON(page->mapping != inode->i_mapping); |
1210 | if (!PageUptodate(page)) { | 1210 | if (!PageUptodate(page)) { |
1211 | ret = ext4_mb_init_cache(page, e4b->bd_bitmap); | 1211 | ret = ext4_mb_init_cache(page, e4b->bd_bitmap, |
1212 | gfp); | ||
1212 | if (ret) { | 1213 | if (ret) { |
1213 | unlock_page(page); | 1214 | unlock_page(page); |
1214 | goto err; | 1215 | goto err; |
@@ -1247,6 +1248,12 @@ err: | |||
1247 | return ret; | 1248 | return ret; |
1248 | } | 1249 | } |
1249 | 1250 | ||
1251 | static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | ||
1252 | struct ext4_buddy *e4b) | ||
1253 | { | ||
1254 | return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); | ||
1255 | } | ||
1256 | |||
1250 | static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) | 1257 | static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) |
1251 | { | 1258 | { |
1252 | if (e4b->bd_bitmap_page) | 1259 | if (e4b->bd_bitmap_page) |
@@ -2045,7 +2052,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, | |||
2045 | 2052 | ||
2046 | /* We only do this if the grp has never been initialized */ | 2053 | /* We only do this if the grp has never been initialized */ |
2047 | if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { | 2054 | if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { |
2048 | int ret = ext4_mb_init_group(ac->ac_sb, group); | 2055 | int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); |
2049 | if (ret) | 2056 | if (ret) |
2050 | return ret; | 2057 | return ret; |
2051 | } | 2058 | } |
@@ -4695,16 +4702,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, | |||
4695 | } | 4702 | } |
4696 | 4703 | ||
4697 | /* | 4704 | /* |
4698 | * We need to make sure we don't reuse the freed block until | ||
4699 | * after the transaction is committed, which we can do by | ||
4700 | * treating the block as metadata, below. We make an | ||
4701 | * exception if the inode is to be written in writeback mode | ||
4702 | * since writeback mode has weak data consistency guarantees. | ||
4703 | */ | ||
4704 | if (!ext4_should_writeback_data(inode)) | ||
4705 | flags |= EXT4_FREE_BLOCKS_METADATA; | ||
4706 | |||
4707 | /* | ||
4708 | * If the extent to be freed does not begin on a cluster | 4705 | * If the extent to be freed does not begin on a cluster |
4709 | * boundary, we need to deal with partial clusters at the | 4706 | * boundary, we need to deal with partial clusters at the |
4710 | * beginning and end of the extent. Normally we will free | 4707 | * beginning and end of the extent. Normally we will free |
@@ -4738,14 +4735,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, | |||
4738 | 4735 | ||
4739 | if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { | 4736 | if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { |
4740 | int i; | 4737 | int i; |
4738 | int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA; | ||
4741 | 4739 | ||
4742 | for (i = 0; i < count; i++) { | 4740 | for (i = 0; i < count; i++) { |
4743 | cond_resched(); | 4741 | cond_resched(); |
4744 | bh = sb_find_get_block(inode->i_sb, block + i); | 4742 | if (is_metadata) |
4745 | if (!bh) | 4743 | bh = sb_find_get_block(inode->i_sb, block + i); |
4746 | continue; | 4744 | ext4_forget(handle, is_metadata, inode, bh, block + i); |
4747 | ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, | ||
4748 | inode, bh, block + i); | ||
4749 | } | 4745 | } |
4750 | } | 4746 | } |
4751 | 4747 | ||
@@ -4815,16 +4811,23 @@ do_more: | |||
4815 | #endif | 4811 | #endif |
4816 | trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); | 4812 | trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); |
4817 | 4813 | ||
4818 | err = ext4_mb_load_buddy(sb, block_group, &e4b); | 4814 | /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ |
4815 | err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, | ||
4816 | GFP_NOFS|__GFP_NOFAIL); | ||
4819 | if (err) | 4817 | if (err) |
4820 | goto error_return; | 4818 | goto error_return; |
4821 | 4819 | ||
4822 | if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { | 4820 | /* |
4821 | * We need to make sure we don't reuse the freed block until after the | ||
4822 | * transaction is committed. We make an exception if the inode is to be | ||
4823 | * written in writeback mode since writeback mode has weak data | ||
4824 | * consistency guarantees. | ||
4825 | */ | ||
4826 | if (ext4_handle_valid(handle) && | ||
4827 | ((flags & EXT4_FREE_BLOCKS_METADATA) || | ||
4828 | !ext4_should_writeback_data(inode))) { | ||
4823 | struct ext4_free_data *new_entry; | 4829 | struct ext4_free_data *new_entry; |
4824 | /* | 4830 | /* |
4825 | * blocks being freed are metadata. these blocks shouldn't | ||
4826 | * be used until this transaction is committed | ||
4827 | * | ||
4828 | * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed | 4831 | * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed |
4829 | * to fail. | 4832 | * to fail. |
4830 | */ | 4833 | */ |
@@ -5217,7 +5220,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
5217 | grp = ext4_get_group_info(sb, group); | 5220 | grp = ext4_get_group_info(sb, group); |
5218 | /* We only do this if the grp has never been initialized */ | 5221 | /* We only do this if the grp has never been initialized */ |
5219 | if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { | 5222 | if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { |
5220 | ret = ext4_mb_init_group(sb, group); | 5223 | ret = ext4_mb_init_group(sb, group, GFP_NOFS); |
5221 | if (ret) | 5224 | if (ret) |
5222 | break; | 5225 | break; |
5223 | } | 5226 | } |
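The mballoc changes above follow a common plumbing pattern: the worker grows an explicit gfp_t argument (ext4_mb_load_buddy_gfp(), ext4_mb_init_cache(), ...) while the old name survives as a thin wrapper passing the previous GFP_NOFS default, so ext4_free_blocks() can opt into GFP_NOFS|__GFP_NOFAIL without touching other callers. A generic sketch of the pattern, with invented names and flag values:

/*
 * Generic sketch: the worker takes explicit flags, the old name keeps
 * the historical default so existing callers stay unchanged.
 */
#include <stdio.h>

#define DEMO_GFP_NOFS    0x1u
#define DEMO_GFP_NOFAIL  0x2u

static int demo_load_buddy_gfp(int group, unsigned int gfp)
{
	printf("loading group %d with gfp flags %#x\n", group, gfp);
	return 0;
}

/* Compatibility wrapper keeping the previous default. */
static int demo_load_buddy(int group)
{
	return demo_load_buddy_gfp(group, DEMO_GFP_NOFS);
}

int main(void)
{
	demo_load_buddy(7);					  /* normal path */
	demo_load_buddy_gfp(7, DEMO_GFP_NOFS | DEMO_GFP_NOFAIL); /* free path */
	return 0;
}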
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index d634e183b4d4..3ef1df6ae9ec 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h | |||
@@ -23,18 +23,6 @@ | |||
23 | #include "ext4.h" | 23 | #include "ext4.h" |
24 | 24 | ||
25 | /* | 25 | /* |
26 | * with AGGRESSIVE_CHECK allocator runs consistency checks over | ||
27 | * structures. these checks slow things down a lot | ||
28 | */ | ||
29 | #define AGGRESSIVE_CHECK__ | ||
30 | |||
31 | /* | ||
32 | * with DOUBLE_CHECK defined mballoc creates persistent in-core | ||
33 | * bitmaps, maintains and uses them to check for double allocations | ||
34 | */ | ||
35 | #define DOUBLE_CHECK__ | ||
36 | |||
37 | /* | ||
38 | */ | 26 | */ |
39 | #ifdef CONFIG_EXT4_DEBUG | 27 | #ifdef CONFIG_EXT4_DEBUG |
40 | extern ushort ext4_mballoc_debug; | 28 | extern ushort ext4_mballoc_debug; |
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a4651894cc33..364ea4d4a943 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c | |||
@@ -361,7 +361,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, | |||
361 | * blocks. | 361 | * blocks. |
362 | * | 362 | * |
363 | * While converting to extents we need not | 363 | * While converting to extents we need not |
364 | * update the orignal inode i_blocks for extent blocks | 364 | * update the original inode i_blocks for extent blocks |
365 | * via quota APIs. The quota update happened via tmp_inode already. | 365 | * via quota APIs. The quota update happened via tmp_inode already. |
366 | */ | 366 | */ |
367 | spin_lock(&inode->i_lock); | 367 | spin_lock(&inode->i_lock); |
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 0a512aa81bf7..24445275d330 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c | |||
@@ -91,21 +91,22 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, | |||
91 | submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); | 91 | submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); |
92 | wait_on_buffer(*bh); | 92 | wait_on_buffer(*bh); |
93 | if (!buffer_uptodate(*bh)) { | 93 | if (!buffer_uptodate(*bh)) { |
94 | brelse(*bh); | ||
95 | *bh = NULL; | ||
96 | ret = -EIO; | 94 | ret = -EIO; |
97 | goto warn_exit; | 95 | goto warn_exit; |
98 | } | 96 | } |
99 | |||
100 | mmp = (struct mmp_struct *)((*bh)->b_data); | 97 | mmp = (struct mmp_struct *)((*bh)->b_data); |
101 | if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) | 98 | if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) { |
102 | ret = -EFSCORRUPTED; | 99 | ret = -EFSCORRUPTED; |
103 | else if (!ext4_mmp_csum_verify(sb, mmp)) | 100 | goto warn_exit; |
101 | } | ||
102 | if (!ext4_mmp_csum_verify(sb, mmp)) { | ||
104 | ret = -EFSBADCRC; | 103 | ret = -EFSBADCRC; |
105 | else | 104 | goto warn_exit; |
106 | return 0; | 105 | } |
107 | 106 | return 0; | |
108 | warn_exit: | 107 | warn_exit: |
108 | brelse(*bh); | ||
109 | *bh = NULL; | ||
109 | ext4_warning(sb, "Error %d while reading MMP block %llu", | 110 | ext4_warning(sb, "Error %d while reading MMP block %llu", |
110 | ret, mmp_block); | 111 | ret, mmp_block); |
111 | return ret; | 112 | return ret; |
@@ -181,15 +182,13 @@ static int kmmpd(void *data) | |||
181 | EXT4_FEATURE_INCOMPAT_MMP)) { | 182 | EXT4_FEATURE_INCOMPAT_MMP)) { |
182 | ext4_warning(sb, "kmmpd being stopped since MMP feature" | 183 | ext4_warning(sb, "kmmpd being stopped since MMP feature" |
183 | " has been disabled."); | 184 | " has been disabled."); |
184 | EXT4_SB(sb)->s_mmp_tsk = NULL; | 185 | goto exit_thread; |
185 | goto failed; | ||
186 | } | 186 | } |
187 | 187 | ||
188 | if (sb->s_flags & MS_RDONLY) { | 188 | if (sb->s_flags & MS_RDONLY) { |
189 | ext4_warning(sb, "kmmpd being stopped since filesystem " | 189 | ext4_warning(sb, "kmmpd being stopped since filesystem " |
190 | "has been remounted as readonly."); | 190 | "has been remounted as readonly."); |
191 | EXT4_SB(sb)->s_mmp_tsk = NULL; | 191 | goto exit_thread; |
192 | goto failed; | ||
193 | } | 192 | } |
194 | 193 | ||
195 | diff = jiffies - last_update_time; | 194 | diff = jiffies - last_update_time; |
@@ -211,9 +210,7 @@ static int kmmpd(void *data) | |||
211 | if (retval) { | 210 | if (retval) { |
212 | ext4_error(sb, "error reading MMP data: %d", | 211 | ext4_error(sb, "error reading MMP data: %d", |
213 | retval); | 212 | retval); |
214 | 213 | goto exit_thread; | |
215 | EXT4_SB(sb)->s_mmp_tsk = NULL; | ||
216 | goto failed; | ||
217 | } | 214 | } |
218 | 215 | ||
219 | mmp_check = (struct mmp_struct *)(bh_check->b_data); | 216 | mmp_check = (struct mmp_struct *)(bh_check->b_data); |
@@ -225,7 +222,9 @@ static int kmmpd(void *data) | |||
225 | "The filesystem seems to have been" | 222 | "The filesystem seems to have been" |
226 | " multiply mounted."); | 223 | " multiply mounted."); |
227 | ext4_error(sb, "abort"); | 224 | ext4_error(sb, "abort"); |
228 | goto failed; | 225 | put_bh(bh_check); |
226 | retval = -EBUSY; | ||
227 | goto exit_thread; | ||
229 | } | 228 | } |
230 | put_bh(bh_check); | 229 | put_bh(bh_check); |
231 | } | 230 | } |
@@ -248,7 +247,8 @@ static int kmmpd(void *data) | |||
248 | 247 | ||
249 | retval = write_mmp_block(sb, bh); | 248 | retval = write_mmp_block(sb, bh); |
250 | 249 | ||
251 | failed: | 250 | exit_thread: |
251 | EXT4_SB(sb)->s_mmp_tsk = NULL; | ||
252 | kfree(data); | 252 | kfree(data); |
253 | brelse(bh); | 253 | brelse(bh); |
254 | return retval; | 254 | return retval; |
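The kmmpd() rework above replaces the scattered "EXT4_SB(sb)->s_mmp_tsk = NULL; goto failed;" pairs with a single exit_thread label that clears the task pointer once, on every exit path. A generic sketch of that single-exit clean-up style (an illustration with invented names, not kernel code):

/*
 * Sketch: every exit path funnels through one label that resets the
 * shared state, instead of repeating the reset before each goto.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static void *shared_task;	/* stands in for EXT4_SB(sb)->s_mmp_tsk */

static int demo_thread(void)
{
	char *buf = malloc(64);
	int retval = 0;

	if (!buf) {
		retval = -ENOMEM;
		goto exit_thread;
	}
	/* ... main loop; any failure jumps to the single exit label ... */

exit_thread:
	shared_task = NULL;	/* done exactly once, on every path */
	free(buf);
	return retval;
}

int main(void)
{
	printf("demo_thread() returned %d\n", demo_thread());
	return 0;
}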
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 090b3498638e..349d7aa04fe7 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -128,9 +128,6 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) | |||
128 | BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); | 128 | BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); |
129 | WARN_ON(io_end->handle); | 129 | WARN_ON(io_end->handle); |
130 | 130 | ||
131 | if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) | ||
132 | wake_up_all(ext4_ioend_wq(io_end->inode)); | ||
133 | |||
134 | for (bio = io_end->bio; bio; bio = next_bio) { | 131 | for (bio = io_end->bio; bio; bio = next_bio) { |
135 | next_bio = bio->bi_private; | 132 | next_bio = bio->bi_private; |
136 | ext4_finish_bio(bio); | 133 | ext4_finish_bio(bio); |
@@ -265,7 +262,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) | |||
265 | { | 262 | { |
266 | ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); | 263 | ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); |
267 | if (io) { | 264 | if (io) { |
268 | atomic_inc(&EXT4_I(inode)->i_ioend_count); | ||
269 | io->inode = inode; | 265 | io->inode = inode; |
270 | INIT_LIST_HEAD(&io->list); | 266 | INIT_LIST_HEAD(&io->list); |
271 | atomic_set(&io->count, 1); | 267 | atomic_set(&io->count, 1); |
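With ext4_end_io_dio() now receiving the io_end through the 'private' argument, the per-inode i_ioend_count is gone; the io_end lifetime is governed by its own reference count, taken at ext4_init_io_end() and dropped via ext4_put_io_end(). A plain C analogue of that get/put scheme, with invented names (the kernel version uses atomic_t helpers):

/*
 * Analogue of the io_end reference counting: one reference at creation,
 * each holder drops its own, the last drop frees the object.
 */
#include <stdio.h>
#include <stdlib.h>

struct demo_io_end {
	int count;		/* stands in for atomic_t io->count */
};

static struct demo_io_end *demo_init_io_end(void)
{
	struct demo_io_end *io = calloc(1, sizeof(*io));

	if (io)
		io->count = 1;	/* creator holds the first reference */
	return io;
}

static struct demo_io_end *demo_get_io_end(struct demo_io_end *io)
{
	io->count++;
	return io;
}

static void demo_put_io_end(struct demo_io_end *io)
{
	if (--io->count == 0) {
		printf("last reference dropped, releasing io_end\n");
		free(io);
	}
}

int main(void)
{
	struct demo_io_end *io = demo_init_io_end();

	if (!io)
		return EXIT_FAILURE;
	demo_get_io_end(io);	/* e.g. reference handed to the bio */
	demo_put_io_end(io);	/* submitter's reference */
	demo_put_io_end(io);	/* completion's reference -> freed */
	return 0;
}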
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3ed01ec011d7..99996e9a8f57 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -55,7 +55,6 @@ | |||
55 | 55 | ||
56 | static struct ext4_lazy_init *ext4_li_info; | 56 | static struct ext4_lazy_init *ext4_li_info; |
57 | static struct mutex ext4_li_mtx; | 57 | static struct mutex ext4_li_mtx; |
58 | static int ext4_mballoc_ready; | ||
59 | static struct ratelimit_state ext4_mount_msg_ratelimit; | 58 | static struct ratelimit_state ext4_mount_msg_ratelimit; |
60 | 59 | ||
61 | static int ext4_load_journal(struct super_block *, struct ext4_super_block *, | 60 | static int ext4_load_journal(struct super_block *, struct ext4_super_block *, |
@@ -844,7 +843,6 @@ static void ext4_put_super(struct super_block *sb) | |||
844 | ext4_release_system_zone(sb); | 843 | ext4_release_system_zone(sb); |
845 | ext4_mb_release(sb); | 844 | ext4_mb_release(sb); |
846 | ext4_ext_release(sb); | 845 | ext4_ext_release(sb); |
847 | ext4_xattr_put_super(sb); | ||
848 | 846 | ||
849 | if (!(sb->s_flags & MS_RDONLY)) { | 847 | if (!(sb->s_flags & MS_RDONLY)) { |
850 | ext4_clear_feature_journal_needs_recovery(sb); | 848 | ext4_clear_feature_journal_needs_recovery(sb); |
@@ -944,7 +942,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
944 | spin_lock_init(&ei->i_completed_io_lock); | 942 | spin_lock_init(&ei->i_completed_io_lock); |
945 | ei->i_sync_tid = 0; | 943 | ei->i_sync_tid = 0; |
946 | ei->i_datasync_tid = 0; | 944 | ei->i_datasync_tid = 0; |
947 | atomic_set(&ei->i_ioend_count, 0); | ||
948 | atomic_set(&ei->i_unwritten, 0); | 945 | atomic_set(&ei->i_unwritten, 0); |
949 | INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); | 946 | INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); |
950 | #ifdef CONFIG_EXT4_FS_ENCRYPTION | 947 | #ifdef CONFIG_EXT4_FS_ENCRYPTION |
@@ -1425,9 +1422,9 @@ static const struct mount_opts { | |||
1425 | {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, | 1422 | {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, |
1426 | {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, | 1423 | {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, |
1427 | {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, | 1424 | {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, |
1428 | MOPT_NO_EXT2 | MOPT_SET}, | 1425 | MOPT_NO_EXT2}, |
1429 | {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, | 1426 | {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, |
1430 | MOPT_NO_EXT2 | MOPT_CLEAR}, | 1427 | MOPT_NO_EXT2}, |
1431 | {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, | 1428 | {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET}, |
1432 | {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, | 1429 | {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR}, |
1433 | {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, | 1430 | {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET}, |
@@ -1705,6 +1702,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, | |||
1705 | ext4_msg(sb, KERN_INFO, "dax option not supported"); | 1702 | ext4_msg(sb, KERN_INFO, "dax option not supported"); |
1706 | return -1; | 1703 | return -1; |
1707 | #endif | 1704 | #endif |
1705 | } else if (token == Opt_data_err_abort) { | ||
1706 | sbi->s_mount_opt |= m->mount_opt; | ||
1707 | } else if (token == Opt_data_err_ignore) { | ||
1708 | sbi->s_mount_opt &= ~m->mount_opt; | ||
1708 | } else { | 1709 | } else { |
1709 | if (!args->from) | 1710 | if (!args->from) |
1710 | arg = 1; | 1711 | arg = 1; |
@@ -1914,6 +1915,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, | |||
1914 | SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); | 1915 | SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult); |
1915 | if (nodefs || sbi->s_max_dir_size_kb) | 1916 | if (nodefs || sbi->s_max_dir_size_kb) |
1916 | SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); | 1917 | SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); |
1918 | if (test_opt(sb, DATA_ERR_ABORT)) | ||
1919 | SEQ_OPTS_PUTS("data_err=abort"); | ||
1917 | 1920 | ||
1918 | ext4_show_quota_options(seq, sb); | 1921 | ext4_show_quota_options(seq, sb); |
1919 | return 0; | 1922 | return 0; |
@@ -3796,12 +3799,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3796 | sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; | 3799 | sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; |
3797 | 3800 | ||
3798 | no_journal: | 3801 | no_journal: |
3799 | if (ext4_mballoc_ready) { | 3802 | sbi->s_mb_cache = ext4_xattr_create_cache(); |
3800 | sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); | 3803 | if (!sbi->s_mb_cache) { |
3801 | if (!sbi->s_mb_cache) { | 3804 | ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); |
3802 | ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache"); | 3805 | goto failed_mount_wq; |
3803 | goto failed_mount_wq; | ||
3804 | } | ||
3805 | } | 3806 | } |
3806 | 3807 | ||
3807 | if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && | 3808 | if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && |
@@ -4027,6 +4028,10 @@ failed_mount4: | |||
4027 | if (EXT4_SB(sb)->rsv_conversion_wq) | 4028 | if (EXT4_SB(sb)->rsv_conversion_wq) |
4028 | destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); | 4029 | destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); |
4029 | failed_mount_wq: | 4030 | failed_mount_wq: |
4031 | if (sbi->s_mb_cache) { | ||
4032 | ext4_xattr_destroy_cache(sbi->s_mb_cache); | ||
4033 | sbi->s_mb_cache = NULL; | ||
4034 | } | ||
4030 | if (sbi->s_journal) { | 4035 | if (sbi->s_journal) { |
4031 | jbd2_journal_destroy(sbi->s_journal); | 4036 | jbd2_journal_destroy(sbi->s_journal); |
4032 | sbi->s_journal = NULL; | 4037 | sbi->s_journal = NULL; |
@@ -5321,7 +5326,6 @@ MODULE_ALIAS_FS("ext4"); | |||
5321 | 5326 | ||
5322 | /* Shared across all ext4 file systems */ | 5327 | /* Shared across all ext4 file systems */ |
5323 | wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; | 5328 | wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; |
5324 | struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; | ||
5325 | 5329 | ||
5326 | static int __init ext4_init_fs(void) | 5330 | static int __init ext4_init_fs(void) |
5327 | { | 5331 | { |
@@ -5334,10 +5338,8 @@ static int __init ext4_init_fs(void) | |||
5334 | /* Build-time check for flags consistency */ | 5338 | /* Build-time check for flags consistency */ |
5335 | ext4_check_flag_values(); | 5339 | ext4_check_flag_values(); |
5336 | 5340 | ||
5337 | for (i = 0; i < EXT4_WQ_HASH_SZ; i++) { | 5341 | for (i = 0; i < EXT4_WQ_HASH_SZ; i++) |
5338 | mutex_init(&ext4__aio_mutex[i]); | ||
5339 | init_waitqueue_head(&ext4__ioend_wq[i]); | 5342 | init_waitqueue_head(&ext4__ioend_wq[i]); |
5340 | } | ||
5341 | 5343 | ||
5342 | err = ext4_init_es(); | 5344 | err = ext4_init_es(); |
5343 | if (err) | 5345 | if (err) |
@@ -5358,8 +5360,6 @@ static int __init ext4_init_fs(void) | |||
5358 | err = ext4_init_mballoc(); | 5360 | err = ext4_init_mballoc(); |
5359 | if (err) | 5361 | if (err) |
5360 | goto out2; | 5362 | goto out2; |
5361 | else | ||
5362 | ext4_mballoc_ready = 1; | ||
5363 | err = init_inodecache(); | 5363 | err = init_inodecache(); |
5364 | if (err) | 5364 | if (err) |
5365 | goto out1; | 5365 | goto out1; |
@@ -5375,7 +5375,6 @@ out: | |||
5375 | unregister_as_ext3(); | 5375 | unregister_as_ext3(); |
5376 | destroy_inodecache(); | 5376 | destroy_inodecache(); |
5377 | out1: | 5377 | out1: |
5378 | ext4_mballoc_ready = 0; | ||
5379 | ext4_exit_mballoc(); | 5378 | ext4_exit_mballoc(); |
5380 | out2: | 5379 | out2: |
5381 | ext4_exit_sysfs(); | 5380 | ext4_exit_sysfs(); |
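The super.c hunks above make three related changes: data_err=abort and data_err=ignore lose their MOPT_SET/MOPT_CLEAR table flags and are applied explicitly in handle_mount_opt(); _ext4_show_options() learns to report data_err=abort whenever the bit is set; and the per-superblock xattr cache is created unconditionally after the no_journal label (the ext4_mballoc_ready gate goes away) and torn down in the failed_mount_wq unwind. The snippet below is only a minimal userspace model of that set/clear/report pattern; the flag value and the tiny struct are stand-ins for EXT4_MOUNT_DATA_ERR_ABORT and struct ext4_sb_info, not the kernel definitions.

#include <stdio.h>

#define MOUNT_DATA_ERR_ABORT 0x1UL	/* illustrative value, not the kernel's */

struct sb_model {
	unsigned long s_mount_opt;	/* models sbi->s_mount_opt */
};

static void handle_data_err_opt(struct sb_model *sbi, int abort)
{
	if (abort)			/* Opt_data_err_abort */
		sbi->s_mount_opt |= MOUNT_DATA_ERR_ABORT;
	else				/* Opt_data_err_ignore */
		sbi->s_mount_opt &= ~MOUNT_DATA_ERR_ABORT;
}

static void show_options(const struct sb_model *sbi)
{
	/* equivalent of test_opt(sb, DATA_ERR_ABORT) in _ext4_show_options() */
	if (sbi->s_mount_opt & MOUNT_DATA_ERR_ABORT)
		printf("data_err=abort\n");
}

int main(void)
{
	struct sb_model sbi = { 0 };

	handle_data_err_opt(&sbi, 1);
	show_options(&sbi);		/* prints data_err=abort */
	handle_data_err_opt(&sbi, 0);
	show_options(&sbi);		/* prints nothing */
	return 0;
}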
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index a95151e875bd..0441e055c8e8 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
@@ -545,30 +545,44 @@ static void | |||
545 | ext4_xattr_release_block(handle_t *handle, struct inode *inode, | 545 | ext4_xattr_release_block(handle_t *handle, struct inode *inode, |
546 | struct buffer_head *bh) | 546 | struct buffer_head *bh) |
547 | { | 547 | { |
548 | struct mb_cache_entry *ce = NULL; | ||
549 | int error = 0; | ||
550 | struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); | 548 | struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); |
549 | u32 hash, ref; | ||
550 | int error = 0; | ||
551 | 551 | ||
552 | ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr); | ||
553 | BUFFER_TRACE(bh, "get_write_access"); | 552 | BUFFER_TRACE(bh, "get_write_access"); |
554 | error = ext4_journal_get_write_access(handle, bh); | 553 | error = ext4_journal_get_write_access(handle, bh); |
555 | if (error) | 554 | if (error) |
556 | goto out; | 555 | goto out; |
557 | 556 | ||
558 | lock_buffer(bh); | 557 | lock_buffer(bh); |
559 | if (BHDR(bh)->h_refcount == cpu_to_le32(1)) { | 558 | hash = le32_to_cpu(BHDR(bh)->h_hash); |
559 | ref = le32_to_cpu(BHDR(bh)->h_refcount); | ||
560 | if (ref == 1) { | ||
560 | ea_bdebug(bh, "refcount now=0; freeing"); | 561 | ea_bdebug(bh, "refcount now=0; freeing"); |
561 | if (ce) | 562 | /* |
562 | mb_cache_entry_free(ce); | 563 | * This must happen under buffer lock for |
564 | * ext4_xattr_block_set() to reliably detect freed block | ||
565 | */ | ||
566 | mb_cache_entry_delete_block(ext4_mb_cache, hash, bh->b_blocknr); | ||
563 | get_bh(bh); | 567 | get_bh(bh); |
564 | unlock_buffer(bh); | 568 | unlock_buffer(bh); |
565 | ext4_free_blocks(handle, inode, bh, 0, 1, | 569 | ext4_free_blocks(handle, inode, bh, 0, 1, |
566 | EXT4_FREE_BLOCKS_METADATA | | 570 | EXT4_FREE_BLOCKS_METADATA | |
567 | EXT4_FREE_BLOCKS_FORGET); | 571 | EXT4_FREE_BLOCKS_FORGET); |
568 | } else { | 572 | } else { |
569 | le32_add_cpu(&BHDR(bh)->h_refcount, -1); | 573 | ref--; |
570 | if (ce) | 574 | BHDR(bh)->h_refcount = cpu_to_le32(ref); |
571 | mb_cache_entry_release(ce); | 575 | if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { |
576 | struct mb_cache_entry *ce; | ||
577 | |||
578 | ce = mb_cache_entry_get(ext4_mb_cache, hash, | ||
579 | bh->b_blocknr); | ||
580 | if (ce) { | ||
581 | ce->e_reusable = 1; | ||
582 | mb_cache_entry_put(ext4_mb_cache, ce); | ||
583 | } | ||
584 | } | ||
585 | |||
572 | /* | 586 | /* |
573 | * Beware of this ugliness: Releasing of xattr block references | 587 | * Beware of this ugliness: Releasing of xattr block references |
574 | * from different inodes can race and so we have to protect | 588 | * from different inodes can race and so we have to protect |
@@ -790,8 +804,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, | |||
790 | if (i->value && i->value_len > sb->s_blocksize) | 804 | if (i->value && i->value_len > sb->s_blocksize) |
791 | return -ENOSPC; | 805 | return -ENOSPC; |
792 | if (s->base) { | 806 | if (s->base) { |
793 | ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev, | ||
794 | bs->bh->b_blocknr); | ||
795 | BUFFER_TRACE(bs->bh, "get_write_access"); | 807 | BUFFER_TRACE(bs->bh, "get_write_access"); |
796 | error = ext4_journal_get_write_access(handle, bs->bh); | 808 | error = ext4_journal_get_write_access(handle, bs->bh); |
797 | if (error) | 809 | if (error) |
@@ -799,10 +811,15 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, | |||
799 | lock_buffer(bs->bh); | 811 | lock_buffer(bs->bh); |
800 | 812 | ||
801 | if (header(s->base)->h_refcount == cpu_to_le32(1)) { | 813 | if (header(s->base)->h_refcount == cpu_to_le32(1)) { |
802 | if (ce) { | 814 | __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); |
803 | mb_cache_entry_free(ce); | 815 | |
804 | ce = NULL; | 816 | /* |
805 | } | 817 | * This must happen under buffer lock for |
818 | * ext4_xattr_block_set() to reliably detect modified | ||
819 | * block | ||
820 | */ | ||
821 | mb_cache_entry_delete_block(ext4_mb_cache, hash, | ||
822 | bs->bh->b_blocknr); | ||
806 | ea_bdebug(bs->bh, "modifying in-place"); | 823 | ea_bdebug(bs->bh, "modifying in-place"); |
807 | error = ext4_xattr_set_entry(i, s); | 824 | error = ext4_xattr_set_entry(i, s); |
808 | if (!error) { | 825 | if (!error) { |
@@ -826,10 +843,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, | |||
826 | int offset = (char *)s->here - bs->bh->b_data; | 843 | int offset = (char *)s->here - bs->bh->b_data; |
827 | 844 | ||
828 | unlock_buffer(bs->bh); | 845 | unlock_buffer(bs->bh); |
829 | if (ce) { | ||
830 | mb_cache_entry_release(ce); | ||
831 | ce = NULL; | ||
832 | } | ||
833 | ea_bdebug(bs->bh, "cloning"); | 846 | ea_bdebug(bs->bh, "cloning"); |
834 | s->base = kmalloc(bs->bh->b_size, GFP_NOFS); | 847 | s->base = kmalloc(bs->bh->b_size, GFP_NOFS); |
835 | error = -ENOMEM; | 848 | error = -ENOMEM; |
@@ -872,6 +885,8 @@ inserted: | |||
872 | if (new_bh == bs->bh) | 885 | if (new_bh == bs->bh) |
873 | ea_bdebug(new_bh, "keeping"); | 886 | ea_bdebug(new_bh, "keeping"); |
874 | else { | 887 | else { |
888 | u32 ref; | ||
889 | |||
875 | /* The old block is released after updating | 890 | /* The old block is released after updating |
876 | the inode. */ | 891 | the inode. */ |
877 | error = dquot_alloc_block(inode, | 892 | error = dquot_alloc_block(inode, |
@@ -884,9 +899,40 @@ inserted: | |||
884 | if (error) | 899 | if (error) |
885 | goto cleanup_dquot; | 900 | goto cleanup_dquot; |
886 | lock_buffer(new_bh); | 901 | lock_buffer(new_bh); |
887 | le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); | 902 | /* |
903 | * We have to be careful about races with | ||
904 | * freeing, rehashing or adding references to | ||
905 | * xattr block. Once we hold buffer lock xattr | ||
906 | * block's state is stable so we can check | ||
907 | * whether the block got freed / rehashed or | ||
908 | * not. Since we unhash mbcache entry under | ||
909 | * buffer lock when freeing / rehashing xattr | ||
910 | * block, checking whether entry is still | ||
911 | * hashed is reliable. Same rules hold for | ||
912 | * e_reusable handling. | ||
913 | */ | ||
914 | if (hlist_bl_unhashed(&ce->e_hash_list) || | ||
915 | !ce->e_reusable) { | ||
916 | /* | ||
917 | * Undo everything and check mbcache | ||
918 | * again. | ||
919 | */ | ||
920 | unlock_buffer(new_bh); | ||
921 | dquot_free_block(inode, | ||
922 | EXT4_C2B(EXT4_SB(sb), | ||
923 | 1)); | ||
924 | brelse(new_bh); | ||
925 | mb_cache_entry_put(ext4_mb_cache, ce); | ||
926 | ce = NULL; | ||
927 | new_bh = NULL; | ||
928 | goto inserted; | ||
929 | } | ||
930 | ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; | ||
931 | BHDR(new_bh)->h_refcount = cpu_to_le32(ref); | ||
932 | if (ref >= EXT4_XATTR_REFCOUNT_MAX) | ||
933 | ce->e_reusable = 0; | ||
888 | ea_bdebug(new_bh, "reusing; refcount now=%d", | 934 | ea_bdebug(new_bh, "reusing; refcount now=%d", |
889 | le32_to_cpu(BHDR(new_bh)->h_refcount)); | 935 | ref); |
890 | unlock_buffer(new_bh); | 936 | unlock_buffer(new_bh); |
891 | error = ext4_handle_dirty_xattr_block(handle, | 937 | error = ext4_handle_dirty_xattr_block(handle, |
892 | inode, | 938 | inode, |
@@ -894,7 +940,8 @@ inserted: | |||
894 | if (error) | 940 | if (error) |
895 | goto cleanup_dquot; | 941 | goto cleanup_dquot; |
896 | } | 942 | } |
897 | mb_cache_entry_release(ce); | 943 | mb_cache_entry_touch(ext4_mb_cache, ce); |
944 | mb_cache_entry_put(ext4_mb_cache, ce); | ||
898 | ce = NULL; | 945 | ce = NULL; |
899 | } else if (bs->bh && s->base == bs->bh->b_data) { | 946 | } else if (bs->bh && s->base == bs->bh->b_data) { |
900 | /* We were modifying this block in-place. */ | 947 | /* We were modifying this block in-place. */ |
@@ -959,7 +1006,7 @@ getblk_failed: | |||
959 | 1006 | ||
960 | cleanup: | 1007 | cleanup: |
961 | if (ce) | 1008 | if (ce) |
962 | mb_cache_entry_release(ce); | 1009 | mb_cache_entry_put(ext4_mb_cache, ce); |
963 | brelse(new_bh); | 1010 | brelse(new_bh); |
964 | if (!(bs->bh && s->base == bs->bh->b_data)) | 1011 | if (!(bs->bh && s->base == bs->bh->b_data)) |
965 | kfree(s->base); | 1012 | kfree(s->base); |
@@ -1070,6 +1117,17 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, | |||
1070 | return 0; | 1117 | return 0; |
1071 | } | 1118 | } |
1072 | 1119 | ||
1120 | static int ext4_xattr_value_same(struct ext4_xattr_search *s, | ||
1121 | struct ext4_xattr_info *i) | ||
1122 | { | ||
1123 | void *value; | ||
1124 | |||
1125 | if (le32_to_cpu(s->here->e_value_size) != i->value_len) | ||
1126 | return 0; | ||
1127 | value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); | ||
1128 | return !memcmp(value, i->value, i->value_len); | ||
1129 | } | ||
1130 | |||
1073 | /* | 1131 | /* |
1074 | * ext4_xattr_set_handle() | 1132 | * ext4_xattr_set_handle() |
1075 | * | 1133 | * |
@@ -1146,6 +1204,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, | |||
1146 | else if (!bs.s.not_found) | 1204 | else if (!bs.s.not_found) |
1147 | error = ext4_xattr_block_set(handle, inode, &i, &bs); | 1205 | error = ext4_xattr_block_set(handle, inode, &i, &bs); |
1148 | } else { | 1206 | } else { |
1207 | error = 0; | ||
1208 | /* Xattr value did not change? Save us some work and bail out */ | ||
1209 | if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i)) | ||
1210 | goto cleanup; | ||
1211 | if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) | ||
1212 | goto cleanup; | ||
1213 | |||
1149 | error = ext4_xattr_ibody_set(handle, inode, &i, &is); | 1214 | error = ext4_xattr_ibody_set(handle, inode, &i, &is); |
1150 | if (!error && !bs.s.not_found) { | 1215 | if (!error && !bs.s.not_found) { |
1151 | i.value = NULL; | 1216 | i.value = NULL; |
@@ -1512,17 +1577,6 @@ cleanup: | |||
1512 | } | 1577 | } |
1513 | 1578 | ||
1514 | /* | 1579 | /* |
1515 | * ext4_xattr_put_super() | ||
1516 | * | ||
1517 | * This is called when a file system is unmounted. | ||
1518 | */ | ||
1519 | void | ||
1520 | ext4_xattr_put_super(struct super_block *sb) | ||
1521 | { | ||
1522 | mb_cache_shrink(sb->s_bdev); | ||
1523 | } | ||
1524 | |||
1525 | /* | ||
1526 | * ext4_xattr_cache_insert() | 1580 | * ext4_xattr_cache_insert() |
1527 | * | 1581 | * |
1528 | * Create a new entry in the extended attribute cache, and insert | 1582 | * Create a new entry in the extended attribute cache, and insert |
@@ -1533,26 +1587,19 @@ ext4_xattr_put_super(struct super_block *sb) | |||
1533 | static void | 1587 | static void |
1534 | ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) | 1588 | ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh) |
1535 | { | 1589 | { |
1536 | __u32 hash = le32_to_cpu(BHDR(bh)->h_hash); | 1590 | struct ext4_xattr_header *header = BHDR(bh); |
1537 | struct mb_cache_entry *ce; | 1591 | __u32 hash = le32_to_cpu(header->h_hash); |
1592 | int reusable = le32_to_cpu(header->h_refcount) < | ||
1593 | EXT4_XATTR_REFCOUNT_MAX; | ||
1538 | int error; | 1594 | int error; |
1539 | 1595 | ||
1540 | ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS); | 1596 | error = mb_cache_entry_create(ext4_mb_cache, GFP_NOFS, hash, |
1541 | if (!ce) { | 1597 | bh->b_blocknr, reusable); |
1542 | ea_bdebug(bh, "out of memory"); | ||
1543 | return; | ||
1544 | } | ||
1545 | error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash); | ||
1546 | if (error) { | 1598 | if (error) { |
1547 | mb_cache_entry_free(ce); | 1599 | if (error == -EBUSY) |
1548 | if (error == -EBUSY) { | ||
1549 | ea_bdebug(bh, "already in cache"); | 1600 | ea_bdebug(bh, "already in cache"); |
1550 | error = 0; | 1601 | } else |
1551 | } | ||
1552 | } else { | ||
1553 | ea_bdebug(bh, "inserting [%x]", (int)hash); | 1602 | ea_bdebug(bh, "inserting [%x]", (int)hash); |
1554 | mb_cache_entry_release(ce); | ||
1555 | } | ||
1556 | } | 1603 | } |
1557 | 1604 | ||
1558 | /* | 1605 | /* |
@@ -1614,33 +1661,20 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header, | |||
1614 | if (!header->h_hash) | 1661 | if (!header->h_hash) |
1615 | return NULL; /* never share */ | 1662 | return NULL; /* never share */ |
1616 | ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); | 1663 | ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); |
1617 | again: | 1664 | ce = mb_cache_entry_find_first(ext4_mb_cache, hash); |
1618 | ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev, | ||
1619 | hash); | ||
1620 | while (ce) { | 1665 | while (ce) { |
1621 | struct buffer_head *bh; | 1666 | struct buffer_head *bh; |
1622 | 1667 | ||
1623 | if (IS_ERR(ce)) { | ||
1624 | if (PTR_ERR(ce) == -EAGAIN) | ||
1625 | goto again; | ||
1626 | break; | ||
1627 | } | ||
1628 | bh = sb_bread(inode->i_sb, ce->e_block); | 1668 | bh = sb_bread(inode->i_sb, ce->e_block); |
1629 | if (!bh) { | 1669 | if (!bh) { |
1630 | EXT4_ERROR_INODE(inode, "block %lu read error", | 1670 | EXT4_ERROR_INODE(inode, "block %lu read error", |
1631 | (unsigned long) ce->e_block); | 1671 | (unsigned long) ce->e_block); |
1632 | } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= | ||
1633 | EXT4_XATTR_REFCOUNT_MAX) { | ||
1634 | ea_idebug(inode, "block %lu refcount %d>=%d", | ||
1635 | (unsigned long) ce->e_block, | ||
1636 | le32_to_cpu(BHDR(bh)->h_refcount), | ||
1637 | EXT4_XATTR_REFCOUNT_MAX); | ||
1638 | } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { | 1672 | } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { |
1639 | *pce = ce; | 1673 | *pce = ce; |
1640 | return bh; | 1674 | return bh; |
1641 | } | 1675 | } |
1642 | brelse(bh); | 1676 | brelse(bh); |
1643 | ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash); | 1677 | ce = mb_cache_entry_find_next(ext4_mb_cache, ce); |
1644 | } | 1678 | } |
1645 | return NULL; | 1679 | return NULL; |
1646 | } | 1680 | } |
@@ -1716,9 +1750,9 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header, | |||
1716 | #define HASH_BUCKET_BITS 10 | 1750 | #define HASH_BUCKET_BITS 10 |
1717 | 1751 | ||
1718 | struct mb_cache * | 1752 | struct mb_cache * |
1719 | ext4_xattr_create_cache(char *name) | 1753 | ext4_xattr_create_cache(void) |
1720 | { | 1754 | { |
1721 | return mb_cache_create(name, HASH_BUCKET_BITS); | 1755 | return mb_cache_create(HASH_BUCKET_BITS); |
1722 | } | 1756 | } |
1723 | 1757 | ||
1724 | void ext4_xattr_destroy_cache(struct mb_cache *cache) | 1758 | void ext4_xattr_destroy_cache(struct mb_cache *cache) |
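Taken together, the xattr.c hunks above move ext4 onto the reworked mbcache API: entries are created with mb_cache_entry_create() (key = content hash, value = block number, plus a reusable flag derived from the refcount), searched with mb_cache_entry_find_first()/mb_cache_entry_find_next(), pinned and released with mb_cache_entry_get()/mb_cache_entry_put(), bumped in the LRU with mb_cache_entry_touch(), and unhashed with mb_cache_entry_delete_block() while the xattr buffer is locked, so a racing ext4_xattr_block_set() can revalidate a found entry via hlist_bl_unhashed() and e_reusable after taking the same buffer lock. The sketch below condenses that consumer pattern under those assumptions; xattr_block_matches() is a hypothetical stand-in for the ext4_xattr_cmp() content check, and error handling is trimmed.

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/mbcache.h>

/* Hypothetical content check standing in for ext4_xattr_cmp() == 0. */
static bool xattr_block_matches(struct inode *inode, struct buffer_head *bh);

/* Cache one xattr block: key = hash of its contents, value = block number. */
static int xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh,
			      u32 hash, int reusable)
{
	int err = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr,
					reusable);

	return err == -EBUSY ? 0 : err;	/* -EBUSY: block already cached */
}

/* Find a block with identical contents; the entry comes back referenced. */
static struct buffer_head *xattr_cache_find(struct inode *inode,
					    struct mb_cache *cache, u32 hash,
					    struct mb_cache_entry **pce)
{
	struct mb_cache_entry *ce = mb_cache_entry_find_first(cache, hash);

	while (ce) {
		struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);

		if (bh && xattr_block_matches(inode, bh)) {
			*pce = ce;	/* caller drops it with mb_cache_entry_put() */
			return bh;
		}
		brelse(bh);		/* brelse(NULL) is a no-op */
		ce = mb_cache_entry_find_next(cache, ce);
	}
	return NULL;
}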
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index ddc0957760ba..69dd3e6566e0 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h | |||
@@ -108,7 +108,6 @@ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_ | |||
108 | extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); | 108 | extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); |
109 | 109 | ||
110 | extern void ext4_xattr_delete_inode(handle_t *, struct inode *); | 110 | extern void ext4_xattr_delete_inode(handle_t *, struct inode *); |
111 | extern void ext4_xattr_put_super(struct super_block *); | ||
112 | 111 | ||
113 | extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, | 112 | extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, |
114 | struct ext4_inode *raw_inode, handle_t *handle); | 113 | struct ext4_inode *raw_inode, handle_t *handle); |
@@ -124,7 +123,7 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, | |||
124 | struct ext4_xattr_info *i, | 123 | struct ext4_xattr_info *i, |
125 | struct ext4_xattr_ibody_find *is); | 124 | struct ext4_xattr_ibody_find *is); |
126 | 125 | ||
127 | extern struct mb_cache *ext4_xattr_create_cache(char *name); | 126 | extern struct mb_cache *ext4_xattr_create_cache(void); |
128 | extern void ext4_xattr_destroy_cache(struct mb_cache *); | 127 | extern void ext4_xattr_destroy_cache(struct mb_cache *); |
129 | 128 | ||
130 | #ifdef CONFIG_EXT4_FS_SECURITY | 129 | #ifdef CONFIG_EXT4_FS_SECURITY |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 36345fefa3ff..517f2de784cf 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -131,14 +131,12 @@ static int journal_submit_commit_record(journal_t *journal, | |||
131 | if (is_journal_aborted(journal)) | 131 | if (is_journal_aborted(journal)) |
132 | return 0; | 132 | return 0; |
133 | 133 | ||
134 | bh = jbd2_journal_get_descriptor_buffer(journal); | 134 | bh = jbd2_journal_get_descriptor_buffer(commit_transaction, |
135 | JBD2_COMMIT_BLOCK); | ||
135 | if (!bh) | 136 | if (!bh) |
136 | return 1; | 137 | return 1; |
137 | 138 | ||
138 | tmp = (struct commit_header *)bh->b_data; | 139 | tmp = (struct commit_header *)bh->b_data; |
139 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | ||
140 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); | ||
141 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); | ||
142 | tmp->h_commit_sec = cpu_to_be64(now.tv_sec); | 140 | tmp->h_commit_sec = cpu_to_be64(now.tv_sec); |
143 | tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); | 141 | tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); |
144 | 142 | ||
@@ -222,7 +220,7 @@ static int journal_submit_data_buffers(journal_t *journal, | |||
222 | spin_lock(&journal->j_list_lock); | 220 | spin_lock(&journal->j_list_lock); |
223 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { | 221 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { |
224 | mapping = jinode->i_vfs_inode->i_mapping; | 222 | mapping = jinode->i_vfs_inode->i_mapping; |
225 | set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); | 223 | jinode->i_flags |= JI_COMMIT_RUNNING; |
226 | spin_unlock(&journal->j_list_lock); | 224 | spin_unlock(&journal->j_list_lock); |
227 | /* | 225 | /* |
228 | * submit the inode data buffers. We use writepage | 226 | * submit the inode data buffers. We use writepage |
@@ -236,8 +234,8 @@ static int journal_submit_data_buffers(journal_t *journal, | |||
236 | ret = err; | 234 | ret = err; |
237 | spin_lock(&journal->j_list_lock); | 235 | spin_lock(&journal->j_list_lock); |
238 | J_ASSERT(jinode->i_transaction == commit_transaction); | 236 | J_ASSERT(jinode->i_transaction == commit_transaction); |
239 | clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); | 237 | jinode->i_flags &= ~JI_COMMIT_RUNNING; |
240 | smp_mb__after_atomic(); | 238 | smp_mb(); |
241 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); | 239 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); |
242 | } | 240 | } |
243 | spin_unlock(&journal->j_list_lock); | 241 | spin_unlock(&journal->j_list_lock); |
@@ -258,7 +256,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, | |||
258 | /* For locking, see the comment in journal_submit_data_buffers() */ | 256 | /* For locking, see the comment in journal_submit_data_buffers() */ |
259 | spin_lock(&journal->j_list_lock); | 257 | spin_lock(&journal->j_list_lock); |
260 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { | 258 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { |
261 | set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); | 259 | jinode->i_flags |= JI_COMMIT_RUNNING; |
262 | spin_unlock(&journal->j_list_lock); | 260 | spin_unlock(&journal->j_list_lock); |
263 | err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); | 261 | err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); |
264 | if (err) { | 262 | if (err) { |
@@ -274,8 +272,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal, | |||
274 | ret = err; | 272 | ret = err; |
275 | } | 273 | } |
276 | spin_lock(&journal->j_list_lock); | 274 | spin_lock(&journal->j_list_lock); |
277 | clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags); | 275 | jinode->i_flags &= ~JI_COMMIT_RUNNING; |
278 | smp_mb__after_atomic(); | 276 | smp_mb(); |
279 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); | 277 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); |
280 | } | 278 | } |
281 | 279 | ||
@@ -319,22 +317,6 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag, | |||
319 | tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); | 317 | tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); |
320 | } | 318 | } |
321 | 319 | ||
322 | static void jbd2_descr_block_csum_set(journal_t *j, | ||
323 | struct buffer_head *bh) | ||
324 | { | ||
325 | struct jbd2_journal_block_tail *tail; | ||
326 | __u32 csum; | ||
327 | |||
328 | if (!jbd2_journal_has_csum_v2or3(j)) | ||
329 | return; | ||
330 | |||
331 | tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - | ||
332 | sizeof(struct jbd2_journal_block_tail)); | ||
333 | tail->t_checksum = 0; | ||
334 | csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); | ||
335 | tail->t_checksum = cpu_to_be32(csum); | ||
336 | } | ||
337 | |||
338 | static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, | 320 | static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, |
339 | struct buffer_head *bh, __u32 sequence) | 321 | struct buffer_head *bh, __u32 sequence) |
340 | { | 322 | { |
@@ -379,7 +361,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
379 | ktime_t start_time; | 361 | ktime_t start_time; |
380 | u64 commit_time; | 362 | u64 commit_time; |
381 | char *tagp = NULL; | 363 | char *tagp = NULL; |
382 | journal_header_t *header; | ||
383 | journal_block_tag_t *tag = NULL; | 364 | journal_block_tag_t *tag = NULL; |
384 | int space_left = 0; | 365 | int space_left = 0; |
385 | int first_tag = 0; | 366 | int first_tag = 0; |
@@ -554,8 +535,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
554 | jbd2_journal_abort(journal, err); | 535 | jbd2_journal_abort(journal, err); |
555 | 536 | ||
556 | blk_start_plug(&plug); | 537 | blk_start_plug(&plug); |
557 | jbd2_journal_write_revoke_records(journal, commit_transaction, | 538 | jbd2_journal_write_revoke_records(commit_transaction, &log_bufs); |
558 | &log_bufs, WRITE_SYNC); | ||
559 | 539 | ||
560 | jbd_debug(3, "JBD2: commit phase 2b\n"); | 540 | jbd_debug(3, "JBD2: commit phase 2b\n"); |
561 | 541 | ||
@@ -616,7 +596,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
616 | 596 | ||
617 | jbd_debug(4, "JBD2: get descriptor\n"); | 597 | jbd_debug(4, "JBD2: get descriptor\n"); |
618 | 598 | ||
619 | descriptor = jbd2_journal_get_descriptor_buffer(journal); | 599 | descriptor = jbd2_journal_get_descriptor_buffer( |
600 | commit_transaction, | ||
601 | JBD2_DESCRIPTOR_BLOCK); | ||
620 | if (!descriptor) { | 602 | if (!descriptor) { |
621 | jbd2_journal_abort(journal, -EIO); | 603 | jbd2_journal_abort(journal, -EIO); |
622 | continue; | 604 | continue; |
@@ -625,11 +607,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
625 | jbd_debug(4, "JBD2: got buffer %llu (%p)\n", | 607 | jbd_debug(4, "JBD2: got buffer %llu (%p)\n", |
626 | (unsigned long long)descriptor->b_blocknr, | 608 | (unsigned long long)descriptor->b_blocknr, |
627 | descriptor->b_data); | 609 | descriptor->b_data); |
628 | header = (journal_header_t *)descriptor->b_data; | ||
629 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | ||
630 | header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK); | ||
631 | header->h_sequence = cpu_to_be32(commit_transaction->t_tid); | ||
632 | |||
633 | tagp = &descriptor->b_data[sizeof(journal_header_t)]; | 610 | tagp = &descriptor->b_data[sizeof(journal_header_t)]; |
634 | space_left = descriptor->b_size - | 611 | space_left = descriptor->b_size - |
635 | sizeof(journal_header_t); | 612 | sizeof(journal_header_t); |
@@ -721,7 +698,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
721 | 698 | ||
722 | tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); | 699 | tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); |
723 | 700 | ||
724 | jbd2_descr_block_csum_set(journal, descriptor); | 701 | jbd2_descriptor_block_csum_set(journal, descriptor); |
725 | start_journal_io: | 702 | start_journal_io: |
726 | for (i = 0; i < bufs; i++) { | 703 | for (i = 0; i < bufs; i++) { |
727 | struct buffer_head *bh = wbuf[i]; | 704 | struct buffer_head *bh = wbuf[i]; |
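The __JI_COMMIT_RUNNING handling above switches from atomic set_bit()/clear_bit() plus smp_mb__after_atomic() to plain |=/&= updates plus smp_mb(): as the hunks show, i_flags is only ever changed with j_list_lock held, so the atomics add nothing, but a full barrier is still required so the flag store is ordered before wake_up_bit() peeks at the waitqueue (its counterpart is the waiter in jbd2_journal_release_jbd_inode(), which tests the flag under the same lock before sleeping on the bit). A condensed waker-side sketch of that pattern, assuming only what the hunks show:

#include <linux/jbd2.h>

static void clear_commit_running(journal_t *journal, struct jbd2_inode *jinode)
{
	spin_lock(&journal->j_list_lock);
	jinode->i_flags &= ~JI_COMMIT_RUNNING;	/* plain store, j_list_lock held */
	/*
	 * Full barrier: order the flag clear before wake_up_bit() reads the
	 * waitqueue, so a waiter that saw the flag set cannot miss the wakeup.
	 */
	smp_mb();
	wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	spin_unlock(&journal->j_list_lock);
}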
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 81e622681c82..de73a9516a54 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -805,10 +805,13 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, | |||
805 | * But we don't bother doing that, so there will be coherency problems with | 805 | * But we don't bother doing that, so there will be coherency problems with |
806 | * mmaps of blockdevs which hold live JBD-controlled filesystems. | 806 | * mmaps of blockdevs which hold live JBD-controlled filesystems. |
807 | */ | 807 | */ |
808 | struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) | 808 | struct buffer_head * |
809 | jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type) | ||
809 | { | 810 | { |
811 | journal_t *journal = transaction->t_journal; | ||
810 | struct buffer_head *bh; | 812 | struct buffer_head *bh; |
811 | unsigned long long blocknr; | 813 | unsigned long long blocknr; |
814 | journal_header_t *header; | ||
812 | int err; | 815 | int err; |
813 | 816 | ||
814 | err = jbd2_journal_next_log_block(journal, &blocknr); | 817 | err = jbd2_journal_next_log_block(journal, &blocknr); |
@@ -821,12 +824,31 @@ struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) | |||
821 | return NULL; | 824 | return NULL; |
822 | lock_buffer(bh); | 825 | lock_buffer(bh); |
823 | memset(bh->b_data, 0, journal->j_blocksize); | 826 | memset(bh->b_data, 0, journal->j_blocksize); |
827 | header = (journal_header_t *)bh->b_data; | ||
828 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | ||
829 | header->h_blocktype = cpu_to_be32(type); | ||
830 | header->h_sequence = cpu_to_be32(transaction->t_tid); | ||
824 | set_buffer_uptodate(bh); | 831 | set_buffer_uptodate(bh); |
825 | unlock_buffer(bh); | 832 | unlock_buffer(bh); |
826 | BUFFER_TRACE(bh, "return this buffer"); | 833 | BUFFER_TRACE(bh, "return this buffer"); |
827 | return bh; | 834 | return bh; |
828 | } | 835 | } |
829 | 836 | ||
837 | void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh) | ||
838 | { | ||
839 | struct jbd2_journal_block_tail *tail; | ||
840 | __u32 csum; | ||
841 | |||
842 | if (!jbd2_journal_has_csum_v2or3(j)) | ||
843 | return; | ||
844 | |||
845 | tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - | ||
846 | sizeof(struct jbd2_journal_block_tail)); | ||
847 | tail->t_checksum = 0; | ||
848 | csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); | ||
849 | tail->t_checksum = cpu_to_be32(csum); | ||
850 | } | ||
851 | |||
830 | /* | 852 | /* |
831 | * Return tid of the oldest transaction in the journal and block in the journal | 853 | * Return tid of the oldest transaction in the journal and block in the journal |
832 | * where the transaction starts. | 854 | * where the transaction starts. |
@@ -1408,11 +1430,12 @@ out: | |||
1408 | /** | 1430 | /** |
1409 | * jbd2_mark_journal_empty() - Mark on disk journal as empty. | 1431 | * jbd2_mark_journal_empty() - Mark on disk journal as empty. |
1410 | * @journal: The journal to update. | 1432 | * @journal: The journal to update. |
1433 | * @write_op: With which operation should we write the journal sb | ||
1411 | * | 1434 | * |
1412 | * Update a journal's dynamic superblock fields to show that journal is empty. | 1435 | * Update a journal's dynamic superblock fields to show that journal is empty. |
1413 | * Write updated superblock to disk waiting for IO to complete. | 1436 | * Write updated superblock to disk waiting for IO to complete. |
1414 | */ | 1437 | */ |
1415 | static void jbd2_mark_journal_empty(journal_t *journal) | 1438 | static void jbd2_mark_journal_empty(journal_t *journal, int write_op) |
1416 | { | 1439 | { |
1417 | journal_superblock_t *sb = journal->j_superblock; | 1440 | journal_superblock_t *sb = journal->j_superblock; |
1418 | 1441 | ||
@@ -1430,7 +1453,7 @@ static void jbd2_mark_journal_empty(journal_t *journal) | |||
1430 | sb->s_start = cpu_to_be32(0); | 1453 | sb->s_start = cpu_to_be32(0); |
1431 | read_unlock(&journal->j_state_lock); | 1454 | read_unlock(&journal->j_state_lock); |
1432 | 1455 | ||
1433 | jbd2_write_superblock(journal, WRITE_FUA); | 1456 | jbd2_write_superblock(journal, write_op); |
1434 | 1457 | ||
1435 | /* Log is no longer empty */ | 1458 | /* Log is no longer empty */ |
1436 | write_lock(&journal->j_state_lock); | 1459 | write_lock(&journal->j_state_lock); |
@@ -1716,7 +1739,13 @@ int jbd2_journal_destroy(journal_t *journal) | |||
1716 | if (journal->j_sb_buffer) { | 1739 | if (journal->j_sb_buffer) { |
1717 | if (!is_journal_aborted(journal)) { | 1740 | if (!is_journal_aborted(journal)) { |
1718 | mutex_lock(&journal->j_checkpoint_mutex); | 1741 | mutex_lock(&journal->j_checkpoint_mutex); |
1719 | jbd2_mark_journal_empty(journal); | 1742 | |
1743 | write_lock(&journal->j_state_lock); | ||
1744 | journal->j_tail_sequence = | ||
1745 | ++journal->j_transaction_sequence; | ||
1746 | write_unlock(&journal->j_state_lock); | ||
1747 | |||
1748 | jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA); | ||
1720 | mutex_unlock(&journal->j_checkpoint_mutex); | 1749 | mutex_unlock(&journal->j_checkpoint_mutex); |
1721 | } else | 1750 | } else |
1722 | err = -EIO; | 1751 | err = -EIO; |
@@ -1975,7 +2004,7 @@ int jbd2_journal_flush(journal_t *journal) | |||
1975 | * the magic code for a fully-recovered superblock. Any future | 2004 | * the magic code for a fully-recovered superblock. Any future |
1976 | * commits of data to the journal will restore the current | 2005 | * commits of data to the journal will restore the current |
1977 | * s_start value. */ | 2006 | * s_start value. */ |
1978 | jbd2_mark_journal_empty(journal); | 2007 | jbd2_mark_journal_empty(journal, WRITE_FUA); |
1979 | mutex_unlock(&journal->j_checkpoint_mutex); | 2008 | mutex_unlock(&journal->j_checkpoint_mutex); |
1980 | write_lock(&journal->j_state_lock); | 2009 | write_lock(&journal->j_state_lock); |
1981 | J_ASSERT(!journal->j_running_transaction); | 2010 | J_ASSERT(!journal->j_running_transaction); |
@@ -2021,7 +2050,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) | |||
2021 | if (write) { | 2050 | if (write) { |
2022 | /* Lock to make assertions happy... */ | 2051 | /* Lock to make assertions happy... */ |
2023 | mutex_lock(&journal->j_checkpoint_mutex); | 2052 | mutex_lock(&journal->j_checkpoint_mutex); |
2024 | jbd2_mark_journal_empty(journal); | 2053 | jbd2_mark_journal_empty(journal, WRITE_FUA); |
2025 | mutex_unlock(&journal->j_checkpoint_mutex); | 2054 | mutex_unlock(&journal->j_checkpoint_mutex); |
2026 | } | 2055 | } |
2027 | 2056 | ||
@@ -2565,7 +2594,7 @@ void jbd2_journal_release_jbd_inode(journal_t *journal, | |||
2565 | restart: | 2594 | restart: |
2566 | spin_lock(&journal->j_list_lock); | 2595 | spin_lock(&journal->j_list_lock); |
2567 | /* Is commit writing out inode - we have to wait */ | 2596 | /* Is commit writing out inode - we have to wait */ |
2568 | if (test_bit(__JI_COMMIT_RUNNING, &jinode->i_flags)) { | 2597 | if (jinode->i_flags & JI_COMMIT_RUNNING) { |
2569 | wait_queue_head_t *wq; | 2598 | wait_queue_head_t *wq; |
2570 | DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); | 2599 | DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); |
2571 | wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); | 2600 | wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); |
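The journal.c hunks above centralize descriptor-block construction: jbd2_journal_get_descriptor_buffer() now takes the transaction plus a block type and stamps h_magic, h_blocktype and h_sequence itself, and jbd2_descriptor_block_csum_set() becomes the single helper that writes the tail checksum for descriptor and revoke blocks. A hedged caller sketch under those assumptions, modelled loosely on the revoke path further down (abort checks and buffer tracing are omitted):

#include <linux/jbd2.h>
#include <linux/buffer_head.h>

/* Illustrative only: emit one revoke-style descriptor whose payload is a
 * single big-endian record count. */
static void write_simple_descriptor(transaction_t *transaction, int count)
{
	journal_t *journal = transaction->t_journal;
	struct buffer_head *descriptor;
	jbd2_journal_revoke_header_t *header;

	/* Common journal header is already filled in on return. */
	descriptor = jbd2_journal_get_descriptor_buffer(transaction,
							JBD2_REVOKE_BLOCK);
	if (!descriptor)
		return;

	/* Only the block-type specific payload is left to the caller. */
	header = (jbd2_journal_revoke_header_t *)descriptor->b_data;
	header->r_count = cpu_to_be32(count);

	/* Shared checksum helper replaces the per-type *_csum_set() copies. */
	jbd2_descriptor_block_csum_set(journal, descriptor);
	set_buffer_dirty(descriptor);
	write_dirty_buffer(descriptor, WRITE_SYNC);
}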
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 7f277e49fe88..08a456b96e4e 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c | |||
@@ -174,8 +174,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, | |||
174 | return 0; | 174 | return 0; |
175 | } | 175 | } |
176 | 176 | ||
177 | static int jbd2_descr_block_csum_verify(journal_t *j, | 177 | static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) |
178 | void *buf) | ||
179 | { | 178 | { |
180 | struct jbd2_journal_block_tail *tail; | 179 | struct jbd2_journal_block_tail *tail; |
181 | __be32 provided; | 180 | __be32 provided; |
@@ -522,8 +521,8 @@ static int do_one_pass(journal_t *journal, | |||
522 | descr_csum_size = | 521 | descr_csum_size = |
523 | sizeof(struct jbd2_journal_block_tail); | 522 | sizeof(struct jbd2_journal_block_tail); |
524 | if (descr_csum_size > 0 && | 523 | if (descr_csum_size > 0 && |
525 | !jbd2_descr_block_csum_verify(journal, | 524 | !jbd2_descriptor_block_csum_verify(journal, |
526 | bh->b_data)) { | 525 | bh->b_data)) { |
527 | printk(KERN_ERR "JBD2: Invalid checksum " | 526 | printk(KERN_ERR "JBD2: Invalid checksum " |
528 | "recovering block %lu in log\n", | 527 | "recovering block %lu in log\n", |
529 | next_log_block); | 528 | next_log_block); |
@@ -811,26 +810,6 @@ static int do_one_pass(journal_t *journal, | |||
811 | return err; | 810 | return err; |
812 | } | 811 | } |
813 | 812 | ||
814 | static int jbd2_revoke_block_csum_verify(journal_t *j, | ||
815 | void *buf) | ||
816 | { | ||
817 | struct jbd2_journal_revoke_tail *tail; | ||
818 | __be32 provided; | ||
819 | __u32 calculated; | ||
820 | |||
821 | if (!jbd2_journal_has_csum_v2or3(j)) | ||
822 | return 1; | ||
823 | |||
824 | tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - | ||
825 | sizeof(struct jbd2_journal_revoke_tail)); | ||
826 | provided = tail->r_checksum; | ||
827 | tail->r_checksum = 0; | ||
828 | calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); | ||
829 | tail->r_checksum = provided; | ||
830 | |||
831 | return provided == cpu_to_be32(calculated); | ||
832 | } | ||
833 | |||
834 | /* Scan a revoke record, marking all blocks mentioned as revoked. */ | 813 | /* Scan a revoke record, marking all blocks mentioned as revoked. */ |
835 | 814 | ||
836 | static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, | 815 | static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, |
@@ -846,11 +825,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, | |||
846 | offset = sizeof(jbd2_journal_revoke_header_t); | 825 | offset = sizeof(jbd2_journal_revoke_header_t); |
847 | rcount = be32_to_cpu(header->r_count); | 826 | rcount = be32_to_cpu(header->r_count); |
848 | 827 | ||
849 | if (!jbd2_revoke_block_csum_verify(journal, header)) | 828 | if (!jbd2_descriptor_block_csum_verify(journal, header)) |
850 | return -EFSBADCRC; | 829 | return -EFSBADCRC; |
851 | 830 | ||
852 | if (jbd2_journal_has_csum_v2or3(journal)) | 831 | if (jbd2_journal_has_csum_v2or3(journal)) |
853 | csum_size = sizeof(struct jbd2_journal_revoke_tail); | 832 | csum_size = sizeof(struct jbd2_journal_block_tail); |
854 | if (rcount > journal->j_blocksize - csum_size) | 833 | if (rcount > journal->j_blocksize - csum_size) |
855 | return -EINVAL; | 834 | return -EINVAL; |
856 | max = rcount; | 835 | max = rcount; |
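On the recovery side above, the private jbd2_revoke_block_csum_verify() disappears and revoke blocks are checked with the same jbd2_descriptor_block_csum_verify() used for descriptor blocks, with struct jbd2_journal_block_tail as the single tail layout. The verify idiom itself (save the stored checksum, zero it, checksum the whole block, restore, compare) is easy to model in userspace; the toy checksum below is only a stand-in for the crc32c-based jbd2_chksum(), and the real format stores the value big-endian.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define BLOCKSIZE 4096

struct block_tail {			/* models struct jbd2_journal_block_tail */
	uint32_t t_checksum;
};

static uint32_t toy_chksum(const void *buf, size_t len)	/* not crc32c */
{
	const uint8_t *p = buf;
	uint32_t sum = 0;

	while (len--)
		sum = sum * 31 + *p++;
	return sum;
}

static int descriptor_block_csum_verify(void *buf)
{
	struct block_tail *tail = (struct block_tail *)
		((char *)buf + BLOCKSIZE - sizeof(struct block_tail));
	uint32_t provided = tail->t_checksum;
	uint32_t calculated;

	tail->t_checksum = 0;		/* the checksum covers a zeroed tail */
	calculated = toy_chksum(buf, BLOCKSIZE);
	tail->t_checksum = provided;	/* restore for any later reader */

	return provided == calculated;
}

int main(void)
{
	static char block[BLOCKSIZE];
	struct block_tail *tail = (struct block_tail *)
		(block + BLOCKSIZE - sizeof(struct block_tail));

	memset(block, 0xab, sizeof(block));
	tail->t_checksum = 0;
	tail->t_checksum = toy_chksum(block, BLOCKSIZE);	/* "csum_set" */
	printf("clean:     %d\n", descriptor_block_csum_verify(block));	/* 1 */
	block[0] ^= 1;							/* corrupt */
	printf("corrupted: %d\n", descriptor_block_csum_verify(block));	/* 0 */
	return 0;
}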
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 705ae577882b..91171dc352cb 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c | |||
@@ -122,11 +122,11 @@ struct jbd2_revoke_table_s | |||
122 | 122 | ||
123 | 123 | ||
124 | #ifdef __KERNEL__ | 124 | #ifdef __KERNEL__ |
125 | static void write_one_revoke_record(journal_t *, transaction_t *, | 125 | static void write_one_revoke_record(transaction_t *, |
126 | struct list_head *, | 126 | struct list_head *, |
127 | struct buffer_head **, int *, | 127 | struct buffer_head **, int *, |
128 | struct jbd2_revoke_record_s *, int); | 128 | struct jbd2_revoke_record_s *); |
129 | static void flush_descriptor(journal_t *, struct buffer_head *, int, int); | 129 | static void flush_descriptor(journal_t *, struct buffer_head *, int); |
130 | #endif | 130 | #endif |
131 | 131 | ||
132 | /* Utility functions to maintain the revoke table */ | 132 | /* Utility functions to maintain the revoke table */ |
@@ -519,11 +519,10 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) | |||
519 | * Write revoke records to the journal for all entries in the current | 519 | * Write revoke records to the journal for all entries in the current |
520 | * revoke hash, deleting the entries as we go. | 520 | * revoke hash, deleting the entries as we go. |
521 | */ | 521 | */ |
522 | void jbd2_journal_write_revoke_records(journal_t *journal, | 522 | void jbd2_journal_write_revoke_records(transaction_t *transaction, |
523 | transaction_t *transaction, | 523 | struct list_head *log_bufs) |
524 | struct list_head *log_bufs, | ||
525 | int write_op) | ||
526 | { | 524 | { |
525 | journal_t *journal = transaction->t_journal; | ||
527 | struct buffer_head *descriptor; | 526 | struct buffer_head *descriptor; |
528 | struct jbd2_revoke_record_s *record; | 527 | struct jbd2_revoke_record_s *record; |
529 | struct jbd2_revoke_table_s *revoke; | 528 | struct jbd2_revoke_table_s *revoke; |
@@ -544,16 +543,15 @@ void jbd2_journal_write_revoke_records(journal_t *journal, | |||
544 | while (!list_empty(hash_list)) { | 543 | while (!list_empty(hash_list)) { |
545 | record = (struct jbd2_revoke_record_s *) | 544 | record = (struct jbd2_revoke_record_s *) |
546 | hash_list->next; | 545 | hash_list->next; |
547 | write_one_revoke_record(journal, transaction, log_bufs, | 546 | write_one_revoke_record(transaction, log_bufs, |
548 | &descriptor, &offset, | 547 | &descriptor, &offset, record); |
549 | record, write_op); | ||
550 | count++; | 548 | count++; |
551 | list_del(&record->hash); | 549 | list_del(&record->hash); |
552 | kmem_cache_free(jbd2_revoke_record_cache, record); | 550 | kmem_cache_free(jbd2_revoke_record_cache, record); |
553 | } | 551 | } |
554 | } | 552 | } |
555 | if (descriptor) | 553 | if (descriptor) |
556 | flush_descriptor(journal, descriptor, offset, write_op); | 554 | flush_descriptor(journal, descriptor, offset); |
557 | jbd_debug(1, "Wrote %d revoke records\n", count); | 555 | jbd_debug(1, "Wrote %d revoke records\n", count); |
558 | } | 556 | } |
559 | 557 | ||
@@ -562,18 +560,16 @@ void jbd2_journal_write_revoke_records(journal_t *journal, | |||
562 | * block if the old one is full or if we have not already created one. | 560 | * block if the old one is full or if we have not already created one. |
563 | */ | 561 | */ |
564 | 562 | ||
565 | static void write_one_revoke_record(journal_t *journal, | 563 | static void write_one_revoke_record(transaction_t *transaction, |
566 | transaction_t *transaction, | ||
567 | struct list_head *log_bufs, | 564 | struct list_head *log_bufs, |
568 | struct buffer_head **descriptorp, | 565 | struct buffer_head **descriptorp, |
569 | int *offsetp, | 566 | int *offsetp, |
570 | struct jbd2_revoke_record_s *record, | 567 | struct jbd2_revoke_record_s *record) |
571 | int write_op) | ||
572 | { | 568 | { |
569 | journal_t *journal = transaction->t_journal; | ||
573 | int csum_size = 0; | 570 | int csum_size = 0; |
574 | struct buffer_head *descriptor; | 571 | struct buffer_head *descriptor; |
575 | int sz, offset; | 572 | int sz, offset; |
576 | journal_header_t *header; | ||
577 | 573 | ||
578 | /* If we are already aborting, this all becomes a noop. We | 574 | /* If we are already aborting, this all becomes a noop. We |
579 | still need to go round the loop in | 575 | still need to go round the loop in |
@@ -587,7 +583,7 @@ static void write_one_revoke_record(journal_t *journal, | |||
587 | 583 | ||
588 | /* Do we need to leave space at the end for a checksum? */ | 584 | /* Do we need to leave space at the end for a checksum? */ |
589 | if (jbd2_journal_has_csum_v2or3(journal)) | 585 | if (jbd2_journal_has_csum_v2or3(journal)) |
590 | csum_size = sizeof(struct jbd2_journal_revoke_tail); | 586 | csum_size = sizeof(struct jbd2_journal_block_tail); |
591 | 587 | ||
592 | if (jbd2_has_feature_64bit(journal)) | 588 | if (jbd2_has_feature_64bit(journal)) |
593 | sz = 8; | 589 | sz = 8; |
@@ -597,19 +593,16 @@ static void write_one_revoke_record(journal_t *journal, | |||
597 | /* Make sure we have a descriptor with space left for the record */ | 593 | /* Make sure we have a descriptor with space left for the record */ |
598 | if (descriptor) { | 594 | if (descriptor) { |
599 | if (offset + sz > journal->j_blocksize - csum_size) { | 595 | if (offset + sz > journal->j_blocksize - csum_size) { |
600 | flush_descriptor(journal, descriptor, offset, write_op); | 596 | flush_descriptor(journal, descriptor, offset); |
601 | descriptor = NULL; | 597 | descriptor = NULL; |
602 | } | 598 | } |
603 | } | 599 | } |
604 | 600 | ||
605 | if (!descriptor) { | 601 | if (!descriptor) { |
606 | descriptor = jbd2_journal_get_descriptor_buffer(journal); | 602 | descriptor = jbd2_journal_get_descriptor_buffer(transaction, |
603 | JBD2_REVOKE_BLOCK); | ||
607 | if (!descriptor) | 604 | if (!descriptor) |
608 | return; | 605 | return; |
609 | header = (journal_header_t *)descriptor->b_data; | ||
610 | header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | ||
611 | header->h_blocktype = cpu_to_be32(JBD2_REVOKE_BLOCK); | ||
612 | header->h_sequence = cpu_to_be32(transaction->t_tid); | ||
613 | 606 | ||
614 | /* Record it so that we can wait for IO completion later */ | 607 | /* Record it so that we can wait for IO completion later */ |
615 | BUFFER_TRACE(descriptor, "file in log_bufs"); | 608 | BUFFER_TRACE(descriptor, "file in log_bufs"); |
@@ -630,21 +623,6 @@ static void write_one_revoke_record(journal_t *journal, | |||
630 | *offsetp = offset; | 623 | *offsetp = offset; |
631 | } | 624 | } |
632 | 625 | ||
633 | static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) | ||
634 | { | ||
635 | struct jbd2_journal_revoke_tail *tail; | ||
636 | __u32 csum; | ||
637 | |||
638 | if (!jbd2_journal_has_csum_v2or3(j)) | ||
639 | return; | ||
640 | |||
641 | tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - | ||
642 | sizeof(struct jbd2_journal_revoke_tail)); | ||
643 | tail->r_checksum = 0; | ||
644 | csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); | ||
645 | tail->r_checksum = cpu_to_be32(csum); | ||
646 | } | ||
647 | |||
648 | /* | 626 | /* |
649 | * Flush a revoke descriptor out to the journal. If we are aborting, | 627 | * Flush a revoke descriptor out to the journal. If we are aborting, |
650 | * this is a noop; otherwise we are generating a buffer which needs to | 628 | * this is a noop; otherwise we are generating a buffer which needs to |
@@ -654,7 +632,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) | |||
654 | 632 | ||
655 | static void flush_descriptor(journal_t *journal, | 633 | static void flush_descriptor(journal_t *journal, |
656 | struct buffer_head *descriptor, | 634 | struct buffer_head *descriptor, |
657 | int offset, int write_op) | 635 | int offset) |
658 | { | 636 | { |
659 | jbd2_journal_revoke_header_t *header; | 637 | jbd2_journal_revoke_header_t *header; |
660 | 638 | ||
@@ -665,12 +643,12 @@ static void flush_descriptor(journal_t *journal, | |||
665 | 643 | ||
666 | header = (jbd2_journal_revoke_header_t *)descriptor->b_data; | 644 | header = (jbd2_journal_revoke_header_t *)descriptor->b_data; |
667 | header->r_count = cpu_to_be32(offset); | 645 | header->r_count = cpu_to_be32(offset); |
668 | jbd2_revoke_csum_set(journal, descriptor); | 646 | jbd2_descriptor_block_csum_set(journal, descriptor); |
669 | 647 | ||
670 | set_buffer_jwrite(descriptor); | 648 | set_buffer_jwrite(descriptor); |
671 | BUFFER_TRACE(descriptor, "write"); | 649 | BUFFER_TRACE(descriptor, "write"); |
672 | set_buffer_dirty(descriptor); | 650 | set_buffer_dirty(descriptor); |
673 | write_dirty_buffer(descriptor, write_op); | 651 | write_dirty_buffer(descriptor, WRITE_SYNC); |
674 | } | 652 | } |
675 | #endif | 653 | #endif |
676 | 654 | ||
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 081dff087fc0..01e4652d88f6 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -966,14 +966,8 @@ repeat: | |||
966 | if (!frozen_buffer) { | 966 | if (!frozen_buffer) { |
967 | JBUFFER_TRACE(jh, "allocate memory for buffer"); | 967 | JBUFFER_TRACE(jh, "allocate memory for buffer"); |
968 | jbd_unlock_bh_state(bh); | 968 | jbd_unlock_bh_state(bh); |
969 | frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); | 969 | frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, |
970 | if (!frozen_buffer) { | 970 | GFP_NOFS | __GFP_NOFAIL); |
971 | printk(KERN_ERR "%s: OOM for frozen_buffer\n", | ||
972 | __func__); | ||
973 | JBUFFER_TRACE(jh, "oom!"); | ||
974 | error = -ENOMEM; | ||
975 | goto out; | ||
976 | } | ||
977 | goto repeat; | 971 | goto repeat; |
978 | } | 972 | } |
979 | jh->b_frozen_data = frozen_buffer; | 973 | jh->b_frozen_data = frozen_buffer; |
@@ -1226,15 +1220,9 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) | |||
1226 | goto out; | 1220 | goto out; |
1227 | 1221 | ||
1228 | repeat: | 1222 | repeat: |
1229 | if (!jh->b_committed_data) { | 1223 | if (!jh->b_committed_data) |
1230 | committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); | 1224 | committed_data = jbd2_alloc(jh2bh(jh)->b_size, |
1231 | if (!committed_data) { | 1225 | GFP_NOFS|__GFP_NOFAIL); |
1232 | printk(KERN_ERR "%s: No memory for committed data\n", | ||
1233 | __func__); | ||
1234 | err = -ENOMEM; | ||
1235 | goto out; | ||
1236 | } | ||
1237 | } | ||
1238 | 1226 | ||
1239 | jbd_lock_bh_state(bh); | 1227 | jbd_lock_bh_state(bh); |
1240 | if (!jh->b_committed_data) { | 1228 | if (!jh->b_committed_data) { |
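Both transaction.c hunks above replace an "allocate, print an OOM message, fail with -ENOMEM" branch with GFP_NOFS | __GFP_NOFAIL, so the error unwinding disappears: the page allocator retries internally instead of returning NULL. What the hunks keep is the surrounding discipline of allocating with the bh state lock dropped and re-checking after retaking it. The helper below is hypothetical, distilled from that pattern and assuming only the jbd2_alloc()/jbd2_free() helpers and bh-state locking visible here.

#include <linux/jbd2.h>

/* Hypothetical: attach a frozen-data copy to a journal_head, allocating
 * outside the bh state lock and tolerating a racing allocation. */
static void attach_frozen_data(struct journal_head *jh)
{
	struct buffer_head *bh = jh2bh(jh);
	char *frozen_buffer = NULL;

repeat:
	jbd_lock_bh_state(bh);
	if (jh->b_frozen_data)		/* lost the race: a copy is already attached */
		goto done;
	if (!frozen_buffer) {
		/* Never allocate while holding the bh state lock. */
		jbd_unlock_bh_state(bh);
		frozen_buffer = jbd2_alloc(bh->b_size,
					   GFP_NOFS | __GFP_NOFAIL);
		goto repeat;		/* cannot fail, so no error path */
	}
	jh->b_frozen_data = frozen_buffer;
	frozen_buffer = NULL;
done:
	jbd_unlock_bh_state(bh);
	if (frozen_buffer)		/* raced and lost: drop the spare copy */
		jbd2_free(frozen_buffer, bh->b_size);
}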
diff --git a/fs/mbcache.c b/fs/mbcache.c index 187477ded6b3..eccda3a02de6 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c | |||
@@ -1,858 +1,433 @@ | |||
1 | /* | 1 | #include <linux/spinlock.h> |
2 | * linux/fs/mbcache.c | ||
3 | * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org> | ||
4 | */ | ||
5 | |||
6 | /* | ||
7 | * Filesystem Meta Information Block Cache (mbcache) | ||
8 | * | ||
9 | * The mbcache caches blocks of block devices that need to be located | ||
10 | * by their device/block number, as well as by other criteria (such | ||
11 | * as the block's contents). | ||
12 | * | ||
13 | * There can only be one cache entry in a cache per device and block number. | ||
14 | * Additional indexes need not be unique in this sense. The number of | ||
15 | * additional indexes (=other criteria) can be hardwired at compile time | ||
16 | * or specified at cache create time. | ||
17 | * | ||
18 | * Each cache entry is of fixed size. An entry may be `valid' or `invalid' | ||
19 | * in the cache. A valid entry is in the main hash tables of the cache, | ||
20 | * and may also be in the lru list. An invalid entry is not in any hashes | ||
21 | * or lists. | ||
22 | * | ||
23 | * A valid cache entry is only in the lru list if no handles refer to it. | ||
24 | * Invalid cache entries will be freed when the last handle to the cache | ||
25 | * entry is released. Entries that cannot be freed immediately are put | ||
26 | * back on the lru list. | ||
27 | */ | ||
28 | |||
29 | /* | ||
30 | * Lock descriptions and usage: | ||
31 | * | ||
32 | * Each hash chain of both the block and index hash tables now contains | ||
33 | * a built-in lock used to serialize accesses to the hash chain. | ||
34 | * | ||
35 | * Accesses to global data structures mb_cache_list and mb_cache_lru_list | ||
36 | * are serialized via the global spinlock mb_cache_spinlock. | ||
37 | * | ||
38 | * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize | ||
39 | * accesses to its local data, such as e_used and e_queued. | ||
40 | * | ||
41 | * Lock ordering: | ||
42 | * | ||
43 | * Each block hash chain's lock has the highest lock order, followed by an | ||
44 | * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's | ||
45 | * lock), and mb_cach_spinlock, with the lowest order. While holding | ||
46 | * either a block or index hash chain lock, a thread can acquire an | ||
47 | * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock. | ||
48 | * | ||
49 | * Synchronization: | ||
50 | * | ||
51 | * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and | ||
52 | * index hash chian, it needs to lock the corresponding hash chain. For each | ||
53 | * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to | ||
54 | * prevent either any simultaneous release or free on the entry and also | ||
55 | * to serialize accesses to either the e_used or e_queued member of the entry. | ||
56 | * | ||
57 | * To avoid having a dangling reference to an already freed | ||
58 | * mb_cache_entry, an mb_cache_entry is only freed when it is not on a | ||
59 | * block hash chain and also no longer being referenced, both e_used, | ||
60 | * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is | ||
61 | * first removed from a block hash chain. | ||
62 | */ | ||
63 | |||
64 | #include <linux/kernel.h> | ||
65 | #include <linux/module.h> | ||
66 | |||
67 | #include <linux/hash.h> | ||
68 | #include <linux/fs.h> | ||
69 | #include <linux/mm.h> | ||
70 | #include <linux/slab.h> | 2 | #include <linux/slab.h> |
71 | #include <linux/sched.h> | 3 | #include <linux/list.h> |
72 | #include <linux/list_bl.h> | 4 | #include <linux/list_bl.h> |
5 | #include <linux/module.h> | ||
6 | #include <linux/sched.h> | ||
7 | #include <linux/workqueue.h> | ||
73 | #include <linux/mbcache.h> | 8 | #include <linux/mbcache.h> |
74 | #include <linux/init.h> | ||
75 | #include <linux/blockgroup_lock.h> | ||
76 | #include <linux/log2.h> | ||
77 | |||
78 | #ifdef MB_CACHE_DEBUG | ||
79 | # define mb_debug(f...) do { \ | ||
80 | printk(KERN_DEBUG f); \ | ||
81 | printk("\n"); \ | ||
82 | } while (0) | ||
83 | #define mb_assert(c) do { if (!(c)) \ | ||
84 | printk(KERN_ERR "assertion " #c " failed\n"); \ | ||
85 | } while(0) | ||
86 | #else | ||
87 | # define mb_debug(f...) do { } while(0) | ||
88 | # define mb_assert(c) do { } while(0) | ||
89 | #endif | ||
90 | #define mb_error(f...) do { \ | ||
91 | printk(KERN_ERR f); \ | ||
92 | printk("\n"); \ | ||
93 | } while(0) | ||
94 | |||
95 | #define MB_CACHE_WRITER ((unsigned short)~0U >> 1) | ||
96 | |||
97 | #define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS) | ||
98 | #define MB_CACHE_ENTRY_LOCK_INDEX(ce) \ | ||
99 | (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS)) | ||
100 | |||
101 | static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue); | ||
102 | static struct blockgroup_lock *mb_cache_bg_lock; | ||
103 | static struct kmem_cache *mb_cache_kmem_cache; | ||
104 | |||
105 | MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>"); | ||
106 | MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); | ||
107 | MODULE_LICENSE("GPL"); | ||
108 | |||
109 | EXPORT_SYMBOL(mb_cache_create); | ||
110 | EXPORT_SYMBOL(mb_cache_shrink); | ||
111 | EXPORT_SYMBOL(mb_cache_destroy); | ||
112 | EXPORT_SYMBOL(mb_cache_entry_alloc); | ||
113 | EXPORT_SYMBOL(mb_cache_entry_insert); | ||
114 | EXPORT_SYMBOL(mb_cache_entry_release); | ||
115 | EXPORT_SYMBOL(mb_cache_entry_free); | ||
116 | EXPORT_SYMBOL(mb_cache_entry_get); | ||
117 | #if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) | ||
118 | EXPORT_SYMBOL(mb_cache_entry_find_first); | ||
119 | EXPORT_SYMBOL(mb_cache_entry_find_next); | ||
120 | #endif | ||
121 | 9 | ||
122 | /* | 10 | /* |
123 | * Global data: list of all mbcaches, lru list, and a spinlock for | 11 | * Mbcache is a simple key-value store. Keys need not be unique; however, |
124 | * accessing cache data structures on SMP machines. The lru list is | 12 | * key-value pairs are expected to be unique (we use this fact in |
125 | * global across all mbcaches. | 13 | * mb_cache_entry_delete_block()). |
14 | * | ||
15 | * Ext2 and ext4 use this cache for deduplication of extended attribute blocks. | ||
16 | * They use a hash of the block contents as the key and the block number as the value. | ||
17 | * That's why keys need not be unique (different xattr blocks may end up having | ||
18 | * the same hash). However, the block number always uniquely identifies a cache | ||
19 | * entry. | ||
20 | * | ||
21 | * We provide functions for creation and removal of entries, search by key, | ||
22 | * and a special "delete entry with given key-value pair" operation. Fixed | ||
23 | * size hash table is used for fast key lookups. | ||
126 | */ | 24 | */ |
127 | 25 | ||
128 | static LIST_HEAD(mb_cache_list); | 26 | struct mb_cache { |
129 | static LIST_HEAD(mb_cache_lru_list); | 27 | /* Hash table of entries */ |
130 | static DEFINE_SPINLOCK(mb_cache_spinlock); | 28 | struct hlist_bl_head *c_hash; |
131 | 29 | /* log2 of hash table size */ | |
132 | static inline void | 30 | int c_bucket_bits; |
133 | __spin_lock_mb_cache_entry(struct mb_cache_entry *ce) | 31 | /* Maximum entries in cache to avoid degrading hash too much */ |
134 | { | 32 | int c_max_entries; |
135 | spin_lock(bgl_lock_ptr(mb_cache_bg_lock, | 33 | /* Protects c_list, c_entry_count */ |
136 | MB_CACHE_ENTRY_LOCK_INDEX(ce))); | 34 | spinlock_t c_list_lock; |
137 | } | 35 | struct list_head c_list; |
138 | 36 | /* Number of entries in cache */ | |
139 | static inline void | 37 | unsigned long c_entry_count; |
140 | __spin_unlock_mb_cache_entry(struct mb_cache_entry *ce) | 38 | struct shrinker c_shrink; |
141 | { | 39 | /* Work for shrinking when the cache has too many entries */ |
142 | spin_unlock(bgl_lock_ptr(mb_cache_bg_lock, | 40 | struct work_struct c_shrink_work; |
143 | MB_CACHE_ENTRY_LOCK_INDEX(ce))); | 41 | }; |
144 | } | ||
145 | |||
146 | static inline int | ||
147 | __mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce) | ||
148 | { | ||
149 | return !hlist_bl_unhashed(&ce->e_block_list); | ||
150 | } | ||
151 | 42 | ||
43 | static struct kmem_cache *mb_entry_cache; | ||
152 | 44 | ||
153 | static inline void | 45 | static unsigned long mb_cache_shrink(struct mb_cache *cache, |
154 | __mb_cache_entry_unhash_block(struct mb_cache_entry *ce) | 46 | unsigned int nr_to_scan); |
155 | { | ||
156 | if (__mb_cache_entry_is_block_hashed(ce)) | ||
157 | hlist_bl_del_init(&ce->e_block_list); | ||
158 | } | ||
159 | 47 | ||
160 | static inline int | 48 | static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, |
161 | __mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) | 49 | u32 key) |
162 | { | 50 | { |
163 | return !hlist_bl_unhashed(&ce->e_index.o_list); | 51 | return &cache->c_hash[hash_32(key, cache->c_bucket_bits)]; |
164 | } | 52 | } |
165 | 53 | ||
166 | static inline void | 54 | /* |
167 | __mb_cache_entry_unhash_index(struct mb_cache_entry *ce) | 55 | * Number of entries to reclaim synchronously when there are too many entries |
168 | { | 56 | * in cache |
169 | if (__mb_cache_entry_is_index_hashed(ce)) | 57 | */ |
170 | hlist_bl_del_init(&ce->e_index.o_list); | 58 | #define SYNC_SHRINK_BATCH 64 |
171 | } | ||
172 | 59 | ||
173 | /* | 60 | /* |
174 | * __mb_cache_entry_unhash_unlock() | 61 | * mb_cache_entry_create - create entry in cache |
175 | * | 62 | * @cache - cache where the entry should be created |
176 | * This function is called to unhash both the block and index hash | 63 | * @mask - gfp mask with which the entry should be allocated |
177 | * chain. | 64 | * @key - key of the entry |
178 | * It assumes both the block and index hash chains are locked upon entry. | 65 | * @block - block that contains data |
179 | * It also unlocks both hash chains upon exit. | 66 | * @reusable - is the block reusable by other inodes? |
67 | * | ||
68 | * Creates entry in @cache with key @key and records that data is stored in | ||
69 | * block @block. The function returns -EBUSY if entry with the same key | ||
70 | * and for the same block already exists in cache. Otherwise 0 is returned. | ||
180 | */ | 71 | */ |
181 | static inline void | 72 | int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, |
182 | __mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce) | 73 | sector_t block, bool reusable) |
183 | { | 74 | { |
184 | __mb_cache_entry_unhash_index(ce); | 75 | struct mb_cache_entry *entry, *dup; |
185 | hlist_bl_unlock(ce->e_index_hash_p); | 76 | struct hlist_bl_node *dup_node; |
186 | __mb_cache_entry_unhash_block(ce); | 77 | struct hlist_bl_head *head; |
187 | hlist_bl_unlock(ce->e_block_hash_p); | 78 | |
79 | /* Schedule background reclaim if there are too many entries */ | ||
80 | if (cache->c_entry_count >= cache->c_max_entries) | ||
81 | schedule_work(&cache->c_shrink_work); | ||
82 | /* Do some sync reclaim if background reclaim cannot keep up */ | ||
83 | if (cache->c_entry_count >= 2*cache->c_max_entries) | ||
84 | mb_cache_shrink(cache, SYNC_SHRINK_BATCH); | ||
85 | |||
86 | entry = kmem_cache_alloc(mb_entry_cache, mask); | ||
87 | if (!entry) | ||
88 | return -ENOMEM; | ||
89 | |||
90 | INIT_LIST_HEAD(&entry->e_list); | ||
91 | /* One ref for hash, one ref returned */ | ||
92 | atomic_set(&entry->e_refcnt, 1); | ||
93 | entry->e_key = key; | ||
94 | entry->e_block = block; | ||
95 | entry->e_reusable = reusable; | ||
96 | head = mb_cache_entry_head(cache, key); | ||
97 | hlist_bl_lock(head); | ||
98 | hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) { | ||
99 | if (dup->e_key == key && dup->e_block == block) { | ||
100 | hlist_bl_unlock(head); | ||
101 | kmem_cache_free(mb_entry_cache, entry); | ||
102 | return -EBUSY; | ||
103 | } | ||
104 | } | ||
105 | hlist_bl_add_head(&entry->e_hash_list, head); | ||
106 | hlist_bl_unlock(head); | ||
107 | |||
108 | spin_lock(&cache->c_list_lock); | ||
109 | list_add_tail(&entry->e_list, &cache->c_list); | ||
110 | /* Grab ref for LRU list */ | ||
111 | atomic_inc(&entry->e_refcnt); | ||
112 | cache->c_entry_count++; | ||
113 | spin_unlock(&cache->c_list_lock); | ||
114 | |||
115 | return 0; | ||
188 | } | 116 | } |
117 | EXPORT_SYMBOL(mb_cache_entry_create); | ||
189 | 118 | ||
190 | static void | 119 | void __mb_cache_entry_free(struct mb_cache_entry *entry) |
191 | __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) | ||
192 | { | 120 | { |
193 | struct mb_cache *cache = ce->e_cache; | 121 | kmem_cache_free(mb_entry_cache, entry); |
194 | |||
195 | mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))); | ||
196 | kmem_cache_free(cache->c_entry_cache, ce); | ||
197 | atomic_dec(&cache->c_entry_count); | ||
198 | } | 122 | } |
123 | EXPORT_SYMBOL(__mb_cache_entry_free); | ||
199 | 124 | ||
200 | static void | 125 | static struct mb_cache_entry *__entry_find(struct mb_cache *cache, |
201 | __mb_cache_entry_release(struct mb_cache_entry *ce) | 126 | struct mb_cache_entry *entry, |
127 | u32 key) | ||
202 | { | 128 | { |
203 | /* First lock the entry to serialize access to its local data. */ | 129 | struct mb_cache_entry *old_entry = entry; |
204 | __spin_lock_mb_cache_entry(ce); | 130 | struct hlist_bl_node *node; |
205 | /* Wake up all processes queuing for this cache entry. */ | 131 | struct hlist_bl_head *head; |
206 | if (ce->e_queued) | 132 | |
207 | wake_up_all(&mb_cache_queue); | 133 | head = mb_cache_entry_head(cache, key); |
208 | if (ce->e_used >= MB_CACHE_WRITER) | 134 | hlist_bl_lock(head); |
209 | ce->e_used -= MB_CACHE_WRITER; | 135 | if (entry && !hlist_bl_unhashed(&entry->e_hash_list)) |
210 | /* | 136 | node = entry->e_hash_list.next; |
211 | * Make sure that all cache entries on lru_list have | 137 | else |
212 | * both e_used and e_queued equal to 0. | 138 | node = hlist_bl_first(head); |
213 | */ | 139 | while (node) { |
214 | ce->e_used--; | 140 | entry = hlist_bl_entry(node, struct mb_cache_entry, |
215 | if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) { | 141 | e_hash_list); |
216 | if (!__mb_cache_entry_is_block_hashed(ce)) { | 142 | if (entry->e_key == key && entry->e_reusable) { |
217 | __spin_unlock_mb_cache_entry(ce); | 143 | atomic_inc(&entry->e_refcnt); |
218 | goto forget; | 144 | goto out; |
219 | } | 145 | } |
220 | /* | 146 | node = node->next; |
221 | * Need access to lru list, first drop entry lock, | ||
222 | * then reacquire the lock in the proper order. | ||
223 | */ | ||
224 | spin_lock(&mb_cache_spinlock); | ||
225 | if (list_empty(&ce->e_lru_list)) | ||
226 | list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); | ||
227 | spin_unlock(&mb_cache_spinlock); | ||
228 | } | 147 | } |
229 | __spin_unlock_mb_cache_entry(ce); | 148 | entry = NULL; |
230 | return; | 149 | out: |
231 | forget: | 150 | hlist_bl_unlock(head); |
232 | mb_assert(list_empty(&ce->e_lru_list)); | 151 | if (old_entry) |
233 | __mb_cache_entry_forget(ce, GFP_KERNEL); | 152 | mb_cache_entry_put(cache, old_entry); |
153 | |||
154 | return entry; | ||
234 | } | 155 | } |
235 | 156 | ||
236 | /* | 157 | /* |
237 | * mb_cache_shrink_scan() memory pressure callback | 158 | * mb_cache_entry_find_first - find the first entry in cache with given key |
238 | * | 159 | * @cache: cache where we should search |
239 | * This function is called by the kernel memory management when memory | 160 | * @key: key to look for |
240 | * gets low. | ||
241 | * | 161 | * |
242 | * @shrink: (ignored) | 162 | * Search in @cache for entry with key @key. Grabs reference to the first |
243 | * @sc: shrink_control passed from reclaim | 163 | * entry found and returns the entry. |
244 | * | ||
245 | * Returns the number of objects freed. | ||
246 | */ | 164 | */ |
247 | static unsigned long | 165 | struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, |
248 | mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) | 166 | u32 key) |
249 | { | 167 | { |
250 | LIST_HEAD(free_list); | 168 | return __entry_find(cache, NULL, key); |
251 | struct mb_cache_entry *entry, *tmp; | ||
252 | int nr_to_scan = sc->nr_to_scan; | ||
253 | gfp_t gfp_mask = sc->gfp_mask; | ||
254 | unsigned long freed = 0; | ||
255 | |||
256 | mb_debug("trying to free %d entries", nr_to_scan); | ||
257 | spin_lock(&mb_cache_spinlock); | ||
258 | while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) { | ||
259 | struct mb_cache_entry *ce = | ||
260 | list_entry(mb_cache_lru_list.next, | ||
261 | struct mb_cache_entry, e_lru_list); | ||
262 | list_del_init(&ce->e_lru_list); | ||
263 | if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)) | ||
264 | continue; | ||
265 | spin_unlock(&mb_cache_spinlock); | ||
266 | /* Prevent any find or get operation on the entry */ | ||
267 | hlist_bl_lock(ce->e_block_hash_p); | ||
268 | hlist_bl_lock(ce->e_index_hash_p); | ||
269 | /* Ignore if it is touched by a find/get */ | ||
270 | if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) || | ||
271 | !list_empty(&ce->e_lru_list)) { | ||
272 | hlist_bl_unlock(ce->e_index_hash_p); | ||
273 | hlist_bl_unlock(ce->e_block_hash_p); | ||
274 | spin_lock(&mb_cache_spinlock); | ||
275 | continue; | ||
276 | } | ||
277 | __mb_cache_entry_unhash_unlock(ce); | ||
278 | list_add_tail(&ce->e_lru_list, &free_list); | ||
279 | spin_lock(&mb_cache_spinlock); | ||
280 | } | ||
281 | spin_unlock(&mb_cache_spinlock); | ||
282 | |||
283 | list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { | ||
284 | __mb_cache_entry_forget(entry, gfp_mask); | ||
285 | freed++; | ||
286 | } | ||
287 | return freed; | ||
288 | } | 169 | } |
170 | EXPORT_SYMBOL(mb_cache_entry_find_first); | ||
289 | 171 | ||
290 | static unsigned long | 172 | /* |
291 | mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) | 173 | * mb_cache_entry_find_next - find next entry in cache with the same key |
174 | * @cache: cache where we should search | ||
175 | * @entry: entry to start search from | ||
176 | * | ||
177 | * Finds next entry in the hash chain which has the same key as @entry. | ||
178 | * If @entry is unhashed (which can happen when deletion of entry races | ||
179 | * with the search), finds the first entry in the hash chain. The function | ||
180 | * drops reference to @entry and returns with a reference to the found entry. | ||
181 | */ | ||
182 | struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, | ||
183 | struct mb_cache_entry *entry) | ||
292 | { | 184 | { |
293 | struct mb_cache *cache; | 185 | return __entry_find(cache, entry, entry->e_key); |
294 | unsigned long count = 0; | ||
295 | |||
296 | spin_lock(&mb_cache_spinlock); | ||
297 | list_for_each_entry(cache, &mb_cache_list, c_cache_list) { | ||
298 | mb_debug("cache %s (%d)", cache->c_name, | ||
299 | atomic_read(&cache->c_entry_count)); | ||
300 | count += atomic_read(&cache->c_entry_count); | ||
301 | } | ||
302 | spin_unlock(&mb_cache_spinlock); | ||
303 | |||
304 | return vfs_pressure_ratio(count); | ||
305 | } | 186 | } |
306 | 187 | EXPORT_SYMBOL(mb_cache_entry_find_next); | |
307 | static struct shrinker mb_cache_shrinker = { | ||
308 | .count_objects = mb_cache_shrink_count, | ||
309 | .scan_objects = mb_cache_shrink_scan, | ||
310 | .seeks = DEFAULT_SEEKS, | ||
311 | }; | ||
312 | 188 | ||
313 | /* | 189 | /* |
314 | * mb_cache_create() create a new cache | 190 | * mb_cache_entry_get - get a cache entry by block number (and key) |
315 | * | 191 | * @cache - cache we work with |
316 | * All entries in one cache are equal size. Cache entries may be from | 192 | * @key - key of block number @block |
317 | * multiple devices. If this is the first mbcache created, registers | 193 | * @block - block number |
318 | * the cache with kernel memory management. Returns NULL if no more | ||
319 | * memory was available. | ||
320 | * | ||
321 | * @name: name of the cache (informal) | ||
322 | * @bucket_bits: log2(number of hash buckets) | ||
323 | */ | 194 | */ |
324 | struct mb_cache * | 195 | struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, |
325 | mb_cache_create(const char *name, int bucket_bits) | 196 | sector_t block) |
326 | { | 197 | { |
327 | int n, bucket_count = 1 << bucket_bits; | 198 | struct hlist_bl_node *node; |
328 | struct mb_cache *cache = NULL; | 199 | struct hlist_bl_head *head; |
329 | 200 | struct mb_cache_entry *entry; | |
330 | if (!mb_cache_bg_lock) { | 201 | |
331 | mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock), | 202 | head = mb_cache_entry_head(cache, key); |
332 | GFP_KERNEL); | 203 | hlist_bl_lock(head); |
333 | if (!mb_cache_bg_lock) | 204 | hlist_bl_for_each_entry(entry, node, head, e_hash_list) { |
334 | return NULL; | 205 | if (entry->e_key == key && entry->e_block == block) { |
335 | bgl_lock_init(mb_cache_bg_lock); | 206 | atomic_inc(&entry->e_refcnt); |
336 | } | 207 | goto out; |
337 | 208 | } | |
338 | cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL); | ||
339 | if (!cache) | ||
340 | return NULL; | ||
341 | cache->c_name = name; | ||
342 | atomic_set(&cache->c_entry_count, 0); | ||
343 | cache->c_bucket_bits = bucket_bits; | ||
344 | cache->c_block_hash = kmalloc(bucket_count * | ||
345 | sizeof(struct hlist_bl_head), GFP_KERNEL); | ||
346 | if (!cache->c_block_hash) | ||
347 | goto fail; | ||
348 | for (n=0; n<bucket_count; n++) | ||
349 | INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]); | ||
350 | cache->c_index_hash = kmalloc(bucket_count * | ||
351 | sizeof(struct hlist_bl_head), GFP_KERNEL); | ||
352 | if (!cache->c_index_hash) | ||
353 | goto fail; | ||
354 | for (n=0; n<bucket_count; n++) | ||
355 | INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]); | ||
356 | if (!mb_cache_kmem_cache) { | ||
357 | mb_cache_kmem_cache = kmem_cache_create(name, | ||
358 | sizeof(struct mb_cache_entry), 0, | ||
359 | SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); | ||
360 | if (!mb_cache_kmem_cache) | ||
361 | goto fail2; | ||
362 | } | 209 | } |
363 | cache->c_entry_cache = mb_cache_kmem_cache; | 210 | entry = NULL; |
364 | 211 | out: | |
365 | /* | 212 | hlist_bl_unlock(head); |
366 | * Set an upper limit on the number of cache entries so that the hash | 213 | return entry; |
367 | * chains won't grow too long. | ||
368 | */ | ||
369 | cache->c_max_entries = bucket_count << 4; | ||
370 | |||
371 | spin_lock(&mb_cache_spinlock); | ||
372 | list_add(&cache->c_cache_list, &mb_cache_list); | ||
373 | spin_unlock(&mb_cache_spinlock); | ||
374 | return cache; | ||
375 | |||
376 | fail2: | ||
377 | kfree(cache->c_index_hash); | ||
378 | |||
379 | fail: | ||
380 | kfree(cache->c_block_hash); | ||
381 | kfree(cache); | ||
382 | return NULL; | ||
383 | } | 214 | } |
215 | EXPORT_SYMBOL(mb_cache_entry_get); | ||
384 | 216 | ||
385 | 217 | /* mb_cache_entry_delete_block - remove information about block from cache | |
386 | /* | 218 | * @cache - cache we work with |
387 | * mb_cache_shrink() | 219 | * @key - key of block @block |
388 | * | 220 | * @block - block number |
389 | * Removes all cache entries of a device from the cache. All cache entries | ||
390 | * currently in use cannot be freed, and thus remain in the cache. All others | ||
391 | * are freed. | ||
392 | * | 221 | * |
393 | * @bdev: which device's cache entries to shrink | 222 | * Remove entry from cache @cache with key @key and data stored in @block. |
394 | */ | 223 | */ |
395 | void | 224 | void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, |
396 | mb_cache_shrink(struct block_device *bdev) | 225 | sector_t block) |
397 | { | 226 | { |
398 | LIST_HEAD(free_list); | 227 | struct hlist_bl_node *node; |
399 | struct list_head *l; | 228 | struct hlist_bl_head *head; |
400 | struct mb_cache_entry *ce, *tmp; | 229 | struct mb_cache_entry *entry; |
401 | 230 | ||
402 | l = &mb_cache_lru_list; | 231 | head = mb_cache_entry_head(cache, key); |
403 | spin_lock(&mb_cache_spinlock); | 232 | hlist_bl_lock(head); |
404 | while (!list_is_last(l, &mb_cache_lru_list)) { | 233 | hlist_bl_for_each_entry(entry, node, head, e_hash_list) { |
405 | l = l->next; | 234 | if (entry->e_key == key && entry->e_block == block) { |
406 | ce = list_entry(l, struct mb_cache_entry, e_lru_list); | 235 | /* We keep hash list reference to keep entry alive */ |
407 | if (ce->e_bdev == bdev) { | 236 | hlist_bl_del_init(&entry->e_hash_list); |
408 | list_del_init(&ce->e_lru_list); | 237 | hlist_bl_unlock(head); |
409 | if (ce->e_used || ce->e_queued || | 238 | spin_lock(&cache->c_list_lock); |
410 | atomic_read(&ce->e_refcnt)) | 239 | if (!list_empty(&entry->e_list)) { |
411 | continue; | 240 | list_del_init(&entry->e_list); |
412 | spin_unlock(&mb_cache_spinlock); | 241 | cache->c_entry_count--; |
413 | /* | 242 | atomic_dec(&entry->e_refcnt); |
414 | * Prevent any find or get operation on the entry. | ||
415 | */ | ||
416 | hlist_bl_lock(ce->e_block_hash_p); | ||
417 | hlist_bl_lock(ce->e_index_hash_p); | ||
418 | /* Ignore if it is touched by a find/get */ | ||
419 | if (ce->e_used || ce->e_queued || | ||
420 | atomic_read(&ce->e_refcnt) || | ||
421 | !list_empty(&ce->e_lru_list)) { | ||
422 | hlist_bl_unlock(ce->e_index_hash_p); | ||
423 | hlist_bl_unlock(ce->e_block_hash_p); | ||
424 | l = &mb_cache_lru_list; | ||
425 | spin_lock(&mb_cache_spinlock); | ||
426 | continue; | ||
427 | } | 243 | } |
428 | __mb_cache_entry_unhash_unlock(ce); | 244 | spin_unlock(&cache->c_list_lock); |
429 | mb_assert(!(ce->e_used || ce->e_queued || | 245 | mb_cache_entry_put(cache, entry); |
430 | atomic_read(&ce->e_refcnt))); | 246 | return; |
431 | list_add_tail(&ce->e_lru_list, &free_list); | ||
432 | l = &mb_cache_lru_list; | ||
433 | spin_lock(&mb_cache_spinlock); | ||
434 | } | 247 | } |
435 | } | 248 | } |
436 | spin_unlock(&mb_cache_spinlock); | 249 | hlist_bl_unlock(head); |
437 | |||
438 | list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { | ||
439 | __mb_cache_entry_forget(ce, GFP_KERNEL); | ||
440 | } | ||
441 | } | 250 | } |
251 | EXPORT_SYMBOL(mb_cache_entry_delete_block); | ||
442 | 252 | ||
443 | 253 | /* mb_cache_entry_touch - cache entry got used | |
444 | /* | 254 | * @cache - cache the entry belongs to |
445 | * mb_cache_destroy() | 255 | * @entry - entry that got used |
446 | * | 256 | * |
447 | * Shrinks the cache to its minimum possible size (hopefully 0 entries), | 257 | * Marks entry as used to give it higher chances of surviving in cache. |
448 | * and then destroys it. If this was the last mbcache, un-registers the | ||
449 | * mbcache from kernel memory management. | ||
450 | */ | 258 | */ |
451 | void | 259 | void mb_cache_entry_touch(struct mb_cache *cache, |
452 | mb_cache_destroy(struct mb_cache *cache) | 260 | struct mb_cache_entry *entry) |
453 | { | 261 | { |
454 | LIST_HEAD(free_list); | 262 | entry->e_referenced = 1; |
455 | struct mb_cache_entry *ce, *tmp; | ||
456 | |||
457 | spin_lock(&mb_cache_spinlock); | ||
458 | list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) { | ||
459 | if (ce->e_cache == cache) | ||
460 | list_move_tail(&ce->e_lru_list, &free_list); | ||
461 | } | ||
462 | list_del(&cache->c_cache_list); | ||
463 | spin_unlock(&mb_cache_spinlock); | ||
464 | |||
465 | list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) { | ||
466 | list_del_init(&ce->e_lru_list); | ||
467 | /* | ||
468 | * Prevent any find or get operation on the entry. | ||
469 | */ | ||
470 | hlist_bl_lock(ce->e_block_hash_p); | ||
471 | hlist_bl_lock(ce->e_index_hash_p); | ||
472 | mb_assert(!(ce->e_used || ce->e_queued || | ||
473 | atomic_read(&ce->e_refcnt))); | ||
474 | __mb_cache_entry_unhash_unlock(ce); | ||
475 | __mb_cache_entry_forget(ce, GFP_KERNEL); | ||
476 | } | ||
477 | |||
478 | if (atomic_read(&cache->c_entry_count) > 0) { | ||
479 | mb_error("cache %s: %d orphaned entries", | ||
480 | cache->c_name, | ||
481 | atomic_read(&cache->c_entry_count)); | ||
482 | } | ||
483 | |||
484 | if (list_empty(&mb_cache_list)) { | ||
485 | kmem_cache_destroy(mb_cache_kmem_cache); | ||
486 | mb_cache_kmem_cache = NULL; | ||
487 | } | ||
488 | kfree(cache->c_index_hash); | ||
489 | kfree(cache->c_block_hash); | ||
490 | kfree(cache); | ||
491 | } | 263 | } |
264 | EXPORT_SYMBOL(mb_cache_entry_touch); | ||
492 | 265 | ||
493 | /* | 266 | static unsigned long mb_cache_count(struct shrinker *shrink, |
494 | * mb_cache_entry_alloc() | 267 | struct shrink_control *sc) |
495 | * | ||
496 | * Allocates a new cache entry. The new entry will not be valid initially, | ||
497 | * and thus cannot be looked up yet. It should be filled with data, and | ||
498 | * then inserted into the cache using mb_cache_entry_insert(). Returns NULL | ||
499 | * if no more memory was available. | ||
500 | */ | ||
501 | struct mb_cache_entry * | ||
502 | mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) | ||
503 | { | 268 | { |
504 | struct mb_cache_entry *ce; | 269 | struct mb_cache *cache = container_of(shrink, struct mb_cache, |
505 | 270 | c_shrink); | |
506 | if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { | ||
507 | struct list_head *l; | ||
508 | |||
509 | l = &mb_cache_lru_list; | ||
510 | spin_lock(&mb_cache_spinlock); | ||
511 | while (!list_is_last(l, &mb_cache_lru_list)) { | ||
512 | l = l->next; | ||
513 | ce = list_entry(l, struct mb_cache_entry, e_lru_list); | ||
514 | if (ce->e_cache == cache) { | ||
515 | list_del_init(&ce->e_lru_list); | ||
516 | if (ce->e_used || ce->e_queued || | ||
517 | atomic_read(&ce->e_refcnt)) | ||
518 | continue; | ||
519 | spin_unlock(&mb_cache_spinlock); | ||
520 | /* | ||
521 | * Prevent any find or get operation on the | ||
522 | * entry. | ||
523 | */ | ||
524 | hlist_bl_lock(ce->e_block_hash_p); | ||
525 | hlist_bl_lock(ce->e_index_hash_p); | ||
526 | /* Ignore if it is touched by a find/get */ | ||
527 | if (ce->e_used || ce->e_queued || | ||
528 | atomic_read(&ce->e_refcnt) || | ||
529 | !list_empty(&ce->e_lru_list)) { | ||
530 | hlist_bl_unlock(ce->e_index_hash_p); | ||
531 | hlist_bl_unlock(ce->e_block_hash_p); | ||
532 | l = &mb_cache_lru_list; | ||
533 | spin_lock(&mb_cache_spinlock); | ||
534 | continue; | ||
535 | } | ||
536 | mb_assert(list_empty(&ce->e_lru_list)); | ||
537 | mb_assert(!(ce->e_used || ce->e_queued || | ||
538 | atomic_read(&ce->e_refcnt))); | ||
539 | __mb_cache_entry_unhash_unlock(ce); | ||
540 | goto found; | ||
541 | } | ||
542 | } | ||
543 | spin_unlock(&mb_cache_spinlock); | ||
544 | } | ||
545 | 271 | ||
546 | ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); | 272 | return cache->c_entry_count; |
547 | if (!ce) | ||
548 | return NULL; | ||
549 | atomic_inc(&cache->c_entry_count); | ||
550 | INIT_LIST_HEAD(&ce->e_lru_list); | ||
551 | INIT_HLIST_BL_NODE(&ce->e_block_list); | ||
552 | INIT_HLIST_BL_NODE(&ce->e_index.o_list); | ||
553 | ce->e_cache = cache; | ||
554 | ce->e_queued = 0; | ||
555 | atomic_set(&ce->e_refcnt, 0); | ||
556 | found: | ||
557 | ce->e_block_hash_p = &cache->c_block_hash[0]; | ||
558 | ce->e_index_hash_p = &cache->c_index_hash[0]; | ||
559 | ce->e_used = 1 + MB_CACHE_WRITER; | ||
560 | return ce; | ||
561 | } | 273 | } |
562 | 274 | ||
563 | 275 | /* Shrink number of entries in cache */ | |
564 | /* | 276 | static unsigned long mb_cache_shrink(struct mb_cache *cache, |
565 | * mb_cache_entry_insert() | 277 | unsigned int nr_to_scan) |
566 | * | ||
567 | * Inserts an entry that was allocated using mb_cache_entry_alloc() into | ||
568 | * the cache. After this, the cache entry can be looked up, but is not yet | ||
569 | * in the lru list as the caller still holds a handle to it. Returns 0 on | ||
570 | * success, or -EBUSY if a cache entry for that device + inode exists | ||
571 | * already (this may happen after a failed lookup, but when another process | ||
572 | * has inserted the same cache entry in the meantime). | ||
573 | * | ||
574 | * @bdev: device the cache entry belongs to | ||
575 | * @block: block number | ||
576 | * @key: lookup key | ||
577 | */ | ||
578 | int | ||
579 | mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, | ||
580 | sector_t block, unsigned int key) | ||
581 | { | 278 | { |
582 | struct mb_cache *cache = ce->e_cache; | 279 | struct mb_cache_entry *entry; |
583 | unsigned int bucket; | 280 | struct hlist_bl_head *head; |
584 | struct hlist_bl_node *l; | 281 | unsigned int shrunk = 0; |
585 | struct hlist_bl_head *block_hash_p; | 282 | |
586 | struct hlist_bl_head *index_hash_p; | 283 | spin_lock(&cache->c_list_lock); |
587 | struct mb_cache_entry *lce; | 284 | while (nr_to_scan-- && !list_empty(&cache->c_list)) { |
588 | 285 | entry = list_first_entry(&cache->c_list, | |
589 | mb_assert(ce); | 286 | struct mb_cache_entry, e_list); |
590 | bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), | 287 | if (entry->e_referenced) { |
591 | cache->c_bucket_bits); | 288 | entry->e_referenced = 0; |
592 | block_hash_p = &cache->c_block_hash[bucket]; | 289 | list_move_tail(&entry->e_list, &cache->c_list); |
593 | hlist_bl_lock(block_hash_p); | 290 | continue; |
594 | hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) { | ||
595 | if (lce->e_bdev == bdev && lce->e_block == block) { | ||
596 | hlist_bl_unlock(block_hash_p); | ||
597 | return -EBUSY; | ||
598 | } | 291 | } |
292 | list_del_init(&entry->e_list); | ||
293 | cache->c_entry_count--; | ||
294 | /* | ||
295 | * We keep LRU list reference so that entry doesn't go away | ||
296 | * from under us. | ||
297 | */ | ||
298 | spin_unlock(&cache->c_list_lock); | ||
299 | head = mb_cache_entry_head(cache, entry->e_key); | ||
300 | hlist_bl_lock(head); | ||
301 | if (!hlist_bl_unhashed(&entry->e_hash_list)) { | ||
302 | hlist_bl_del_init(&entry->e_hash_list); | ||
303 | atomic_dec(&entry->e_refcnt); | ||
304 | } | ||
305 | hlist_bl_unlock(head); | ||
306 | if (mb_cache_entry_put(cache, entry)) | ||
307 | shrunk++; | ||
308 | cond_resched(); | ||
309 | spin_lock(&cache->c_list_lock); | ||
599 | } | 310 | } |
600 | mb_assert(!__mb_cache_entry_is_block_hashed(ce)); | 311 | spin_unlock(&cache->c_list_lock); |
601 | __mb_cache_entry_unhash_block(ce); | ||
602 | __mb_cache_entry_unhash_index(ce); | ||
603 | ce->e_bdev = bdev; | ||
604 | ce->e_block = block; | ||
605 | ce->e_block_hash_p = block_hash_p; | ||
606 | ce->e_index.o_key = key; | ||
607 | hlist_bl_add_head(&ce->e_block_list, block_hash_p); | ||
608 | hlist_bl_unlock(block_hash_p); | ||
609 | bucket = hash_long(key, cache->c_bucket_bits); | ||
610 | index_hash_p = &cache->c_index_hash[bucket]; | ||
611 | hlist_bl_lock(index_hash_p); | ||
612 | ce->e_index_hash_p = index_hash_p; | ||
613 | hlist_bl_add_head(&ce->e_index.o_list, index_hash_p); | ||
614 | hlist_bl_unlock(index_hash_p); | ||
615 | return 0; | ||
616 | } | ||
617 | 312 | ||
313 | return shrunk; | ||
314 | } | ||
618 | 315 | ||
619 | /* | 316 | static unsigned long mb_cache_scan(struct shrinker *shrink, |
620 | * mb_cache_entry_release() | 317 | struct shrink_control *sc) |
621 | * | ||
622 | * Release a handle to a cache entry. When the last handle to a cache entry | ||
623 | * is released it is either freed (if it is invalid) or otherwise inserted | ||
624 | * into the lru list. | ||
625 | */ | ||
626 | void | ||
627 | mb_cache_entry_release(struct mb_cache_entry *ce) | ||
628 | { | 318 | { |
629 | __mb_cache_entry_release(ce); | 319 | int nr_to_scan = sc->nr_to_scan; |
320 | struct mb_cache *cache = container_of(shrink, struct mb_cache, | ||
321 | c_shrink); | ||
322 | return mb_cache_shrink(cache, nr_to_scan); | ||
630 | } | 323 | } |
631 | 324 | ||
325 | /* We shrink 1/X of the cache when we have too many entries in it */ | ||
326 | #define SHRINK_DIVISOR 16 | ||
632 | 327 | ||
633 | /* | 328 | static void mb_cache_shrink_worker(struct work_struct *work) |
634 | * mb_cache_entry_free() | ||
635 | * | ||
636 | */ | ||
637 | void | ||
638 | mb_cache_entry_free(struct mb_cache_entry *ce) | ||
639 | { | 329 | { |
640 | mb_assert(ce); | 330 | struct mb_cache *cache = container_of(work, struct mb_cache, |
641 | mb_assert(list_empty(&ce->e_lru_list)); | 331 | c_shrink_work); |
642 | hlist_bl_lock(ce->e_index_hash_p); | 332 | mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR); |
643 | __mb_cache_entry_unhash_index(ce); | ||
644 | hlist_bl_unlock(ce->e_index_hash_p); | ||
645 | hlist_bl_lock(ce->e_block_hash_p); | ||
646 | __mb_cache_entry_unhash_block(ce); | ||
647 | hlist_bl_unlock(ce->e_block_hash_p); | ||
648 | __mb_cache_entry_release(ce); | ||
649 | } | 333 | } |
650 | 334 | ||
651 | |||
652 | /* | 335 | /* |
653 | * mb_cache_entry_get() | 336 | * mb_cache_create - create cache |
337 | * @bucket_bits: log2 of the hash table size | ||
654 | * | 338 | * |
655 | * Get a cache entry by device / block number. (There can only be one entry | 339 | * Create cache for keys with a hash table of 2^bucket_bits buckets. |
656 | * in the cache per device and block.) Returns NULL if no such cache entry | ||
657 | * exists. The returned cache entry is locked for exclusive access ("single | ||
658 | * writer"). | ||
659 | */ | 340 | */ |
660 | struct mb_cache_entry * | 341 | struct mb_cache *mb_cache_create(int bucket_bits) |
661 | mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, | ||
662 | sector_t block) | ||
663 | { | 342 | { |
664 | unsigned int bucket; | 343 | struct mb_cache *cache; |
665 | struct hlist_bl_node *l; | 344 | int bucket_count = 1 << bucket_bits; |
666 | struct mb_cache_entry *ce; | 345 | int i; |
667 | struct hlist_bl_head *block_hash_p; | ||
668 | |||
669 | bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), | ||
670 | cache->c_bucket_bits); | ||
671 | block_hash_p = &cache->c_block_hash[bucket]; | ||
672 | /* First serialize access to the block corresponding hash chain. */ | ||
673 | hlist_bl_lock(block_hash_p); | ||
674 | hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) { | ||
675 | mb_assert(ce->e_block_hash_p == block_hash_p); | ||
676 | if (ce->e_bdev == bdev && ce->e_block == block) { | ||
677 | /* | ||
678 | * Prevent a free from removing the entry. | ||
679 | */ | ||
680 | atomic_inc(&ce->e_refcnt); | ||
681 | hlist_bl_unlock(block_hash_p); | ||
682 | __spin_lock_mb_cache_entry(ce); | ||
683 | atomic_dec(&ce->e_refcnt); | ||
684 | if (ce->e_used > 0) { | ||
685 | DEFINE_WAIT(wait); | ||
686 | while (ce->e_used > 0) { | ||
687 | ce->e_queued++; | ||
688 | prepare_to_wait(&mb_cache_queue, &wait, | ||
689 | TASK_UNINTERRUPTIBLE); | ||
690 | __spin_unlock_mb_cache_entry(ce); | ||
691 | schedule(); | ||
692 | __spin_lock_mb_cache_entry(ce); | ||
693 | ce->e_queued--; | ||
694 | } | ||
695 | finish_wait(&mb_cache_queue, &wait); | ||
696 | } | ||
697 | ce->e_used += 1 + MB_CACHE_WRITER; | ||
698 | __spin_unlock_mb_cache_entry(ce); | ||
699 | 346 | ||
700 | if (!list_empty(&ce->e_lru_list)) { | 347 | if (!try_module_get(THIS_MODULE)) |
701 | spin_lock(&mb_cache_spinlock); | 348 | return NULL; |
702 | list_del_init(&ce->e_lru_list); | 349 | |
703 | spin_unlock(&mb_cache_spinlock); | 350 | cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL); |
704 | } | 351 | if (!cache) |
705 | if (!__mb_cache_entry_is_block_hashed(ce)) { | 352 | goto err_out; |
706 | __mb_cache_entry_release(ce); | 353 | cache->c_bucket_bits = bucket_bits; |
707 | return NULL; | 354 | cache->c_max_entries = bucket_count << 4; |
708 | } | 355 | INIT_LIST_HEAD(&cache->c_list); |
709 | return ce; | 356 | spin_lock_init(&cache->c_list_lock); |
710 | } | 357 | cache->c_hash = kmalloc(bucket_count * sizeof(struct hlist_bl_head), |
358 | GFP_KERNEL); | ||
359 | if (!cache->c_hash) { | ||
360 | kfree(cache); | ||
361 | goto err_out; | ||
711 | } | 362 | } |
712 | hlist_bl_unlock(block_hash_p); | 363 | for (i = 0; i < bucket_count; i++) |
713 | return NULL; | 364 | INIT_HLIST_BL_HEAD(&cache->c_hash[i]); |
714 | } | ||
715 | 365 | ||
716 | #if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) | 366 | cache->c_shrink.count_objects = mb_cache_count; |
367 | cache->c_shrink.scan_objects = mb_cache_scan; | ||
368 | cache->c_shrink.seeks = DEFAULT_SEEKS; | ||
369 | register_shrinker(&cache->c_shrink); | ||
717 | 370 | ||
718 | static struct mb_cache_entry * | 371 | INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); |
719 | __mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head, | ||
720 | struct block_device *bdev, unsigned int key) | ||
721 | { | ||
722 | 372 | ||
723 | /* The index hash chain is already acquired by the caller. */ | 373 | return cache; |
724 | while (l != NULL) { | 374 | |
725 | struct mb_cache_entry *ce = | 375 | err_out: |
726 | hlist_bl_entry(l, struct mb_cache_entry, | 376 | module_put(THIS_MODULE); |
727 | e_index.o_list); | ||
728 | mb_assert(ce->e_index_hash_p == head); | ||
729 | if (ce->e_bdev == bdev && ce->e_index.o_key == key) { | ||
730 | /* | ||
731 | * Prevent a free from removing the entry. | ||
732 | */ | ||
733 | atomic_inc(&ce->e_refcnt); | ||
734 | hlist_bl_unlock(head); | ||
735 | __spin_lock_mb_cache_entry(ce); | ||
736 | atomic_dec(&ce->e_refcnt); | ||
737 | ce->e_used++; | ||
738 | /* Incrementing before holding the lock gives readers | ||
739 | priority over writers. */ | ||
740 | if (ce->e_used >= MB_CACHE_WRITER) { | ||
741 | DEFINE_WAIT(wait); | ||
742 | |||
743 | while (ce->e_used >= MB_CACHE_WRITER) { | ||
744 | ce->e_queued++; | ||
745 | prepare_to_wait(&mb_cache_queue, &wait, | ||
746 | TASK_UNINTERRUPTIBLE); | ||
747 | __spin_unlock_mb_cache_entry(ce); | ||
748 | schedule(); | ||
749 | __spin_lock_mb_cache_entry(ce); | ||
750 | ce->e_queued--; | ||
751 | } | ||
752 | finish_wait(&mb_cache_queue, &wait); | ||
753 | } | ||
754 | __spin_unlock_mb_cache_entry(ce); | ||
755 | if (!list_empty(&ce->e_lru_list)) { | ||
756 | spin_lock(&mb_cache_spinlock); | ||
757 | list_del_init(&ce->e_lru_list); | ||
758 | spin_unlock(&mb_cache_spinlock); | ||
759 | } | ||
760 | if (!__mb_cache_entry_is_block_hashed(ce)) { | ||
761 | __mb_cache_entry_release(ce); | ||
762 | return ERR_PTR(-EAGAIN); | ||
763 | } | ||
764 | return ce; | ||
765 | } | ||
766 | l = l->next; | ||
767 | } | ||
768 | hlist_bl_unlock(head); | ||
769 | return NULL; | 377 | return NULL; |
770 | } | 378 | } |
771 | 379 | EXPORT_SYMBOL(mb_cache_create); | |
772 | 380 | ||
773 | /* | 381 | /* |
774 | * mb_cache_entry_find_first() | 382 | * mb_cache_destroy - destroy cache |
775 | * | 383 | * @cache: the cache to destroy |
776 | * Find the first cache entry on a given device with a certain key in | ||
777 | * an additional index. Additional matches can be found with | ||
778 | * mb_cache_entry_find_next(). Returns NULL if no match was found. The | ||
779 | * returned cache entry is locked for shared access ("multiple readers"). | ||
780 | * | 384 | * |
781 | * @cache: the cache to search | 385 | * Free all entries in cache and cache itself. Caller must make sure nobody |
782 | * @bdev: the device the cache entry should belong to | 386 | * (except shrinker) can reach @cache when calling this. |
783 | * @key: the key in the index | ||
784 | */ | 387 | */ |
785 | struct mb_cache_entry * | 388 | void mb_cache_destroy(struct mb_cache *cache) |
786 | mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, | ||
787 | unsigned int key) | ||
788 | { | 389 | { |
789 | unsigned int bucket = hash_long(key, cache->c_bucket_bits); | 390 | struct mb_cache_entry *entry, *next; |
790 | struct hlist_bl_node *l; | ||
791 | struct mb_cache_entry *ce = NULL; | ||
792 | struct hlist_bl_head *index_hash_p; | ||
793 | |||
794 | index_hash_p = &cache->c_index_hash[bucket]; | ||
795 | hlist_bl_lock(index_hash_p); | ||
796 | if (!hlist_bl_empty(index_hash_p)) { | ||
797 | l = hlist_bl_first(index_hash_p); | ||
798 | ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); | ||
799 | } else | ||
800 | hlist_bl_unlock(index_hash_p); | ||
801 | return ce; | ||
802 | } | ||
803 | 391 | ||
392 | unregister_shrinker(&cache->c_shrink); | ||
804 | 393 | ||
805 | /* | 394 | /* |
806 | * mb_cache_entry_find_next() | 395 | * We don't bother with any locking. Cache must not be used at this |
807 | * | 396 | * point. |
808 | * Find the next cache entry on a given device with a certain key in an | 397 | */ |
809 | * additional index. Returns NULL if no match could be found. The previous | 398 | list_for_each_entry_safe(entry, next, &cache->c_list, e_list) { |
810 | * entry is automatically released, so that mb_cache_entry_find_next() can | 399 | if (!hlist_bl_unhashed(&entry->e_hash_list)) { |
811 | * be called like this: | 400 | hlist_bl_del_init(&entry->e_hash_list); |
812 | * | 401 | atomic_dec(&entry->e_refcnt); |
813 | * entry = mb_cache_entry_find_first(); | 402 | } else |
814 | * while (entry) { | 403 | WARN_ON(1); |
815 | * ... | 404 | list_del(&entry->e_list); |
816 | * entry = mb_cache_entry_find_next(entry, ...); | 405 | WARN_ON(atomic_read(&entry->e_refcnt) != 1); |
817 | * } | 406 | mb_cache_entry_put(cache, entry); |
818 | * | 407 | } |
819 | * @prev: The previous match | 408 | kfree(cache->c_hash); |
820 | * @bdev: the device the cache entry should belong to | 409 | kfree(cache); |
821 | * @key: the key in the index | 410 | module_put(THIS_MODULE); |
822 | */ | ||
823 | struct mb_cache_entry * | ||
824 | mb_cache_entry_find_next(struct mb_cache_entry *prev, | ||
825 | struct block_device *bdev, unsigned int key) | ||
826 | { | ||
827 | struct mb_cache *cache = prev->e_cache; | ||
828 | unsigned int bucket = hash_long(key, cache->c_bucket_bits); | ||
829 | struct hlist_bl_node *l; | ||
830 | struct mb_cache_entry *ce; | ||
831 | struct hlist_bl_head *index_hash_p; | ||
832 | |||
833 | index_hash_p = &cache->c_index_hash[bucket]; | ||
834 | mb_assert(prev->e_index_hash_p == index_hash_p); | ||
835 | hlist_bl_lock(index_hash_p); | ||
836 | mb_assert(!hlist_bl_empty(index_hash_p)); | ||
837 | l = prev->e_index.o_list.next; | ||
838 | ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); | ||
839 | __mb_cache_entry_release(prev); | ||
840 | return ce; | ||
841 | } | 411 | } |
412 | EXPORT_SYMBOL(mb_cache_destroy); | ||
842 | 413 | ||
843 | #endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ | 414 | static int __init mbcache_init(void) |
844 | |||
845 | static int __init init_mbcache(void) | ||
846 | { | 415 | { |
847 | register_shrinker(&mb_cache_shrinker); | 416 | mb_entry_cache = kmem_cache_create("mbcache", |
417 | sizeof(struct mb_cache_entry), 0, | ||
418 | SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); | ||
419 | BUG_ON(!mb_entry_cache); | ||
848 | return 0; | 420 | return 0; |
849 | } | 421 | } |
850 | 422 | ||
851 | static void __exit exit_mbcache(void) | 423 | static void __exit mbcache_exit(void) |
852 | { | 424 | { |
853 | unregister_shrinker(&mb_cache_shrinker); | 425 | kmem_cache_destroy(mb_entry_cache); |
854 | } | 426 | } |
855 | 427 | ||
856 | module_init(init_mbcache) | 428 | module_init(mbcache_init) |
857 | module_exit(exit_mbcache) | 429 | module_exit(mbcache_exit) |
858 | 430 | ||
431 | MODULE_AUTHOR("Jan Kara <jack@suse.cz>"); | ||
432 | MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); | ||
433 | MODULE_LICENSE("GPL"); | ||
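The new fs/mbcache.c above exposes a much smaller API than the code it replaces. As a rough orientation, here is a minimal, hypothetical sketch of how an ext2/ext4-style xattr path might drive it; the ea_* function names, the ea_block_matches() helper and the bucket_bits value of 10 are illustrative placeholders, not part of this series.

#include <linux/mbcache.h>
#include <linux/gfp.h>

static struct mb_cache *ea_block_cache;

/* Placeholder: re-read @block and verify it really matches @hash. */
static bool ea_block_matches(sector_t block, u32 hash);

static int ea_cache_init(void)
{
	/* 2^10 hash buckets; the cache then caps itself at 1024 << 4 entries. */
	ea_block_cache = mb_cache_create(10);
	return ea_block_cache ? 0 : -ENOMEM;
}

/* Remember that the xattr block at @block hashes to @hash. */
static void ea_cache_insert(u32 hash, sector_t block, bool reusable)
{
	/*
	 * -EBUSY only means the (hash, block) pair is already cached, and a
	 * failed allocation merely loses a cache entry, so errors are ignored.
	 */
	mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash, block, reusable);
}

/* Look for an existing reusable block with the same content hash. */
static sector_t ea_cache_lookup(u32 hash)
{
	struct mb_cache_entry *ce;
	sector_t block = 0;

	ce = mb_cache_entry_find_first(ea_block_cache, hash);
	while (ce) {
		/* A real caller re-reads the block and compares contents. */
		if (ea_block_matches(ce->e_block, hash)) {
			block = ce->e_block;
			mb_cache_entry_touch(ea_block_cache, ce);
			mb_cache_entry_put(ea_block_cache, ce);
			break;
		}
		/* Drops the reference to @ce and returns the next match. */
		ce = mb_cache_entry_find_next(ea_block_cache, ce);
	}
	return block;
}

/* Forget a block that is being freed or rewritten. */
static void ea_cache_forget(u32 hash, sector_t block)
{
	mb_cache_entry_delete_block(ea_block_cache, hash, block);
}

With bucket_bits = 10 the cache tops out at 1024 << 4 = 16384 entries; above that limit mb_cache_entry_create() schedules the shrink work item, which reclaims 16384 / 16 = 1024 entries per pass in the background, and once the count reaches twice the limit callers additionally reclaim SYNC_SHRINK_BATCH (64) entries synchronously.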
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 65407f6c9120..fd1083c46c61 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h | |||
@@ -200,7 +200,7 @@ typedef struct journal_block_tag_s | |||
200 | __be32 t_blocknr_high; /* most-significant high 32bits. */ | 200 | __be32 t_blocknr_high; /* most-significant high 32bits. */ |
201 | } journal_block_tag_t; | 201 | } journal_block_tag_t; |
202 | 202 | ||
203 | /* Tail of descriptor block, for checksumming */ | 203 | /* Tail of descriptor or revoke block, for checksumming */ |
204 | struct jbd2_journal_block_tail { | 204 | struct jbd2_journal_block_tail { |
205 | __be32 t_checksum; /* crc32c(uuid+descr_block) */ | 205 | __be32 t_checksum; /* crc32c(uuid+descr_block) */ |
206 | }; | 206 | }; |
@@ -215,11 +215,6 @@ typedef struct jbd2_journal_revoke_header_s | |||
215 | __be32 r_count; /* Count of bytes used in the block */ | 215 | __be32 r_count; /* Count of bytes used in the block */ |
216 | } jbd2_journal_revoke_header_t; | 216 | } jbd2_journal_revoke_header_t; |
217 | 217 | ||
218 | /* Tail of revoke block, for checksumming */ | ||
219 | struct jbd2_journal_revoke_tail { | ||
220 | __be32 r_checksum; /* crc32c(uuid+revoke_block) */ | ||
221 | }; | ||
222 | |||
223 | /* Definitions for the journal tag flags word: */ | 218 | /* Definitions for the journal tag flags word: */ |
224 | #define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ | 219 | #define JBD2_FLAG_ESCAPE 1 /* on-disk block is escaped */ |
225 | #define JBD2_FLAG_SAME_UUID 2 /* block has same uuid as previous */ | 220 | #define JBD2_FLAG_SAME_UUID 2 /* block has same uuid as previous */ |
@@ -1137,7 +1132,8 @@ static inline void jbd2_unfile_log_bh(struct buffer_head *bh) | |||
1137 | } | 1132 | } |
1138 | 1133 | ||
1139 | /* Log buffer allocation */ | 1134 | /* Log buffer allocation */ |
1140 | struct buffer_head *jbd2_journal_get_descriptor_buffer(journal_t *journal); | 1135 | struct buffer_head *jbd2_journal_get_descriptor_buffer(transaction_t *, int); |
1136 | void jbd2_descriptor_block_csum_set(journal_t *, struct buffer_head *); | ||
1141 | int jbd2_journal_next_log_block(journal_t *, unsigned long long *); | 1137 | int jbd2_journal_next_log_block(journal_t *, unsigned long long *); |
1142 | int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, | 1138 | int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, |
1143 | unsigned long *block); | 1139 | unsigned long *block); |
@@ -1327,10 +1323,8 @@ extern int jbd2_journal_init_revoke_caches(void); | |||
1327 | extern void jbd2_journal_destroy_revoke(journal_t *); | 1323 | extern void jbd2_journal_destroy_revoke(journal_t *); |
1328 | extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); | 1324 | extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); |
1329 | extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); | 1325 | extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); |
1330 | extern void jbd2_journal_write_revoke_records(journal_t *journal, | 1326 | extern void jbd2_journal_write_revoke_records(transaction_t *transaction, |
1331 | transaction_t *transaction, | 1327 | struct list_head *log_bufs); |
1332 | struct list_head *log_bufs, | ||
1333 | int write_op); | ||
1334 | 1328 | ||
1335 | /* Recovery revoke support */ | 1329 | /* Recovery revoke support */ |
1336 | extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); | 1330 | extern int jbd2_journal_set_revoke(journal_t *, unsigned long long, tid_t); |
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h index 6a392e7a723a..86c9a8b480c5 100644 --- a/include/linux/mbcache.h +++ b/include/linux/mbcache.h | |||
@@ -1,55 +1,52 @@ | |||
1 | /* | 1 | #ifndef _LINUX_MBCACHE_H |
2 | File: linux/mbcache.h | 2 | #define _LINUX_MBCACHE_H |
3 | 3 | ||
4 | (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> | 4 | #include <linux/hash.h> |
5 | */ | 5 | #include <linux/list_bl.h> |
6 | struct mb_cache_entry { | 6 | #include <linux/list.h> |
7 | struct list_head e_lru_list; | 7 | #include <linux/atomic.h> |
8 | struct mb_cache *e_cache; | 8 | #include <linux/fs.h> |
9 | unsigned short e_used; | ||
10 | unsigned short e_queued; | ||
11 | atomic_t e_refcnt; | ||
12 | struct block_device *e_bdev; | ||
13 | sector_t e_block; | ||
14 | struct hlist_bl_node e_block_list; | ||
15 | struct { | ||
16 | struct hlist_bl_node o_list; | ||
17 | unsigned int o_key; | ||
18 | } e_index; | ||
19 | struct hlist_bl_head *e_block_hash_p; | ||
20 | struct hlist_bl_head *e_index_hash_p; | ||
21 | }; | ||
22 | 9 | ||
23 | struct mb_cache { | 10 | struct mb_cache; |
24 | struct list_head c_cache_list; | ||
25 | const char *c_name; | ||
26 | atomic_t c_entry_count; | ||
27 | int c_max_entries; | ||
28 | int c_bucket_bits; | ||
29 | struct kmem_cache *c_entry_cache; | ||
30 | struct hlist_bl_head *c_block_hash; | ||
31 | struct hlist_bl_head *c_index_hash; | ||
32 | }; | ||
33 | 11 | ||
34 | /* Functions on caches */ | 12 | struct mb_cache_entry { |
13 | /* List of entries in cache - protected by cache->c_list_lock */ | ||
14 | struct list_head e_list; | ||
15 | /* Hash table list - protected by hash chain bitlock */ | ||
16 | struct hlist_bl_node e_hash_list; | ||
17 | atomic_t e_refcnt; | ||
18 | /* Key in hash - stable during lifetime of the entry */ | ||
19 | u32 e_key; | ||
20 | u32 e_referenced:1; | ||
21 | u32 e_reusable:1; | ||
22 | /* Block number of hashed block - stable during lifetime of the entry */ | ||
23 | sector_t e_block; | ||
24 | }; | ||
35 | 25 | ||
36 | struct mb_cache *mb_cache_create(const char *, int); | 26 | struct mb_cache *mb_cache_create(int bucket_bits); |
37 | void mb_cache_shrink(struct block_device *); | 27 | void mb_cache_destroy(struct mb_cache *cache); |
38 | void mb_cache_destroy(struct mb_cache *); | ||
39 | 28 | ||
40 | /* Functions on cache entries */ | 29 | int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, |
30 | sector_t block, bool reusable); | ||
31 | void __mb_cache_entry_free(struct mb_cache_entry *entry); | ||
32 | static inline int mb_cache_entry_put(struct mb_cache *cache, | ||
33 | struct mb_cache_entry *entry) | ||
34 | { | ||
35 | if (!atomic_dec_and_test(&entry->e_refcnt)) | ||
36 | return 0; | ||
37 | __mb_cache_entry_free(entry); | ||
38 | return 1; | ||
39 | } | ||
41 | 40 | ||
42 | struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *, gfp_t); | 41 | void mb_cache_entry_delete_block(struct mb_cache *cache, u32 key, |
43 | int mb_cache_entry_insert(struct mb_cache_entry *, struct block_device *, | 42 | sector_t block); |
44 | sector_t, unsigned int); | 43 | struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, |
45 | void mb_cache_entry_release(struct mb_cache_entry *); | 44 | sector_t block); |
46 | void mb_cache_entry_free(struct mb_cache_entry *); | ||
47 | struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, | ||
48 | struct block_device *, | ||
49 | sector_t); | ||
50 | struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, | 45 | struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, |
51 | struct block_device *, | 46 | u32 key); |
52 | unsigned int); | 47 | struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, |
53 | struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, | 48 | struct mb_cache_entry *entry); |
54 | struct block_device *, | 49 | void mb_cache_entry_touch(struct mb_cache *cache, |
55 | unsigned int); | 50 | struct mb_cache_entry *entry); |
51 | |||
52 | #endif /* _LINUX_MBCACHE_H */ | ||
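To make the lifetime rules in the new header concrete: every entry returned by mb_cache_entry_get() or the find_first/find_next pair carries a reference, and the caller must drop it with mb_cache_entry_put(); the final put is what ends up in __mb_cache_entry_free(). A small hypothetical helper showing the pattern (the function name and its purpose are illustrative only):

#include <linux/mbcache.h>

/* Report whether the entry cached for (@key, @block) was marked reusable. */
static bool xattr_block_is_reusable(struct mb_cache *cache, u32 key,
				    sector_t block)
{
	struct mb_cache_entry *ce;
	bool reusable = false;

	ce = mb_cache_entry_get(cache, key, block);
	if (ce) {
		/* e_key and e_block stay stable while we hold the reference. */
		reusable = ce->e_reusable;
		mb_cache_entry_put(cache, ce);	/* may free the entry */
	}
	return reusable;
}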