diff options
author | Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | 2008-01-28 23:58:26 -0500 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2008-01-28 23:58:26 -0500 |
commit | 0e855ac8b103ef579052936b59fe7c599ac422a4 (patch) | |
tree | ec29f82e1d7bb1987dcadc00497daf69d6955483 | |
parent | c278bfecebfb1ed67c326ef472660878baa745cd (diff) |
ext4: Convert truncate_mutex to read write semaphore.
We currently take the truncate_mutex on every read. This has a performance
impact on large CPU configurations. Convert the lock to a read-write
semaphore (i_data_sem) and take only the read lock when we are just reading the file.
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
-rw-r--r-- | fs/ext4/balloc.c | 2 | ||||
-rw-r--r-- | fs/ext4/extents.c | 13 | ||||
-rw-r--r-- | fs/ext4/file.c | 4 | ||||
-rw-r--r-- | fs/ext4/inode.c | 40 | ||||
-rw-r--r-- | fs/ext4/ioctl.c | 4 | ||||
-rw-r--r-- | fs/ext4/super.c | 2 | ||||
-rw-r--r-- | include/linux/ext4_fs.h | 25 | ||||
-rw-r--r-- | include/linux/ext4_fs_i.h | 6 |
8 files changed, 53 insertions, 43 deletions
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index d460223b8e1d..7ae223ed152f 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c | |||
@@ -526,7 +526,7 @@ static inline int rsv_is_empty(struct ext4_reserve_window *rsv) | |||
526 | * when setting the reservation window size through ioctl before the file | 526 | * when setting the reservation window size through ioctl before the file |
527 | * is open for write (needs block allocation). | 527 | * is open for write (needs block allocation). |
528 | * | 528 | * |
529 | * Needs truncate_mutex protection prior to call this function. | 529 | * Needs down_write(i_data_sem) protection prior to call this function. |
530 | */ | 530 | */ |
531 | void ext4_init_block_alloc_info(struct inode *inode) | 531 | void ext4_init_block_alloc_info(struct inode *inode) |
532 | { | 532 | { |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ec5019fa552f..03d1bbb78a2f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -1565,7 +1565,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, | |||
1565 | * This routine returns max. credits that the extent tree can consume. | 1565 | * This routine returns max. credits that the extent tree can consume. |
1566 | * It should be OK for low-performance paths like ->writepage() | 1566 | * It should be OK for low-performance paths like ->writepage() |
1567 | * To allow many writing processes to fit into a single transaction, | 1567 | * To allow many writing processes to fit into a single transaction, |
1568 | * the caller should calculate credits under truncate_mutex and | 1568 | * the caller should calculate credits under i_data_sem and |
1569 | * pass the actual path. | 1569 | * pass the actual path. |
1570 | */ | 1570 | */ |
1571 | int ext4_ext_calc_credits_for_insert(struct inode *inode, | 1571 | int ext4_ext_calc_credits_for_insert(struct inode *inode, |
@@ -2131,7 +2131,8 @@ out: | |||
2131 | 2131 | ||
2132 | /* | 2132 | /* |
2133 | * Need to be called with | 2133 | * Need to be called with |
2134 | * mutex_lock(&EXT4_I(inode)->truncate_mutex); | 2134 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block |
2135 | * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) | ||
2135 | */ | 2136 | */ |
2136 | int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | 2137 | int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, |
2137 | ext4_lblk_t iblock, | 2138 | ext4_lblk_t iblock, |
@@ -2350,7 +2351,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) | |||
2350 | if (page) | 2351 | if (page) |
2351 | ext4_block_truncate_page(handle, page, mapping, inode->i_size); | 2352 | ext4_block_truncate_page(handle, page, mapping, inode->i_size); |
2352 | 2353 | ||
2353 | mutex_lock(&EXT4_I(inode)->truncate_mutex); | 2354 | down_write(&EXT4_I(inode)->i_data_sem); |
2354 | ext4_ext_invalidate_cache(inode); | 2355 | ext4_ext_invalidate_cache(inode); |
2355 | 2356 | ||
2356 | /* | 2357 | /* |
@@ -2386,7 +2387,7 @@ out_stop: | |||
2386 | if (inode->i_nlink) | 2387 | if (inode->i_nlink) |
2387 | ext4_orphan_del(handle, inode); | 2388 | ext4_orphan_del(handle, inode); |
2388 | 2389 | ||
2389 | mutex_unlock(&EXT4_I(inode)->truncate_mutex); | 2390 | up_write(&EXT4_I(inode)->i_data_sem); |
2390 | ext4_journal_stop(handle); | 2391 | ext4_journal_stop(handle); |
2391 | } | 2392 | } |
2392 | 2393 | ||
@@ -2450,7 +2451,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) | |||
2450 | * modify 1 super block, 1 block bitmap and 1 group descriptor. | 2451 | * modify 1 super block, 1 block bitmap and 1 group descriptor. |
2451 | */ | 2452 | */ |
2452 | credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; | 2453 | credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; |
2453 | mutex_lock(&EXT4_I(inode)->truncate_mutex) | 2454 | down_write((&EXT4_I(inode)->i_data_sem)); |
2454 | retry: | 2455 | retry: |
2455 | while (ret >= 0 && ret < max_blocks) { | 2456 | while (ret >= 0 && ret < max_blocks) { |
2456 | block = block + ret; | 2457 | block = block + ret; |
@@ -2507,7 +2508,7 @@ retry: | |||
2507 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 2508 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
2508 | goto retry; | 2509 | goto retry; |
2509 | 2510 | ||
2510 | mutex_unlock(&EXT4_I(inode)->truncate_mutex) | 2511 | up_write((&EXT4_I(inode)->i_data_sem)); |
2511 | /* | 2512 | /* |
2512 | * Time to update the file size. | 2513 | * Time to update the file size. |
2513 | * Update only when preallocation was requested beyond the file size. | 2514 | * Update only when preallocation was requested beyond the file size. |
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index a6b2aa14626e..ac35ec58db55 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
@@ -37,9 +37,9 @@ static int ext4_release_file (struct inode * inode, struct file * filp) | |||
37 | if ((filp->f_mode & FMODE_WRITE) && | 37 | if ((filp->f_mode & FMODE_WRITE) && |
38 | (atomic_read(&inode->i_writecount) == 1)) | 38 | (atomic_read(&inode->i_writecount) == 1)) |
39 | { | 39 | { |
40 | mutex_lock(&EXT4_I(inode)->truncate_mutex); | 40 | down_write(&EXT4_I(inode)->i_data_sem); |
41 | ext4_discard_reservation(inode); | 41 | ext4_discard_reservation(inode); |
42 | mutex_unlock(&EXT4_I(inode)->truncate_mutex); | 42 | up_write(&EXT4_I(inode)->i_data_sem); |
43 | } | 43 | } |
44 | if (is_dx(inode) && filp->private_data) | 44 | if (is_dx(inode) && filp->private_data) |
45 | ext4_htree_free_dir_info(filp->private_data); | 45 | ext4_htree_free_dir_info(filp->private_data); |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 71c7ad0c6723..a7eb8bb4bdd4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -308,7 +308,7 @@ static int ext4_block_to_path(struct inode *inode, | |||
308 | final = ptrs; | 308 | final = ptrs; |
309 | } else { | 309 | } else { |
310 | ext4_warning(inode->i_sb, "ext4_block_to_path", | 310 | ext4_warning(inode->i_sb, "ext4_block_to_path", |
311 | "block %u > max", | 311 | "block %lu > max", |
312 | i_block + direct_blocks + | 312 | i_block + direct_blocks + |
313 | indirect_blocks + double_blocks); | 313 | indirect_blocks + double_blocks); |
314 | } | 314 | } |
@@ -345,7 +345,7 @@ static int ext4_block_to_path(struct inode *inode, | |||
345 | * the whole chain, all way to the data (returns %NULL, *err == 0). | 345 | * the whole chain, all way to the data (returns %NULL, *err == 0). |
346 | * | 346 | * |
347 | * Need to be called with | 347 | * Need to be called with |
348 | * mutex_lock(&EXT4_I(inode)->truncate_mutex) | 348 | * down_read(&EXT4_I(inode)->i_data_sem) |
349 | */ | 349 | */ |
350 | static Indirect *ext4_get_branch(struct inode *inode, int depth, | 350 | static Indirect *ext4_get_branch(struct inode *inode, int depth, |
351 | ext4_lblk_t *offsets, | 351 | ext4_lblk_t *offsets, |
@@ -777,7 +777,8 @@ err_out: | |||
777 | * | 777 | * |
778 | * | 778 | * |
779 | * Need to be called with | 779 | * Need to be called with |
780 | * mutex_lock(&EXT4_I(inode)->truncate_mutex) | 780 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block |
781 | * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) | ||
781 | */ | 782 | */ |
782 | int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | 783 | int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, |
783 | ext4_lblk_t iblock, unsigned long maxblocks, | 784 | ext4_lblk_t iblock, unsigned long maxblocks, |
@@ -865,7 +866,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
865 | err = ext4_splice_branch(handle, inode, iblock, | 866 | err = ext4_splice_branch(handle, inode, iblock, |
866 | partial, indirect_blks, count); | 867 | partial, indirect_blks, count); |
867 | /* | 868 | /* |
868 | * i_disksize growing is protected by truncate_mutex. Don't forget to | 869 | * i_disksize growing is protected by i_data_sem. Don't forget to |
869 | * protect it if you're about to implement concurrent | 870 | * protect it if you're about to implement concurrent |
870 | * ext4_get_block() -bzzz | 871 | * ext4_get_block() -bzzz |
871 | */ | 872 | */ |
@@ -895,6 +896,31 @@ out: | |||
895 | 896 | ||
896 | #define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32) | 897 | #define DIO_CREDITS (EXT4_RESERVE_TRANS_BLOCKS + 32) |
897 | 898 | ||
899 | int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | ||
900 | unsigned long max_blocks, struct buffer_head *bh, | ||
901 | int create, int extend_disksize) | ||
902 | { | ||
903 | int retval; | ||
904 | if (create) { | ||
905 | down_write((&EXT4_I(inode)->i_data_sem)); | ||
906 | } else { | ||
907 | down_read((&EXT4_I(inode)->i_data_sem)); | ||
908 | } | ||
909 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | ||
910 | retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, | ||
911 | bh, create, extend_disksize); | ||
912 | } else { | ||
913 | retval = ext4_get_blocks_handle(handle, inode, block, | ||
914 | max_blocks, bh, create, extend_disksize); | ||
915 | } | ||
916 | if (create) { | ||
917 | up_write((&EXT4_I(inode)->i_data_sem)); | ||
918 | } else { | ||
919 | up_read((&EXT4_I(inode)->i_data_sem)); | ||
920 | } | ||
921 | return retval; | ||
922 | } | ||
923 | |||
898 | static int ext4_get_block(struct inode *inode, sector_t iblock, | 924 | static int ext4_get_block(struct inode *inode, sector_t iblock, |
899 | struct buffer_head *bh_result, int create) | 925 | struct buffer_head *bh_result, int create) |
900 | { | 926 | { |
@@ -1399,7 +1425,7 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) | |||
1399 | * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... | 1425 | * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... |
1400 | * | 1426 | * |
1401 | * Same applies to ext4_get_block(). We will deadlock on various things like | 1427 | * Same applies to ext4_get_block(). We will deadlock on various things like |
1402 | * lock_journal and i_truncate_mutex. | 1428 | * lock_journal and i_data_sem |
1403 | * | 1429 | * |
1404 | * Setting PF_MEMALLOC here doesn't work - too many internal memory | 1430 | * Setting PF_MEMALLOC here doesn't work - too many internal memory |
1405 | * allocations fail. | 1431 | * allocations fail. |
@@ -2325,7 +2351,7 @@ void ext4_truncate(struct inode *inode) | |||
2325 | * From here we block out all ext4_get_block() callers who want to | 2351 | * From here we block out all ext4_get_block() callers who want to |
2326 | * modify the block allocation tree. | 2352 | * modify the block allocation tree. |
2327 | */ | 2353 | */ |
2328 | mutex_lock(&ei->truncate_mutex); | 2354 | down_write(&ei->i_data_sem); |
2329 | 2355 | ||
2330 | if (n == 1) { /* direct blocks */ | 2356 | if (n == 1) { /* direct blocks */ |
2331 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | 2357 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], |
@@ -2389,7 +2415,7 @@ do_indirects: | |||
2389 | 2415 | ||
2390 | ext4_discard_reservation(inode); | 2416 | ext4_discard_reservation(inode); |
2391 | 2417 | ||
2392 | mutex_unlock(&ei->truncate_mutex); | 2418 | up_write(&ei->i_data_sem); |
2393 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | 2419 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); |
2394 | ext4_mark_inode_dirty(handle, inode); | 2420 | ext4_mark_inode_dirty(handle, inode); |
2395 | 2421 | ||
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index e7f894bdb420..c0e5b8cf635c 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c | |||
@@ -199,7 +199,7 @@ flags_err: | |||
199 | * need to allocate reservation structure for this inode | 199 | * need to allocate reservation structure for this inode |
200 | * before set the window size | 200 | * before set the window size |
201 | */ | 201 | */ |
202 | mutex_lock(&ei->truncate_mutex); | 202 | down_write(&ei->i_data_sem); |
203 | if (!ei->i_block_alloc_info) | 203 | if (!ei->i_block_alloc_info) |
204 | ext4_init_block_alloc_info(inode); | 204 | ext4_init_block_alloc_info(inode); |
205 | 205 | ||
@@ -207,7 +207,7 @@ flags_err: | |||
207 | struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; | 207 | struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; |
208 | rsv->rsv_goal_size = rsv_window_size; | 208 | rsv->rsv_goal_size = rsv_window_size; |
209 | } | 209 | } |
210 | mutex_unlock(&ei->truncate_mutex); | 210 | up_write(&ei->i_data_sem); |
211 | return 0; | 211 | return 0; |
212 | } | 212 | } |
213 | case EXT4_IOC_GROUP_EXTEND: { | 213 | case EXT4_IOC_GROUP_EXTEND: { |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index effd375ece80..c7305443e100 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -593,7 +593,7 @@ static void init_once(struct kmem_cache *cachep, void *foo) | |||
593 | #ifdef CONFIG_EXT4DEV_FS_XATTR | 593 | #ifdef CONFIG_EXT4DEV_FS_XATTR |
594 | init_rwsem(&ei->xattr_sem); | 594 | init_rwsem(&ei->xattr_sem); |
595 | #endif | 595 | #endif |
596 | mutex_init(&ei->truncate_mutex); | 596 | init_rwsem(&ei->i_data_sem); |
597 | inode_init_once(&ei->vfs_inode); | 597 | inode_init_once(&ei->vfs_inode); |
598 | } | 598 | } |
599 | 599 | ||
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h index 583049c1d366..300cc5a5adb9 100644 --- a/include/linux/ext4_fs.h +++ b/include/linux/ext4_fs.h | |||
@@ -1107,27 +1107,10 @@ extern void ext4_ext_init(struct super_block *); | |||
1107 | extern void ext4_ext_release(struct super_block *); | 1107 | extern void ext4_ext_release(struct super_block *); |
1108 | extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, | 1108 | extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, |
1109 | loff_t len); | 1109 | loff_t len); |
1110 | static inline int | 1110 | extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, |
1111 | ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | 1111 | sector_t block, unsigned long max_blocks, |
1112 | unsigned long max_blocks, struct buffer_head *bh, | 1112 | struct buffer_head *bh, int create, |
1113 | int create, int extend_disksize) | 1113 | int extend_disksize); |
1114 | { | ||
1115 | int retval; | ||
1116 | mutex_lock(&EXT4_I(inode)->truncate_mutex); | ||
1117 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | ||
1118 | retval = ext4_ext_get_blocks(handle, inode, | ||
1119 | (ext4_lblk_t)block, max_blocks, | ||
1120 | bh, create, extend_disksize); | ||
1121 | } else { | ||
1122 | retval = ext4_get_blocks_handle(handle, inode, | ||
1123 | (ext4_lblk_t)block, max_blocks, | ||
1124 | bh, create, extend_disksize); | ||
1125 | } | ||
1126 | mutex_unlock(&EXT4_I(inode)->truncate_mutex); | ||
1127 | return retval; | ||
1128 | } | ||
1129 | |||
1130 | |||
1131 | #endif /* __KERNEL__ */ | 1114 | #endif /* __KERNEL__ */ |
1132 | 1115 | ||
1133 | #endif /* _LINUX_EXT4_FS_H */ | 1116 | #endif /* _LINUX_EXT4_FS_H */ |
diff --git a/include/linux/ext4_fs_i.h b/include/linux/ext4_fs_i.h index f1cd4934e46f..4377d249d378 100644 --- a/include/linux/ext4_fs_i.h +++ b/include/linux/ext4_fs_i.h | |||
@@ -139,16 +139,16 @@ struct ext4_inode_info { | |||
139 | __u16 i_extra_isize; | 139 | __u16 i_extra_isize; |
140 | 140 | ||
141 | /* | 141 | /* |
142 | * truncate_mutex is for serialising ext4_truncate() against | 142 | * i_data_sem is for serialising ext4_truncate() against |
143 | * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's | 143 | * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's |
144 | * data tree are chopped off during truncate. We can't do that in | 144 | * data tree are chopped off during truncate. We can't do that in |
145 | * ext4 because whenever we perform intermediate commits during | 145 | * ext4 because whenever we perform intermediate commits during |
146 | * truncate, the inode and all the metadata blocks *must* be in a | 146 | * truncate, the inode and all the metadata blocks *must* be in a |
147 | * consistent state which allows truncation of the orphans to restart | 147 | * consistent state which allows truncation of the orphans to restart |
148 | * during recovery. Hence we must fix the get_block-vs-truncate race | 148 | * during recovery. Hence we must fix the get_block-vs-truncate race |
149 | * by other means, so we have truncate_mutex. | 149 | * by other means, so we have i_data_sem. |
150 | */ | 150 | */ |
151 | struct mutex truncate_mutex; | 151 | struct rw_semaphore i_data_sem; |
152 | struct inode vfs_inode; | 152 | struct inode vfs_inode; |
153 | 153 | ||
154 | unsigned long i_ext_generation; | 154 | unsigned long i_ext_generation; |