diff options
author | Shuning Zhang <sunny.s.zhang@oracle.com> | 2019-11-06 00:16:34 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-06 11:47:08 -0500 |
commit | e74540b285569d2b1e14fe7aee92297078f235ce (patch) | |
tree | d0c7128a3c2d5baf0e9904938b77a97eabb22f74 /fs/ocfs2/file.c | |
parent | 169226f7e0d275c1879551f37484ef6683579a5c (diff) |
ocfs2: protect extent tree in ocfs2_prepare_inode_for_write()
When the extent tree is modified, it should be protected by the inode
cluster lock and ip_alloc_sem.
The extent tree is accessed and modified in
ocfs2_prepare_inode_for_write(), but it isn't protected by ip_alloc_sem there.
The following is one such case: ocfs2_fiemap() is accessing the
extent tree while it is being modified at the same time.
kernel BUG at fs/ocfs2/extent_map.c:475!
invalid opcode: 0000 [#1] SMP
Modules linked in: tun ocfs2 ocfs2_nodemanager configfs ocfs2_stackglue [...]
CPU: 16 PID: 14047 Comm: o2info Not tainted 4.1.12-124.23.1.el6uek.x86_64 #2
Hardware name: Oracle Corporation ORACLE SERVER X7-2L/ASM, MB MECH, X7-2L, BIOS 42040600 10/19/2018
task: ffff88019487e200 ti: ffff88003daa4000 task.ti: ffff88003daa4000
RIP: ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
Call Trace:
ocfs2_fiemap+0x1e3/0x430 [ocfs2]
do_vfs_ioctl+0x155/0x510
SyS_ioctl+0x81/0xa0
system_call_fastpath+0x18/0xd8
Code: 18 48 c7 c6 60 7f 65 a0 31 c0 bb e2 ff ff ff 48 8b 4a 40 48 8b 7a 28 48 c7 c2 78 2d 66 a0 e8 38 4f 05 00 e9 28 fe ff ff 0f 1f 00 <0f> 0b 66 0f 1f 44 00 00 bb 86 ff ff ff e9 13 fe ff ff 66 0f 1f
RIP ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
---[ end trace c8aa0c8180e869dc ]---
Kernel panic - not syncing: Fatal exception
Kernel Offset: disabled
This issue can be reproduced every week in a production environment.
It is tied to the usage pattern: if others use ocfs2 in the same
way, the kernel will panic frequently.
[akpm@linux-foundation.org: coding style fixes]
[Fix new warning due to unused function by removing said function - Linus ]
Link: http://lkml.kernel.org/r/1568772175-2906-2-git-send-email-sunny.s.zhang@oracle.com
Signed-off-by: Shuning Zhang <sunny.s.zhang@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Gang He <ghe@suse.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <jiangqi903@gmail.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/ocfs2/file.c')
-rw-r--r-- | fs/ocfs2/file.c | 134 |
1 files changed, 90 insertions, 44 deletions
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 53939bf9d7d2..9876db52913a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -2098,53 +2098,89 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) | |||
2098 | return 0; | 2098 | return 0; |
2099 | } | 2099 | } |
2100 | 2100 | ||
2101 | static int ocfs2_prepare_inode_for_refcount(struct inode *inode, | 2101 | static int ocfs2_inode_lock_for_extent_tree(struct inode *inode, |
2102 | struct file *file, | 2102 | struct buffer_head **di_bh, |
2103 | loff_t pos, size_t count, | 2103 | int meta_level, |
2104 | int *meta_level) | 2104 | int overwrite_io, |
2105 | int write_sem, | ||
2106 | int wait) | ||
2105 | { | 2107 | { |
2106 | int ret; | 2108 | int ret = 0; |
2107 | struct buffer_head *di_bh = NULL; | ||
2108 | u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; | ||
2109 | u32 clusters = | ||
2110 | ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; | ||
2111 | 2109 | ||
2112 | ret = ocfs2_inode_lock(inode, &di_bh, 1); | 2110 | if (wait) |
2113 | if (ret) { | 2111 | ret = ocfs2_inode_lock(inode, NULL, meta_level); |
2114 | mlog_errno(ret); | 2112 | else |
2113 | ret = ocfs2_try_inode_lock(inode, | ||
2114 | overwrite_io ? NULL : di_bh, meta_level); | ||
2115 | if (ret < 0) | ||
2115 | goto out; | 2116 | goto out; |
2117 | |||
2118 | if (wait) { | ||
2119 | if (write_sem) | ||
2120 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
2121 | else | ||
2122 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
2123 | } else { | ||
2124 | if (write_sem) | ||
2125 | ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem); | ||
2126 | else | ||
2127 | ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem); | ||
2128 | |||
2129 | if (!ret) { | ||
2130 | ret = -EAGAIN; | ||
2131 | goto out_unlock; | ||
2132 | } | ||
2116 | } | 2133 | } |
2117 | 2134 | ||
2118 | *meta_level = 1; | 2135 | return ret; |
2119 | 2136 | ||
2120 | ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); | 2137 | out_unlock: |
2121 | if (ret) | 2138 | brelse(*di_bh); |
2122 | mlog_errno(ret); | 2139 | ocfs2_inode_unlock(inode, meta_level); |
2123 | out: | 2140 | out: |
2124 | brelse(di_bh); | ||
2125 | return ret; | 2141 | return ret; |
2126 | } | 2142 | } |
2127 | 2143 | ||
2144 | static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode, | ||
2145 | struct buffer_head **di_bh, | ||
2146 | int meta_level, | ||
2147 | int write_sem) | ||
2148 | { | ||
2149 | if (write_sem) | ||
2150 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
2151 | else | ||
2152 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
2153 | |||
2154 | brelse(*di_bh); | ||
2155 | *di_bh = NULL; | ||
2156 | |||
2157 | if (meta_level >= 0) | ||
2158 | ocfs2_inode_unlock(inode, meta_level); | ||
2159 | } | ||
2160 | |||
2128 | static int ocfs2_prepare_inode_for_write(struct file *file, | 2161 | static int ocfs2_prepare_inode_for_write(struct file *file, |
2129 | loff_t pos, size_t count, int wait) | 2162 | loff_t pos, size_t count, int wait) |
2130 | { | 2163 | { |
2131 | int ret = 0, meta_level = 0, overwrite_io = 0; | 2164 | int ret = 0, meta_level = 0, overwrite_io = 0; |
2165 | int write_sem = 0; | ||
2132 | struct dentry *dentry = file->f_path.dentry; | 2166 | struct dentry *dentry = file->f_path.dentry; |
2133 | struct inode *inode = d_inode(dentry); | 2167 | struct inode *inode = d_inode(dentry); |
2134 | struct buffer_head *di_bh = NULL; | 2168 | struct buffer_head *di_bh = NULL; |
2169 | u32 cpos; | ||
2170 | u32 clusters; | ||
2135 | 2171 | ||
2136 | /* | 2172 | /* |
2137 | * We start with a read level meta lock and only jump to an ex | 2173 | * We start with a read level meta lock and only jump to an ex |
2138 | * if we need to make modifications here. | 2174 | * if we need to make modifications here. |
2139 | */ | 2175 | */ |
2140 | for(;;) { | 2176 | for(;;) { |
2141 | if (wait) | 2177 | ret = ocfs2_inode_lock_for_extent_tree(inode, |
2142 | ret = ocfs2_inode_lock(inode, NULL, meta_level); | 2178 | &di_bh, |
2143 | else | 2179 | meta_level, |
2144 | ret = ocfs2_try_inode_lock(inode, | 2180 | overwrite_io, |
2145 | overwrite_io ? NULL : &di_bh, meta_level); | 2181 | write_sem, |
2182 | wait); | ||
2146 | if (ret < 0) { | 2183 | if (ret < 0) { |
2147 | meta_level = -1; | ||
2148 | if (ret != -EAGAIN) | 2184 | if (ret != -EAGAIN) |
2149 | mlog_errno(ret); | 2185 | mlog_errno(ret); |
2150 | goto out; | 2186 | goto out; |
@@ -2156,15 +2192,8 @@ static int ocfs2_prepare_inode_for_write(struct file *file, | |||
2156 | */ | 2192 | */ |
2157 | if (!wait && !overwrite_io) { | 2193 | if (!wait && !overwrite_io) { |
2158 | overwrite_io = 1; | 2194 | overwrite_io = 1; |
2159 | if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) { | ||
2160 | ret = -EAGAIN; | ||
2161 | goto out_unlock; | ||
2162 | } | ||
2163 | 2195 | ||
2164 | ret = ocfs2_overwrite_io(inode, di_bh, pos, count); | 2196 | ret = ocfs2_overwrite_io(inode, di_bh, pos, count); |
2165 | brelse(di_bh); | ||
2166 | di_bh = NULL; | ||
2167 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | ||
2168 | if (ret < 0) { | 2197 | if (ret < 0) { |
2169 | if (ret != -EAGAIN) | 2198 | if (ret != -EAGAIN) |
2170 | mlog_errno(ret); | 2199 | mlog_errno(ret); |
@@ -2183,7 +2212,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file, | |||
2183 | * set inode->i_size at the end of a write. */ | 2212 | * set inode->i_size at the end of a write. */ |
2184 | if (should_remove_suid(dentry)) { | 2213 | if (should_remove_suid(dentry)) { |
2185 | if (meta_level == 0) { | 2214 | if (meta_level == 0) { |
2186 | ocfs2_inode_unlock(inode, meta_level); | 2215 | ocfs2_inode_unlock_for_extent_tree(inode, |
2216 | &di_bh, | ||
2217 | meta_level, | ||
2218 | write_sem); | ||
2187 | meta_level = 1; | 2219 | meta_level = 1; |
2188 | continue; | 2220 | continue; |
2189 | } | 2221 | } |
@@ -2197,18 +2229,32 @@ static int ocfs2_prepare_inode_for_write(struct file *file, | |||
2197 | 2229 | ||
2198 | ret = ocfs2_check_range_for_refcount(inode, pos, count); | 2230 | ret = ocfs2_check_range_for_refcount(inode, pos, count); |
2199 | if (ret == 1) { | 2231 | if (ret == 1) { |
2200 | ocfs2_inode_unlock(inode, meta_level); | 2232 | ocfs2_inode_unlock_for_extent_tree(inode, |
2201 | meta_level = -1; | 2233 | &di_bh, |
2202 | 2234 | meta_level, | |
2203 | ret = ocfs2_prepare_inode_for_refcount(inode, | 2235 | write_sem); |
2204 | file, | 2236 | ret = ocfs2_inode_lock_for_extent_tree(inode, |
2205 | pos, | 2237 | &di_bh, |
2206 | count, | 2238 | meta_level, |
2207 | &meta_level); | 2239 | overwrite_io, |
2240 | 1, | ||
2241 | wait); | ||
2242 | write_sem = 1; | ||
2243 | if (ret < 0) { | ||
2244 | if (ret != -EAGAIN) | ||
2245 | mlog_errno(ret); | ||
2246 | goto out; | ||
2247 | } | ||
2248 | |||
2249 | cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; | ||
2250 | clusters = | ||
2251 | ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; | ||
2252 | ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); | ||
2208 | } | 2253 | } |
2209 | 2254 | ||
2210 | if (ret < 0) { | 2255 | if (ret < 0) { |
2211 | mlog_errno(ret); | 2256 | if (ret != -EAGAIN) |
2257 | mlog_errno(ret); | ||
2212 | goto out_unlock; | 2258 | goto out_unlock; |
2213 | } | 2259 | } |
2214 | 2260 | ||
@@ -2219,10 +2265,10 @@ out_unlock: | |||
2219 | trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, | 2265 | trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, |
2220 | pos, count, wait); | 2266 | pos, count, wait); |
2221 | 2267 | ||
2222 | brelse(di_bh); | 2268 | ocfs2_inode_unlock_for_extent_tree(inode, |
2223 | 2269 | &di_bh, | |
2224 | if (meta_level >= 0) | 2270 | meta_level, |
2225 | ocfs2_inode_unlock(inode, meta_level); | 2271 | write_sem); |
2226 | 2272 | ||
2227 | out: | 2273 | out: |
2228 | return ret; | 2274 | return ret; |