about | summary | refs | log | tree | commit | diff | stats
path: root/fs/ocfs2/file.c
diff options
context:
space:
mode:
author: Shuning Zhang <sunny.s.zhang@oracle.com> 2019-11-06 00:16:34 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2019-11-06 11:47:08 -0500
commit: e74540b285569d2b1e14fe7aee92297078f235ce (patch)
tree: d0c7128a3c2d5baf0e9904938b77a97eabb22f74 /fs/ocfs2/file.c
parent: 169226f7e0d275c1879551f37484ef6683579a5c (diff)
ocfs2: protect extent tree in ocfs2_prepare_inode_for_write()
When the extent tree is modified, it should be protected by the inode cluster lock and ip_alloc_sem.

The extent tree is accessed and modified in ocfs2_prepare_inode_for_write(), but it is not protected by ip_alloc_sem there.

The following is one such case. The function ocfs2_fiemap() is accessing the extent tree, which is being modified at the same time.

  kernel BUG at fs/ocfs2/extent_map.c:475!
  invalid opcode: 0000 [#1] SMP
  Modules linked in: tun ocfs2 ocfs2_nodemanager configfs ocfs2_stackglue [...]
  CPU: 16 PID: 14047 Comm: o2info Not tainted 4.1.12-124.23.1.el6uek.x86_64 #2
  Hardware name: Oracle Corporation ORACLE SERVER X7-2L/ASM, MB MECH, X7-2L, BIOS 42040600 10/19/2018
  task: ffff88019487e200 ti: ffff88003daa4000 task.ti: ffff88003daa4000
  RIP: ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
  Call Trace:
    ocfs2_fiemap+0x1e3/0x430 [ocfs2]
    do_vfs_ioctl+0x155/0x510
    SyS_ioctl+0x81/0xa0
    system_call_fastpath+0x18/0xd8
  Code: 18 48 c7 c6 60 7f 65 a0 31 c0 bb e2 ff ff ff 48 8b 4a 40 48 8b 7a 28 48 c7 c2 78 2d 66 a0 e8 38 4f 05 00 e9 28 fe ff ff 0f 1f 00 <0f> 0b 66 0f 1f 44 00 00 bb 86 ff ff ff e9 13 fe ff ff 66 0f 1f
  RIP ocfs2_get_clusters_nocache.isra.11+0x390/0x550 [ocfs2]
  ---[ end trace c8aa0c8180e869dc ]---
  Kernel panic - not syncing: Fatal exception
  Kernel Offset: disabled

This issue can be reproduced every week in a production environment.

This issue is related to the usage mode. If others use ocfs2 in this mode, the kernel will panic frequently.
[akpm@linux-foundation.org: coding style fixes]
[Fix new warning due to unused function by removing said function - Linus]
Link: http://lkml.kernel.org/r/1568772175-2906-2-git-send-email-sunny.s.zhang@oracle.com
Signed-off-by: Shuning Zhang <sunny.s.zhang@oracle.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Gang He <ghe@suse.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Joseph Qi <jiangqi903@gmail.com>
Cc: Changwei Ge <gechangwei@live.cn>
Cc: Jun Piao <piaojun@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/ocfs2/file.c')
-rw-r--r-- fs/ocfs2/file.c 134
1 files changed, 90 insertions, 44 deletions
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 53939bf9d7d2..9876db52913a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2098,53 +2098,89 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
2098 return 0; 2098 return 0;
2099} 2099}
2100 2100
2101static int ocfs2_prepare_inode_for_refcount(struct inode *inode, 2101static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
2102 struct file *file, 2102 struct buffer_head **di_bh,
2103 loff_t pos, size_t count, 2103 int meta_level,
2104 int *meta_level) 2104 int overwrite_io,
2105 int write_sem,
2106 int wait)
2105{ 2107{
2106 int ret; 2108 int ret = 0;
2107 struct buffer_head *di_bh = NULL;
2108 u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2109 u32 clusters =
2110 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2111 2109
2112 ret = ocfs2_inode_lock(inode, &di_bh, 1); 2110 if (wait)
2113 if (ret) { 2111 ret = ocfs2_inode_lock(inode, NULL, meta_level);
2114 mlog_errno(ret); 2112 else
2113 ret = ocfs2_try_inode_lock(inode,
2114 overwrite_io ? NULL : di_bh, meta_level);
2115 if (ret < 0)
2115 goto out; 2116 goto out;
2117
2118 if (wait) {
2119 if (write_sem)
2120 down_write(&OCFS2_I(inode)->ip_alloc_sem);
2121 else
2122 down_read(&OCFS2_I(inode)->ip_alloc_sem);
2123 } else {
2124 if (write_sem)
2125 ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
2126 else
2127 ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
2128
2129 if (!ret) {
2130 ret = -EAGAIN;
2131 goto out_unlock;
2132 }
2116 } 2133 }
2117 2134
2118 *meta_level = 1; 2135 return ret;
2119 2136
2120 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); 2137out_unlock:
2121 if (ret) 2138 brelse(*di_bh);
2122 mlog_errno(ret); 2139 ocfs2_inode_unlock(inode, meta_level);
2123out: 2140out:
2124 brelse(di_bh);
2125 return ret; 2141 return ret;
2126} 2142}
2127 2143
2144static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
2145 struct buffer_head **di_bh,
2146 int meta_level,
2147 int write_sem)
2148{
2149 if (write_sem)
2150 up_write(&OCFS2_I(inode)->ip_alloc_sem);
2151 else
2152 up_read(&OCFS2_I(inode)->ip_alloc_sem);
2153
2154 brelse(*di_bh);
2155 *di_bh = NULL;
2156
2157 if (meta_level >= 0)
2158 ocfs2_inode_unlock(inode, meta_level);
2159}
2160
2128static int ocfs2_prepare_inode_for_write(struct file *file, 2161static int ocfs2_prepare_inode_for_write(struct file *file,
2129 loff_t pos, size_t count, int wait) 2162 loff_t pos, size_t count, int wait)
2130{ 2163{
2131 int ret = 0, meta_level = 0, overwrite_io = 0; 2164 int ret = 0, meta_level = 0, overwrite_io = 0;
2165 int write_sem = 0;
2132 struct dentry *dentry = file->f_path.dentry; 2166 struct dentry *dentry = file->f_path.dentry;
2133 struct inode *inode = d_inode(dentry); 2167 struct inode *inode = d_inode(dentry);
2134 struct buffer_head *di_bh = NULL; 2168 struct buffer_head *di_bh = NULL;
2169 u32 cpos;
2170 u32 clusters;
2135 2171
2136 /* 2172 /*
2137 * We start with a read level meta lock and only jump to an ex 2173 * We start with a read level meta lock and only jump to an ex
2138 * if we need to make modifications here. 2174 * if we need to make modifications here.
2139 */ 2175 */
2140 for(;;) { 2176 for(;;) {
2141 if (wait) 2177 ret = ocfs2_inode_lock_for_extent_tree(inode,
2142 ret = ocfs2_inode_lock(inode, NULL, meta_level); 2178 &di_bh,
2143 else 2179 meta_level,
2144 ret = ocfs2_try_inode_lock(inode, 2180 overwrite_io,
2145 overwrite_io ? NULL : &di_bh, meta_level); 2181 write_sem,
2182 wait);
2146 if (ret < 0) { 2183 if (ret < 0) {
2147 meta_level = -1;
2148 if (ret != -EAGAIN) 2184 if (ret != -EAGAIN)
2149 mlog_errno(ret); 2185 mlog_errno(ret);
2150 goto out; 2186 goto out;
@@ -2156,15 +2192,8 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2156 */ 2192 */
2157 if (!wait && !overwrite_io) { 2193 if (!wait && !overwrite_io) {
2158 overwrite_io = 1; 2194 overwrite_io = 1;
2159 if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
2160 ret = -EAGAIN;
2161 goto out_unlock;
2162 }
2163 2195
2164 ret = ocfs2_overwrite_io(inode, di_bh, pos, count); 2196 ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
2165 brelse(di_bh);
2166 di_bh = NULL;
2167 up_read(&OCFS2_I(inode)->ip_alloc_sem);
2168 if (ret < 0) { 2197 if (ret < 0) {
2169 if (ret != -EAGAIN) 2198 if (ret != -EAGAIN)
2170 mlog_errno(ret); 2199 mlog_errno(ret);
@@ -2183,7 +2212,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2183 * set inode->i_size at the end of a write. */ 2212 * set inode->i_size at the end of a write. */
2184 if (should_remove_suid(dentry)) { 2213 if (should_remove_suid(dentry)) {
2185 if (meta_level == 0) { 2214 if (meta_level == 0) {
2186 ocfs2_inode_unlock(inode, meta_level); 2215 ocfs2_inode_unlock_for_extent_tree(inode,
2216 &di_bh,
2217 meta_level,
2218 write_sem);
2187 meta_level = 1; 2219 meta_level = 1;
2188 continue; 2220 continue;
2189 } 2221 }
@@ -2197,18 +2229,32 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2197 2229
2198 ret = ocfs2_check_range_for_refcount(inode, pos, count); 2230 ret = ocfs2_check_range_for_refcount(inode, pos, count);
2199 if (ret == 1) { 2231 if (ret == 1) {
2200 ocfs2_inode_unlock(inode, meta_level); 2232 ocfs2_inode_unlock_for_extent_tree(inode,
2201 meta_level = -1; 2233 &di_bh,
2202 2234 meta_level,
2203 ret = ocfs2_prepare_inode_for_refcount(inode, 2235 write_sem);
2204 file, 2236 ret = ocfs2_inode_lock_for_extent_tree(inode,
2205 pos, 2237 &di_bh,
2206 count, 2238 meta_level,
2207 &meta_level); 2239 overwrite_io,
2240 1,
2241 wait);
2242 write_sem = 1;
2243 if (ret < 0) {
2244 if (ret != -EAGAIN)
2245 mlog_errno(ret);
2246 goto out;
2247 }
2248
2249 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
2250 clusters =
2251 ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
2252 ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
2208 } 2253 }
2209 2254
2210 if (ret < 0) { 2255 if (ret < 0) {
2211 mlog_errno(ret); 2256 if (ret != -EAGAIN)
2257 mlog_errno(ret);
2212 goto out_unlock; 2258 goto out_unlock;
2213 } 2259 }
2214 2260
@@ -2219,10 +2265,10 @@ out_unlock:
2219 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, 2265 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2220 pos, count, wait); 2266 pos, count, wait);
2221 2267
2222 brelse(di_bh); 2268 ocfs2_inode_unlock_for_extent_tree(inode,
2223 2269 &di_bh,
2224 if (meta_level >= 0) 2270 meta_level,
2225 ocfs2_inode_unlock(inode, meta_level); 2271 write_sem);
2226 2272
2227out: 2273out:
2228 return ret; 2274 return ret;