author:    Dave Chinner <dchinner@redhat.com>  2015-02-23 05:43:37 -0500
committer: Dave Chinner <david@fromorbit.com>  2015-02-23 05:43:37 -0500
commit:    653c60b633a9019a54a80d64b5ed33ecb214823c
tree:      833b977a047a36b07e49524ba3afb295cf140287 /fs/xfs/xfs_inode.c
parent:    c517d838eb7d07bbe9507871fab3931deccff539
xfs: introduce mmap/truncate lock
Right now we cannot sanely serialise mmap against truncate or hole
punch. ->page_mkwrite is not able to take the locks that the read IO
path normally takes (i.e. the inode iolock), because that could
result in lock inversions: a read holds the iolock, takes a page
fault, and ->page_mkwrite would then try to take the iolock again.
Hence we cannot use an IO path lock to serialise page write faults
against truncate operations.
Instead, introduce a new lock that is used *only* in the
->page_mkwrite path and is the equivalent of the iolock. The lock
ordering in a page fault is i_mmaplock -> page lock -> i_ilock,
so in truncate we can take i_iolock -> i_mmaplock and thereby lock
out new write faults for the duration of the truncate.
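A minimal sketch of what that ordering permits, anticipating the
truncate-side callers that later patches add (newsize is a
placeholder, and truncate_setsize() stands in for the real page cache
invalidation work):

	/* fence off both syscall IO and new write faults, then truncate */
	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
	truncate_setsize(VFS_I(ip), newsize);	/* no write fault can race with this */
	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);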
Because the i_mmaplock is taken outside the page lock, we can hold it
across all the same operations we hold the i_iolock for. The only
difference is that we never hold the i_mmaplock in the normal IO
path, so we can never page fault while holding it. Hence there are
no recursion issues on the i_mmaplock, and we can use it to serialise
page fault IO against inode modification operations that affect the
IO path.
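The fault side would then look something like this sketch; it
anticipates the follow-up patches rather than quoting them, and
block_page_mkwrite()/xfs_get_blocks are used only as plausible
stand-ins:

	/* write fault: serialise against truncate and hole punch */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);	/* never held in the normal IO path */
	ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);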
This patch introduces the i_mmaplock infrastructure, lockdep
annotations and initialisation/destruction code. Use of the new lock
will be added in subsequent patches.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Diffstat (limited to 'fs/xfs/xfs_inode.c')
 -rw-r--r--  fs/xfs/xfs_inode.c | 128
 1 file changed, 97 insertions(+), 31 deletions(-)
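The initialisation/destruction code mentioned above lands outside
this file, so it does not appear in the diff below. Mirroring how the
existing i_iolock is set up, the init presumably looks like the
following sketch (the flags and the "xfsmmap" name are assumptions,
not taken from this diff):

	/* presumed init alongside i_iolock in inode setup (e.g. xfs_super.c) */
	mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI | MRLOCK_BARRIER,
		    "xfsmmap", ip->i_ino);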
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index daafa1f6d260..ac24818f7b2d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -117,24 +117,34 @@ xfs_ilock_attr_map_shared(
 }
 
 /*
- * The xfs inode contains 2 locks: a multi-reader lock called the
- * i_iolock and a multi-reader lock called the i_lock.  This routine
- * allows either or both of the locks to be obtained.
+ * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and
+ * the i_lock.  This routine allows various combinations of the locks to be
+ * obtained.
  *
- * The 2 locks should always be ordered so that the IO lock is
- * obtained first in order to prevent deadlock.
+ * The 3 locks should always be ordered so that the IO lock is obtained first,
+ * the mmap lock second and the ilock last in order to prevent deadlock.
  *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks
- *        to be locked.  It can be:
- *             XFS_IOLOCK_SHARED,
- *             XFS_IOLOCK_EXCL,
- *             XFS_ILOCK_SHARED,
- *             XFS_ILOCK_EXCL,
- *             XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
- *             XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
- *             XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
- *             XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ * Basic locking order:
+ *
+ * i_iolock -> i_mmap_lock -> page_lock -> i_ilock
+ *
+ * mmap_sem locking order:
+ *
+ * i_iolock -> page lock -> mmap_sem
+ * mmap_sem -> i_mmap_lock -> page_lock
+ *
+ * The difference in mmap_sem locking order mean that we cannot hold the
+ * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
+ * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
+ * in get_user_pages() to map the user pages into the kernel address space for
+ * direct IO. Similarly the i_iolock cannot be taken inside a page fault because
+ * page faults already hold the mmap_sem.
+ *
+ * Hence to serialise fully against both syscall and mmap based IO, we need to
+ * take both the i_iolock and the i_mmap_lock. These locks should *only* be both
+ * taken in places where we need to invalidate the page cache in a race
+ * free manner (e.g. truncate, hole punch and other extent manipulation
+ * functions).
  */
 void
 xfs_ilock(
@@ -150,6 +160,8 @@ xfs_ilock(
 	 */
 	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -159,6 +171,11 @@ xfs_ilock(
 	else if (lock_flags & XFS_IOLOCK_SHARED)
 		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
 
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+	else if (lock_flags & XFS_MMAPLOCK_SHARED)
+		mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+
 	if (lock_flags & XFS_ILOCK_EXCL)
 		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 	else if (lock_flags & XFS_ILOCK_SHARED)
@@ -191,6 +208,8 @@ xfs_ilock_nowait(
 	 */
 	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -202,21 +221,35 @@ xfs_ilock_nowait(
 		if (!mrtryaccess(&ip->i_iolock))
 			goto out;
 	}
+
+	if (lock_flags & XFS_MMAPLOCK_EXCL) {
+		if (!mrtryupdate(&ip->i_mmaplock))
+			goto out_undo_iolock;
+	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+		if (!mrtryaccess(&ip->i_mmaplock))
+			goto out_undo_iolock;
+	}
+
 	if (lock_flags & XFS_ILOCK_EXCL) {
 		if (!mrtryupdate(&ip->i_lock))
-			goto out_undo_iolock;
+			goto out_undo_mmaplock;
 	} else if (lock_flags & XFS_ILOCK_SHARED) {
 		if (!mrtryaccess(&ip->i_lock))
-			goto out_undo_iolock;
+			goto out_undo_mmaplock;
 	}
 	return 1;
 
-out_undo_iolock:
+out_undo_mmaplock:
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrunlock_excl(&ip->i_mmaplock);
+	else if (lock_flags & XFS_MMAPLOCK_SHARED)
+		mrunlock_shared(&ip->i_mmaplock);
+out_undo_iolock:
 	if (lock_flags & XFS_IOLOCK_EXCL)
 		mrunlock_excl(&ip->i_iolock);
 	else if (lock_flags & XFS_IOLOCK_SHARED)
 		mrunlock_shared(&ip->i_iolock);
 out:
 	return 0;
 }
 
@@ -244,6 +277,8 @@ xfs_iunlock(
 	 */
 	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -254,6 +289,11 @@ xfs_iunlock(
 	else if (lock_flags & XFS_IOLOCK_SHARED)
 		mrunlock_shared(&ip->i_iolock);
 
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrunlock_excl(&ip->i_mmaplock);
+	else if (lock_flags & XFS_MMAPLOCK_SHARED)
+		mrunlock_shared(&ip->i_mmaplock);
+
 	if (lock_flags & XFS_ILOCK_EXCL)
 		mrunlock_excl(&ip->i_lock);
 	else if (lock_flags & XFS_ILOCK_SHARED)
@@ -271,11 +311,14 @@ xfs_ilock_demote(
 	xfs_inode_t		*ip,
 	uint			lock_flags)
 {
-	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
+	ASSERT((lock_flags &
+		~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
 
 	if (lock_flags & XFS_ILOCK_EXCL)
 		mrdemote(&ip->i_lock);
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrdemote(&ip->i_mmaplock);
 	if (lock_flags & XFS_IOLOCK_EXCL)
 		mrdemote(&ip->i_iolock);
 
@@ -294,6 +337,12 @@ xfs_isilocked(
 		return rwsem_is_locked(&ip->i_lock.mr_lock);
 	}
 
+	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
+		if (!(lock_flags & XFS_MMAPLOCK_SHARED))
+			return !!ip->i_mmaplock.mr_writer;
+		return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
+	}
+
 	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
 		if (!(lock_flags & XFS_IOLOCK_SHARED))
 			return !!ip->i_iolock.mr_writer;
@@ -314,14 +363,27 @@ int xfs_lock_delays;
 #endif
 
 /*
- * Bump the subclass so xfs_lock_inodes() acquires each lock with
- * a different value
+ * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
+ * value. This shouldn't be called for page fault locking, but we also need to
+ * ensure we don't overrun the number of lockdep subclasses for the iolock or
+ * mmaplock as that is limited to 12 by the mmap lock lockdep annotations.
  */
 static inline int
 xfs_lock_inumorder(int lock_mode, int subclass)
 {
-	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
+		ASSERT(subclass + XFS_LOCK_INUMORDER <
+			(1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT)));
 		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+	}
+
+	if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
+		ASSERT(subclass + XFS_LOCK_INUMORDER <
+			(1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT)));
+		lock_mode |= (subclass + XFS_LOCK_INUMORDER) <<
+							XFS_MMAPLOCK_SHIFT;
+	}
+
 	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
 		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
 
@@ -440,10 +502,10 @@ again:
 }
 
 /*
- * xfs_lock_two_inodes() can only be used to lock one type of lock
- * at a time - the iolock or the ilock, but not both at once. If
- * we lock both at once, lockdep will report false positives saying
- * we have violated locking orders.
+ * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
+ * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
+ * lock more than one at a time, lockdep will report false positives saying we
+ * have violated locking orders.
  */
 void
 xfs_lock_two_inodes(
@@ -455,8 +517,12 @@ xfs_lock_two_inodes(
 	int			attempts = 0;
 	xfs_log_item_t		*lp;
 
-	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
-		ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
+	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
+		ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
+		ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
+	} else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
+		ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
+
 	ASSERT(ip0->i_ino != ip1->i_ino);
 
 	if (ip0->i_ino > ip1->i_ino) {
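To tie the pieces together, here is a hypothetical caller exercising
the API changed above; the flag names and helpers come from this
patch, but this exact sequence is illustrative only:

	/* hypothetical: fence off both syscall IO and page faults, then relax */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL))
		xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));

	/* invalidate the page cache, manipulate extents, etc., race free */

	/* demote to shared to let readers back in, then drop both locks */
	xfs_ilock_demote(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED);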