author	Dave Chinner <david@fromorbit.com>	2015-02-23 18:27:47 -0500
committer	Dave Chinner <david@fromorbit.com>	2015-02-23 18:27:47 -0500
commit	88e8fda99a4c99a1a6482510655dbd88cccd221b (patch)
tree	b5f10ecc7c99ebf3eeb7a6733c15d3930b5f8a63 /fs/xfs
parent	4225441a1eec45241efe529d23403d8ca3d1d71b (diff)
parent	723cac48473358939759885a18e8df113ea96138 (diff)

Merge branch 'xfs-mmap-lock' into for-next
Diffstat (limited to 'fs/xfs')
-rw-r--r--	fs/xfs/xfs_bmap_util.c	31
-rw-r--r--	fs/xfs/xfs_file.c	70
-rw-r--r--	fs/xfs/xfs_inode.c	128
-rw-r--r--	fs/xfs/xfs_inode.h	29
-rw-r--r--	fs/xfs/xfs_ioctl.c	5
-rw-r--r--	fs/xfs/xfs_iops.c	63
-rw-r--r--	fs/xfs/xfs_super.c	2
-rw-r--r--	fs/xfs/xfs_trace.h	3
8 files changed, 217 insertions(+), 114 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 22a5dcb70b32..7efa23e72a90 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1599,13 +1599,6 @@ xfs_swap_extent_flush(
 	/* Verify O_DIRECT for ftmp */
 	if (VFS_I(ip)->i_mapping->nrpages)
 		return -EINVAL;
-
-	/*
-	 * Don't try to swap extents on mmap()d files because we can't lock
-	 * out races against page faults safely.
-	 */
-	if (mapping_mapped(VFS_I(ip)->i_mapping))
-		return -EBUSY;
 	return 0;
 }
 
@@ -1633,13 +1626,14 @@ xfs_swap_extents(
 	}
 
 	/*
-	 * Lock up the inodes against other IO and truncate to begin with.
-	 * Then we can ensure the inodes are flushed and have no page cache
-	 * safely. Once we have done this we can take the ilocks and do the rest
-	 * of the checks.
+	 * Lock the inodes against other IO, page faults and truncate to
+	 * begin with. Then we can ensure the inodes are flushed and have no
+	 * page cache safely. Once we have done this we can take the ilocks and
+	 * do the rest of the checks.
 	 */
-	lock_flags = XFS_IOLOCK_EXCL;
+	lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
 
 	/* Verify that both files have the same format */
 	if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
@@ -1666,8 +1660,16 @@ xfs_swap_extents(
 		xfs_trans_cancel(tp, 0);
 		goto out_unlock;
 	}
+
+	/*
+	 * Lock and join the inodes to the transaction so that transaction
+	 * commit or cancel will unlock the inodes from this point onwards.
+	 */
 	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
 	lock_flags |= XFS_ILOCK_EXCL;
+	xfs_trans_ijoin(tp, ip, lock_flags);
+	xfs_trans_ijoin(tp, tip, lock_flags);
+
 
 	/* Verify all data are being swapped */
 	if (sxp->sx_offset != 0 ||
@@ -1720,9 +1722,6 @@ xfs_swap_extents(
 		goto out_trans_cancel;
 	}
 
-	xfs_trans_ijoin(tp, ip, lock_flags);
-	xfs_trans_ijoin(tp, tip, lock_flags);
-
 	/*
 	 * Before we've swapped the forks, lets set the owners of the forks
 	 * appropriately. We have to do this as we are demand paging the btree
@@ -1856,5 +1855,5 @@ out_unlock:
 
 out_trans_cancel:
 	xfs_trans_cancel(tp, 0);
-	goto out_unlock;
+	goto out;
 }
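The xfs_bmap_util.c changes above replace the old mapping_mapped() bail-out with a fixed two-class lock order: both inodes take the IO lock first, then the mmap lock, each class in ascending inode-number order. A minimal user-space sketch of that ordering discipline, using POSIX rwlocks in place of mrlock_t; struct inode, lock_two() and the pick_*() helpers here are illustrative stand-ins, not kernel API:

#include <pthread.h>
#include <stdio.h>

struct inode {
	unsigned long		ino;
	pthread_rwlock_t	iolock;		/* class 1: always taken first */
	pthread_rwlock_t	mmaplock;	/* class 2: always taken second */
};

static pthread_rwlock_t *pick_io(struct inode *ip)   { return &ip->iolock; }
static pthread_rwlock_t *pick_mmap(struct inode *ip) { return &ip->mmaplock; }

/* Lock one lock class on two inodes in ascending inode-number order. */
static void lock_two(struct inode *a, struct inode *b,
		     pthread_rwlock_t *(*pick)(struct inode *))
{
	if (a->ino > b->ino) {
		struct inode *t = a; a = b; b = t;
	}
	pthread_rwlock_wrlock(pick(a));
	pthread_rwlock_wrlock(pick(b));
}

int main(void)
{
	struct inode a = { .ino = 1 }, b = { .ino = 2 };

	pthread_rwlock_init(&a.iolock, NULL);
	pthread_rwlock_init(&a.mmaplock, NULL);
	pthread_rwlock_init(&b.iolock, NULL);
	pthread_rwlock_init(&b.mmaplock, NULL);

	/* One class at a time, IO class before mmap class, mirroring
	 * xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL) followed by
	 * xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL). */
	lock_two(&a, &b, pick_io);
	lock_two(&a, &b, pick_mmap);
	puts("both inodes locked against IO and page faults");

	/* Unlock in reverse order of acquisition. */
	pthread_rwlock_unlock(&b.mmaplock);
	pthread_rwlock_unlock(&a.mmaplock);
	pthread_rwlock_unlock(&b.iolock);
	pthread_rwlock_unlock(&a.iolock);
	return 0;
}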
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index a2e1cb8a568b..b101e80f2862 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -847,6 +847,9 @@ xfs_file_fallocate(
847 if (error) 847 if (error)
848 goto out_unlock; 848 goto out_unlock;
849 849
850 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
851 iolock |= XFS_MMAPLOCK_EXCL;
852
850 if (mode & FALLOC_FL_PUNCH_HOLE) { 853 if (mode & FALLOC_FL_PUNCH_HOLE) {
851 error = xfs_free_file_space(ip, offset, len); 854 error = xfs_free_file_space(ip, offset, len);
852 if (error) 855 if (error)
@@ -997,20 +1000,6 @@ xfs_file_mmap(
 }
 
 /*
- * mmap()d file has taken write protection fault and is being made
- * writable. We can set the page state up correctly for a writable
- * page, which means we can do correct delalloc accounting (ENOSPC
- * checking!) and unwritten extent mapping.
- */
-STATIC int
-xfs_vm_page_mkwrite(
-	struct vm_area_struct	*vma,
-	struct vm_fault		*vmf)
-{
-	return block_page_mkwrite(vma, vmf, xfs_get_blocks);
-}
-
-/*
  * This type is designed to indicate the type of offset we would like
  * to search from page cache for xfs_seek_hole_data().
  */
@@ -1385,6 +1374,55 @@ xfs_file_llseek(
 	}
 }
 
+/*
+ * Locking for serialisation of IO during page faults. This results in a lock
+ * ordering of:
+ *
+ * mmap_sem (MM)
+ *   i_mmap_lock (XFS - truncate serialisation)
+ *     page_lock (MM)
+ *       i_lock (XFS - extent map serialisation)
+ */
+STATIC int
+xfs_filemap_fault(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
+	int			error;
+
+	trace_xfs_filemap_fault(ip);
+
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	error = filemap_fault(vma, vmf);
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+	return error;
+}
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
+ */
+STATIC int
+xfs_filemap_page_mkwrite(
+	struct vm_area_struct	*vma,
+	struct vm_fault		*vmf)
+{
+	struct xfs_inode	*ip = XFS_I(vma->vm_file->f_mapping->host);
+	int			error;
+
+	trace_xfs_filemap_page_mkwrite(ip);
+
+	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+
+	return error;
+}
+
 const struct file_operations xfs_file_operations = {
 	.llseek		= xfs_file_llseek,
 	.read		= new_sync_read,
@@ -1417,7 +1455,7 @@ const struct file_operations xfs_dir_file_operations = {
 };
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
-	.fault		= filemap_fault,
+	.fault		= xfs_filemap_fault,
 	.map_pages	= filemap_map_pages,
-	.page_mkwrite	= xfs_vm_page_mkwrite,
+	.page_mkwrite	= xfs_filemap_page_mkwrite,
 };
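The two handlers added above are deliberately thin: take i_mmaplock shared, call the generic MM routine, drop the lock. A hedged user-space sketch of that wrapper shape, with a pthread rwlock standing in for mrlock_t and a stub in place of filemap_fault(); all names are illustrative:

#include <pthread.h>
#include <stdio.h>

typedef int (*fault_handler_t)(void *vmf);

struct file_state {
	pthread_rwlock_t	mmaplock;	/* stands in for ip->i_mmaplock */
	fault_handler_t		generic_fault;	/* stands in for filemap_fault() */
};

/* Shared lock around the generic handler: concurrent faults are fine,
 * but an exclusive holder (truncate, hole punch) excludes them all. */
static int wrapped_fault(struct file_state *f, void *vmf)
{
	int error;

	pthread_rwlock_rdlock(&f->mmaplock);
	error = f->generic_fault(vmf);
	pthread_rwlock_unlock(&f->mmaplock);
	return error;
}

static int generic_fault_stub(void *vmf)
{
	(void)vmf;
	puts("fault handled under the shared mmap lock");
	return 0;
}

int main(void)
{
	struct file_state f = { .generic_fault = generic_fault_stub };

	pthread_rwlock_init(&f.mmaplock, NULL);
	return wrapped_fault(&f, NULL);
}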
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 698da0388f22..5a44f1cc820c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -117,24 +117,34 @@ xfs_ilock_attr_map_shared(
 }
 
 /*
- * The xfs inode contains 2 locks: a multi-reader lock called the
- * i_iolock and a multi-reader lock called the i_lock. This routine
- * allows either or both of the locks to be obtained.
- *
- * The 2 locks should always be ordered so that the IO lock is
- * obtained first in order to prevent deadlock.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks
- *       to be locked. It can be:
- *		XFS_IOLOCK_SHARED,
- *		XFS_IOLOCK_EXCL,
- *		XFS_ILOCK_SHARED,
- *		XFS_ILOCK_EXCL,
- *		XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
- *		XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
- *		XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
- *		XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ * The xfs inode contains 3 multi-reader locks: the i_iolock, the i_mmap_lock
+ * and the i_lock. This routine allows various combinations of the locks to be
+ * obtained.
+ *
+ * The 3 locks should always be ordered so that the IO lock is obtained first,
+ * the mmap lock second and the ilock last in order to prevent deadlock.
+ *
+ * Basic locking order:
+ *
+ * i_iolock -> i_mmap_lock -> page_lock -> i_ilock
+ *
+ * mmap_sem locking order:
+ *
+ * i_iolock -> page lock -> mmap_sem
+ * mmap_sem -> i_mmap_lock -> page_lock
+ *
+ * The difference in mmap_sem locking order means that we cannot hold the
+ * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
+ * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
+ * in get_user_pages() to map the user pages into the kernel address space for
+ * direct IO. Similarly the i_iolock cannot be taken inside a page fault
+ * because page faults already hold the mmap_sem.
+ *
+ * Hence to serialise fully against both syscall and mmap based IO, we need to
+ * take both the i_iolock and the i_mmap_lock. These locks should *only* be
+ * both taken in places where we need to invalidate the page cache in a race
+ * free manner (e.g. truncate, hole punch and other extent manipulation
+ * functions).
 */
void
xfs_ilock(
@@ -150,6 +160,8 @@ xfs_ilock(
 	 */
 	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -159,6 +171,11 @@ xfs_ilock(
 	else if (lock_flags & XFS_IOLOCK_SHARED)
 		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
 
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+	else if (lock_flags & XFS_MMAPLOCK_SHARED)
+		mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
+
 	if (lock_flags & XFS_ILOCK_EXCL)
 		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 	else if (lock_flags & XFS_ILOCK_SHARED)
@@ -191,6 +208,8 @@ xfs_ilock_nowait(
 	 */
 	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -202,21 +221,35 @@ xfs_ilock_nowait(
 		if (!mrtryaccess(&ip->i_iolock))
 			goto out;
 	}
+
+	if (lock_flags & XFS_MMAPLOCK_EXCL) {
+		if (!mrtryupdate(&ip->i_mmaplock))
+			goto out_undo_iolock;
+	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
+		if (!mrtryaccess(&ip->i_mmaplock))
+			goto out_undo_iolock;
+	}
+
 	if (lock_flags & XFS_ILOCK_EXCL) {
 		if (!mrtryupdate(&ip->i_lock))
-			goto out_undo_iolock;
+			goto out_undo_mmaplock;
 	} else if (lock_flags & XFS_ILOCK_SHARED) {
 		if (!mrtryaccess(&ip->i_lock))
-			goto out_undo_iolock;
+			goto out_undo_mmaplock;
 	}
 	return 1;
 
- out_undo_iolock:
+out_undo_mmaplock:
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrunlock_excl(&ip->i_mmaplock);
+	else if (lock_flags & XFS_MMAPLOCK_SHARED)
+		mrunlock_shared(&ip->i_mmaplock);
+out_undo_iolock:
 	if (lock_flags & XFS_IOLOCK_EXCL)
 		mrunlock_excl(&ip->i_iolock);
 	else if (lock_flags & XFS_IOLOCK_SHARED)
 		mrunlock_shared(&ip->i_iolock);
- out:
+out:
 	return 0;
 }
 
@@ -244,6 +277,8 @@ xfs_iunlock(
 	 */
 	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
+	       (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
 	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -254,6 +289,11 @@ xfs_iunlock(
 	else if (lock_flags & XFS_IOLOCK_SHARED)
 		mrunlock_shared(&ip->i_iolock);
 
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrunlock_excl(&ip->i_mmaplock);
+	else if (lock_flags & XFS_MMAPLOCK_SHARED)
+		mrunlock_shared(&ip->i_mmaplock);
+
 	if (lock_flags & XFS_ILOCK_EXCL)
 		mrunlock_excl(&ip->i_lock);
 	else if (lock_flags & XFS_ILOCK_SHARED)
@@ -271,11 +311,14 @@ xfs_ilock_demote(
 	xfs_inode_t		*ip,
 	uint			lock_flags)
 {
-	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
+	ASSERT((lock_flags &
+		~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
 
 	if (lock_flags & XFS_ILOCK_EXCL)
 		mrdemote(&ip->i_lock);
+	if (lock_flags & XFS_MMAPLOCK_EXCL)
+		mrdemote(&ip->i_mmaplock);
 	if (lock_flags & XFS_IOLOCK_EXCL)
 		mrdemote(&ip->i_iolock);
 
@@ -294,6 +337,12 @@ xfs_isilocked(
 		return rwsem_is_locked(&ip->i_lock.mr_lock);
 	}
 
+	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
+		if (!(lock_flags & XFS_MMAPLOCK_SHARED))
+			return !!ip->i_mmaplock.mr_writer;
+		return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
+	}
+
 	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
 		if (!(lock_flags & XFS_IOLOCK_SHARED))
 			return !!ip->i_iolock.mr_writer;
@@ -314,14 +363,27 @@ int xfs_lock_delays;
 #endif
 
 /*
- * Bump the subclass so xfs_lock_inodes() acquires each lock with
- * a different value
+ * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
+ * value. This shouldn't be called for page fault locking, but we also need to
+ * ensure we don't overrun the number of lockdep subclasses for the iolock or
+ * mmaplock as that is limited to 12 by the mmap lock lockdep annotations.
 */
static inline int
xfs_lock_inumorder(int lock_mode, int subclass)
{
-	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
+		ASSERT(subclass + XFS_LOCK_INUMORDER <
+			(1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT)));
 		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
+	}
+
+	if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
+		ASSERT(subclass + XFS_LOCK_INUMORDER <
+			(1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT)));
+		lock_mode |= (subclass + XFS_LOCK_INUMORDER) <<
+							XFS_MMAPLOCK_SHIFT;
+	}
+
 	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
 		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
 
@@ -440,10 +502,10 @@ again:
 }
 
 /*
- * xfs_lock_two_inodes() can only be used to lock one type of lock
- * at a time - the iolock or the ilock, but not both at once. If
- * we lock both at once, lockdep will report false positives saying
- * we have violated locking orders.
+ * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
+ * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
+ * lock more than one at a time, lockdep will report false positives saying we
+ * have violated locking orders.
 */
void
xfs_lock_two_inodes(
@@ -455,8 +517,12 @@ xfs_lock_two_inodes(
 	int			attempts = 0;
 	xfs_log_item_t		*lp;
 
-	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
-		ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
+	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
+		ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
+		ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
+	} else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
+		ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
+
 	ASSERT(ip0->i_ino != ip1->i_ino);
 
 	if (ip0->i_ino > ip1->i_ino) {
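xfs_ilock_nowait() above gains a second unwind label so that a failed trylock backs out only the classes already held, in reverse acquisition order. The same trylock-and-unwind shape in a small, runnable user-space sketch (POSIX rwlocks assumed; ilock_nowait() here is an analogue, not the kernel function):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t iolock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t mmaplock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t ilock = PTHREAD_RWLOCK_INITIALIZER;

/* Attempt each lock class in order; on failure, release the classes
 * already taken in reverse order, exactly like xfs_ilock_nowait(). */
static int ilock_nowait(void)
{
	if (pthread_rwlock_trywrlock(&iolock))
		goto out;
	if (pthread_rwlock_trywrlock(&mmaplock))
		goto out_undo_iolock;
	if (pthread_rwlock_trywrlock(&ilock))
		goto out_undo_mmaplock;
	return 1;			/* all three classes held */

out_undo_mmaplock:
	pthread_rwlock_unlock(&mmaplock);
out_undo_iolock:
	pthread_rwlock_unlock(&iolock);
out:
	return 0;			/* nothing held on failure */
}

int main(void)
{
	printf("ilock_nowait: %d\n", ilock_nowait());
	return 0;
}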
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index c73b63d51bc1..8f22d20368d8 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -56,6 +56,7 @@ typedef struct xfs_inode {
 	struct xfs_inode_log_item *i_itemp;	/* logging information */
 	mrlock_t		i_lock;		/* inode lock */
 	mrlock_t		i_iolock;	/* inode IO lock */
+	mrlock_t		i_mmaplock;	/* inode mmap IO lock */
 	atomic_t		i_pincount;	/* inode pin count */
 	spinlock_t		i_flags_lock;	/* inode i_flags lock */
 	/* Miscellaneous state. */
@@ -263,15 +264,20 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 #define	XFS_IOLOCK_SHARED	(1<<1)
 #define	XFS_ILOCK_EXCL		(1<<2)
 #define	XFS_ILOCK_SHARED	(1<<3)
+#define	XFS_MMAPLOCK_EXCL	(1<<4)
+#define	XFS_MMAPLOCK_SHARED	(1<<5)
 
 #define XFS_LOCK_MASK		(XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
-				| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
+				| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
+				| XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)
 
 #define XFS_LOCK_FLAGS \
 	{ XFS_IOLOCK_EXCL,	"IOLOCK_EXCL" }, \
 	{ XFS_IOLOCK_SHARED,	"IOLOCK_SHARED" }, \
 	{ XFS_ILOCK_EXCL,	"ILOCK_EXCL" }, \
-	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }
+	{ XFS_ILOCK_SHARED,	"ILOCK_SHARED" }, \
+	{ XFS_MMAPLOCK_EXCL,	"MMAPLOCK_EXCL" }, \
+	{ XFS_MMAPLOCK_SHARED,	"MMAPLOCK_SHARED" }
 
 
 /*
@@ -302,17 +308,26 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 #define XFS_IOLOCK_SHIFT	16
 #define	XFS_IOLOCK_PARENT	(XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
 
+#define XFS_MMAPLOCK_SHIFT	20
+
 #define XFS_ILOCK_SHIFT		24
 #define	XFS_ILOCK_PARENT	(XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
 #define	XFS_ILOCK_RTBITMAP	(XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
 #define	XFS_ILOCK_RTSUM		(XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
 
-#define XFS_IOLOCK_DEP_MASK	0x00ff0000
+#define XFS_IOLOCK_DEP_MASK	0x000f0000
+#define XFS_MMAPLOCK_DEP_MASK	0x00f00000
 #define XFS_ILOCK_DEP_MASK	0xff000000
-#define XFS_LOCK_DEP_MASK	(XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK)
-
-#define XFS_IOLOCK_DEP(flags)	(((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
-#define XFS_ILOCK_DEP(flags)	(((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
+#define XFS_LOCK_DEP_MASK	(XFS_IOLOCK_DEP_MASK | \
+				 XFS_MMAPLOCK_DEP_MASK | \
+				 XFS_ILOCK_DEP_MASK)
+
+#define XFS_IOLOCK_DEP(flags)	(((flags) & XFS_IOLOCK_DEP_MASK) \
+					>> XFS_IOLOCK_SHIFT)
+#define XFS_MMAPLOCK_DEP(flags)	(((flags) & XFS_MMAPLOCK_DEP_MASK) \
+					>> XFS_MMAPLOCK_SHIFT)
+#define XFS_ILOCK_DEP(flags)	(((flags) & XFS_ILOCK_DEP_MASK) \
+					>> XFS_ILOCK_SHIFT)
 
 /*
  * For multiple groups support: if S_ISGID bit is set in the parent
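The masks above carve lockdep subclass fields out of the flag word: four bits for the iolock (bits 16-19), four for the new mmaplock (bits 20-23), eight for the ilock (bits 24-31), which is why XFS_IOLOCK_DEP_MASK shrinks from 0x00ff0000 to 0x000f0000. A standalone round-trip check of that layout; the constants are copied from the header and main() is purely illustrative:

#include <assert.h>
#include <stdio.h>

#define XFS_IOLOCK_SHIFT	16
#define XFS_MMAPLOCK_SHIFT	20
#define XFS_ILOCK_SHIFT		24

#define XFS_IOLOCK_DEP_MASK	0x000f0000
#define XFS_MMAPLOCK_DEP_MASK	0x00f00000
#define XFS_ILOCK_DEP_MASK	0xff000000

#define XFS_MMAPLOCK_DEP(flags)	(((flags) & XFS_MMAPLOCK_DEP_MASK) \
					>> XFS_MMAPLOCK_SHIFT)

int main(void)
{
	unsigned int subclass = 3;	/* must fit in the 4-bit mmaplock field */
	unsigned int flags = subclass << XFS_MMAPLOCK_SHIFT;

	/* Same bound the ASSERT in xfs_lock_inumorder() enforces: the
	 * subclass must not spill into the ilock field above it. */
	assert(subclass < (1u << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT)));

	/* Packing then extracting the subclass must round-trip. */
	assert(XFS_MMAPLOCK_DEP(flags) == subclass);
	printf("mmaplock dep field round-trips: %u\n", XFS_MMAPLOCK_DEP(flags));
	return 0;
}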
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ac4feae45eb3..4ee44ddfdfb7 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -631,7 +631,7 @@ xfs_ioc_space(
 
 	if (filp->f_flags & O_DSYNC)
 		flags |= XFS_PREALLOC_SYNC;
-	if (ioflags & XFS_IO_INVIS)	
+	if (ioflags & XFS_IO_INVIS)
 		flags |= XFS_PREALLOC_INVISIBLE;
 
 	error = mnt_want_write_file(filp);
@@ -643,6 +643,9 @@ xfs_ioc_space(
 	if (error)
 		goto out_unlock;
 
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+	iolock |= XFS_MMAPLOCK_EXCL;
+
 	switch (bf->l_whence) {
 	case 0: /*SEEK_SET*/
 		break;
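xfs_ioc_space() now uses the same idiom as xfs_file_fallocate() above: each additional lock is OR-ed into the iolock flags so a single xfs_iunlock(ip, iolock) in the common exit path releases exactly what is held, wherever the function bails out. A hedged sketch of that flag-accumulation idiom; the flag values and stub lock/unlock functions are invented for illustration:

#include <stdio.h>

#define IOLOCK_EXCL	(1 << 0)
#define MMAPLOCK_EXCL	(1 << 4)

static void ilock(unsigned int flags)   { printf("lock   0x%x\n", flags); }
static void iunlock(unsigned int flags) { printf("unlock 0x%x\n", flags); }

int main(void)
{
	unsigned int iolock = IOLOCK_EXCL;

	ilock(iolock);			/* IO lock first */

	ilock(MMAPLOCK_EXCL);		/* then the mmap lock ... */
	iolock |= MMAPLOCK_EXCL;	/* ... recorded for the exit path */

	/* Single exit point releases whatever was accumulated. */
	iunlock(iolock);
	return 0;
}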
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 3ccc28e8d3a0..8b9e6887e315 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -771,6 +771,7 @@ xfs_setattr_size(
 		return error;
 
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
 	ASSERT(S_ISREG(ip->i_d.di_mode));
 	ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
 		ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
@@ -834,55 +835,27 @@ xfs_setattr_size(
 	inode_dio_wait(inode);
 
 	/*
-	 * Do all the page cache truncate work outside the transaction context
-	 * as the "lock" order is page lock->log space reservation. i.e.
-	 * locking pages inside the transaction can ABBA deadlock with
-	 * writeback. We have to do the VFS inode size update before we truncate
-	 * the pagecache, however, to avoid racing with page faults beyond the
-	 * new EOF they are not serialised against truncate operations except by
-	 * page locks and size updates.
+	 * We've already locked out new page faults, so now we can safely remove
+	 * pages from the page cache knowing they won't get refaulted until we
+	 * drop the XFS_MMAPLOCK_EXCL lock after the extent manipulations are
+	 * complete. The truncate_setsize() call also cleans partial EOF page
+	 * PTEs on extending truncates and hence ensures sub-page block size
+	 * filesystems are correctly handled, too.
 	 *
-	 * Hence we are in a situation where a truncate can fail with ENOMEM
-	 * from xfs_trans_reserve(), but having already truncated the in-memory
-	 * version of the file (i.e. made user visible changes). There's not
-	 * much we can do about this, except to hope that the caller sees ENOMEM
-	 * and retries the truncate operation.
+	 * We have to do all the page cache truncate work outside the
+	 * transaction context as the "lock" order is page lock->log space
+	 * reservation as defined by extent allocation in the writeback path.
+	 * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
+	 * having already truncated the in-memory version of the file (i.e. made
+	 * user visible changes). There's not much we can do about this, except
+	 * to hope that the caller sees ENOMEM and retries the truncate
+	 * operation.
 	 */
 	error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
 	if (error)
 		return error;
 	truncate_setsize(inode, newsize);
 
-	/*
-	 * The "we can't serialise against page faults" pain gets worse.
-	 *
-	 * If the file is mapped then we have to clean the page at the old EOF
-	 * when extending the file. Extending the file can expose changes the
-	 * underlying page mapping (e.g. from beyond EOF to a hole or
-	 * unwritten), and so on the next attempt to write to that page we need
-	 * to remap it for write. i.e. we need .page_mkwrite() to be called.
-	 * Hence we need to clean the page to clean the pte and so a new write
-	 * fault will be triggered appropriately.
-	 *
-	 * If we do it before we change the inode size, then we can race with a
-	 * page fault that maps the page with exactly the same problem. If we do
-	 * it after we change the file size, then a new page fault can come in
-	 * and allocate space before we've run the rest of the truncate
-	 * transaction. That's kinda grotesque, but it's better than have data
-	 * over a hole, and so that's the lesser evil that has been chosen here.
-	 *
-	 * The real solution, however, is to have some mechanism for locking out
-	 * page faults while a truncate is in progress.
-	 */
-	if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) {
-		error = filemap_write_and_wait_range(
-				VFS_I(ip)->i_mapping,
-				round_down(oldsize, PAGE_CACHE_SIZE),
-				round_up(oldsize, PAGE_CACHE_SIZE) - 1);
-		if (error)
-			return error;
-	}
-
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
 	if (error)
@@ -981,8 +954,12 @@ xfs_vn_setattr(
 
 		xfs_ilock(ip, iolock);
 		error = xfs_break_layouts(dentry->d_inode, &iolock);
-		if (!error)
+		if (!error) {
+			xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+			iolock |= XFS_MMAPLOCK_EXCL;
+
 			error = xfs_setattr_size(ip, iattr);
+		}
 		xfs_iunlock(ip, iolock);
 	} else {
 		error = xfs_setattr_nonsize(ip, iattr, 0);
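Holding XFS_MMAPLOCK_EXCL across xfs_setattr_size() means a page fault can no longer observe a half-finished truncate, which is what lets the old EOF-page flushing workaround be deleted. A rough user-space model of that exclusion, assuming POSIX rwlocks; the isize/mapped pair merely stands in for the inode state a fault would otherwise race against:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t mmaplock = PTHREAD_RWLOCK_INITIALIZER;
static long isize = 4096;
static long mapped = 4096;	/* must always equal isize when observed */

static void *fault(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&mmaplock);	/* XFS_MMAPLOCK_SHARED */
	/* A fault never sees isize and the mapping out of sync. */
	printf("fault sees isize=%ld mapped=%ld\n", isize, mapped);
	pthread_rwlock_unlock(&mmaplock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, fault, NULL);

	pthread_rwlock_wrlock(&mmaplock);	/* XFS_MMAPLOCK_EXCL */
	isize = 0;				/* shrink the file ... */
	usleep(1000);				/* ... not atomic ... */
	mapped = 0;				/* ... but unobservable */
	pthread_rwlock_unlock(&mmaplock);

	pthread_join(t, NULL);
	return 0;
}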
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f32ad64c4d05..3ad0b17885f1 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -966,6 +966,8 @@ xfs_fs_inode_init_once(
 	atomic_set(&ip->i_pincount, 0);
 	spin_lock_init(&ip->i_flags_lock);
 
+	mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
+		     "xfsino", ip->i_ino);
 	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
 		     "xfsino", ip->i_ino);
 }
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 51372e34d988..b1e059b398c0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -685,6 +685,9 @@ DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
 
+DEFINE_INODE_EVENT(xfs_filemap_fault);
+DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
+
 DECLARE_EVENT_CLASS(xfs_iref_class,
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
 	TP_ARGS(ip, caller_ip),