author		Dave Chinner <david@fromorbit.com>	2011-08-25 03:17:02 -0400
committer	Alex Elder <aelder@sgi.com>	2011-10-11 22:14:59 -0400
commit		7271d243f9d1b4106289e4cf876c8b1203de59ab (patch)
tree		0fa71fcbab0ab9b7aa89cdd37bf05564ecc3ac4d /fs/xfs/xfs_file.c
parent		0c38a2512df272b14ef4238b476a2e4f70da1479 (diff)
xfs: don't serialise adjacent concurrent direct IO appending writes
For append write workloads, extending the file requires a certain amount of
exclusive locking to be done up front to ensure sanity in things like making
sure we've zeroed any allocated regions between the old EOF and the start of
the new IO.

For single threads, this typically isn't a problem, and for large IOs we don't
serialise enough for it to be a problem for two threads on really fast block
devices. However, for smaller IOs and larger thread counts we have a problem.

Take 4 concurrent, sequential, single-block-sized and aligned IOs. After the
first IO is submitted but before it completes, we end up with this state:

        IO 1    IO 2    IO 3    IO 4
      +-------+-------+-------+-------+
      ^       ^
      |       |
      |       |
      |       |
      |       \- ip->i_new_size
      \- ip->i_size

The IO is done without exclusive locking because offset <= ip->i_size. When we
submit IO 2, we see offset > ip->i_size and grab the IO lock exclusive, because
there is a chance we need to do EOF zeroing. However, there is already an IO in
progress that avoids the need for EOF zeroing because offset <= ip->i_new_size,
so we could avoid holding the IO lock exclusive here. Hence after submission of
the second IO, we'd end up in this state:

        IO 1    IO 2    IO 3    IO 4
      +-------+-------+-------+-------+
      ^               ^
      |               |
      |               |
      |               |
      |               \- ip->i_new_size
      \- ip->i_size

There is no need to grab the i_mutex or the IO lock in exclusive mode if we
don't need to invalidate the page cache. Taking these locks on every direct IO
effectively serialises them, as taking the IO lock in exclusive mode has to
wait for all shared holders to drop the lock. That only happens when IO is
complete, so it effectively prevents dispatch of concurrent direct IO writes to
the same inode. And so you can see that the third concurrent IO would avoid
exclusive locking for the same reason the second one did.

Fixing this is a bit more complex than that, because we need to hold a
write-submission-local copy of ip->i_new_size so that clearing the value is
only done if no other thread has updated it before our IO completes.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
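For illustration, the workload described above can be reproduced from userspace
with a small multi-threaded O_DIRECT appender. This is a minimal sketch, not
part of the commit; the file path, block size, and thread count are assumptions:

/*
 * Minimal sketch of the workload described above: four threads issuing
 * concurrent, sequential, single-block-sized and block-aligned O_DIRECT
 * appending writes. Path, block size, and thread count are illustrative.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define BLKSZ		4096	/* assumed filesystem block size */
#define NTHREADS	4

static int fd;

static void *append_block(void *arg)
{
	long idx = (long)arg;
	void *buf;

	/* O_DIRECT requires block-aligned buffers. */
	if (posix_memalign(&buf, BLKSZ, BLKSZ))
		return NULL;
	memset(buf, 'x', BLKSZ);

	/*
	 * Each thread writes one block at its own offset past the old EOF.
	 * Before this patch, every write but the first takes the iolock
	 * exclusive for possible EOF zeroing, serialising the dispatches.
	 */
	if (pwrite(fd, buf, BLKSZ, (off_t)idx * BLKSZ) != BLKSZ)
		perror("pwrite");
	free(buf);
	return NULL;
}

int main(void)
{
	pthread_t t[NTHREADS];
	long i;

	fd = open("/mnt/xfs/testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (i = 0; i < NTHREADS; i++)
		pthread_create(&t[i], NULL, append_block, (void *)i);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(t[i], NULL);
	close(fd);
	return 0;
}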
Diffstat (limited to 'fs/xfs/xfs_file.c')
 fs/xfs/xfs_file.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 52 insertions(+), 16 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 8fd4a0708d30..cbbac5cc9c26 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -418,11 +418,13 @@ xfs_aio_write_isize_update(
  */
 STATIC void
 xfs_aio_write_newsize_update(
-	struct xfs_inode	*ip)
+	struct xfs_inode	*ip,
+	xfs_fsize_t		new_size)
 {
-	if (ip->i_new_size) {
+	if (new_size == ip->i_new_size) {
 		xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-		ip->i_new_size = 0;
+		if (new_size == ip->i_new_size)
+			ip->i_new_size = 0;
 		if (ip->i_d.di_size > ip->i_size)
 			ip->i_d.di_size = ip->i_size;
 		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
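The hunk above is the heart of the fix: i_new_size is now cleared with a double
check against the submitter's own new_size, so a completing write only clears
the value if no other in-flight write has raised it since. In isolation, the
pattern looks like the following sketch, with hypothetical stand-in types and a
pthread mutex in place of the XFS ilock:

#include <pthread.h>

typedef long long fsize_t;	/* stand-in for xfs_fsize_t */

struct inode_state {		/* stand-in for the relevant xfs_inode fields */
	pthread_mutex_t	ilock;
	fsize_t		i_new_size;
};

/*
 * Clear i_new_size only if it still holds the value this write set at
 * submission time; a racing writer that extended it further keeps it.
 */
static void newsize_update(struct inode_state *ip, fsize_t my_new_size)
{
	if (my_new_size != ip->i_new_size)	/* cheap unlocked check */
		return;

	pthread_mutex_lock(&ip->ilock);
	if (my_new_size == ip->i_new_size)	/* re-check under the lock */
		ip->i_new_size = 0;
	pthread_mutex_unlock(&ip->ilock);
}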
@@ -473,7 +475,7 @@ xfs_file_splice_write(
 	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
 
 	xfs_aio_write_isize_update(inode, ppos, ret);
-	xfs_aio_write_newsize_update(ip);
+	xfs_aio_write_newsize_update(ip, new_size);
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 	return ret;
 }
@@ -670,6 +672,7 @@ xfs_file_aio_write_checks(
 	struct file		*file,
 	loff_t			*pos,
 	size_t			*count,
+	xfs_fsize_t		*new_sizep,
 	int			*iolock)
 {
 	struct inode		*inode = file->f_mapping->host;
@@ -677,6 +680,8 @@ xfs_file_aio_write_checks(
 	xfs_fsize_t		new_size;
 	int			error = 0;
 
+	*new_sizep = 0;
+restart:
 	error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
 	if (error) {
 		xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
@@ -684,20 +689,41 @@ xfs_file_aio_write_checks(
 		return error;
 	}
 
-	new_size = *pos + *count;
-	if (new_size > ip->i_size)
-		ip->i_new_size = new_size;
-
 	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
 		file_update_time(file);
 
 	/*
 	 * If the offset is beyond the size of the file, we need to zero any
 	 * blocks that fall between the existing EOF and the start of this
-	 * write.
+	 * write. There is no need to issue zeroing if another in-flight IO
+	 * ends at or before this one. If zeroing is needed and we are
+	 * currently holding the iolock shared, we need to update it to
+	 * exclusive, which involves dropping all locks and relocking to
+	 * maintain correct locking order. If we do this, restart the function
+	 * to ensure all checks and values are still valid.
 	 */
-	if (*pos > ip->i_size)
+	if ((ip->i_new_size && *pos > ip->i_new_size) ||
+	    (!ip->i_new_size && *pos > ip->i_size)) {
+		if (*iolock == XFS_IOLOCK_SHARED) {
+			xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+			*iolock = XFS_IOLOCK_EXCL;
+			xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+			goto restart;
+		}
 		error = -xfs_zero_eof(ip, *pos, ip->i_size);
+	}
+
+	/*
+	 * If this IO extends beyond EOF, we may need to update ip->i_new_size.
+	 * We have already zeroed space beyond EOF (if necessary). Only update
+	 * ip->i_new_size if this IO ends beyond any other in-flight writes.
+	 */
+	new_size = *pos + *count;
+	if (new_size > ip->i_size) {
+		if (new_size > ip->i_new_size)
+			ip->i_new_size = new_size;
+		*new_sizep = new_size;
+	}
 
 	xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
 	if (error)
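The restart logic in this hunk is a standard shared-to-exclusive upgrade: the
shared iolock cannot be upgraded in place, so it is dropped, retaken exclusive,
and the function restarts so every check made under the old lock is
revalidated. A generic sketch of the same pattern, using a pthread rwlock and a
placeholder predicate (purely illustrative, not the XFS implementation):

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t iolock = PTHREAD_RWLOCK_INITIALIZER;

/* Placeholder for the "does this write need EOF zeroing?" test. */
static bool need_eof_zeroing(void)
{
	return false;
}

/* Called with iolock held shared (*exclusive == false) or exclusive. */
static void write_checks(bool *exclusive)
{
restart:
	if (need_eof_zeroing() && !*exclusive) {
		/*
		 * Drop the shared lock and retake it exclusive, then go back
		 * to the top: everything checked under the old lock may have
		 * changed while no lock was held.
		 */
		pthread_rwlock_unlock(&iolock);
		pthread_rwlock_wrlock(&iolock);
		*exclusive = true;
		goto restart;
	}
	/* ... remaining checks run under a sufficient lock ... */
}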
@@ -744,6 +770,7 @@ xfs_file_dio_aio_write(
 	unsigned long		nr_segs,
 	loff_t			pos,
 	size_t			ocount,
+	xfs_fsize_t		*new_size,
 	int			*iolock)
 {
 	struct file		*file = iocb->ki_filp;
@@ -764,13 +791,20 @@ xfs_file_dio_aio_write(
 	if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
 		unaligned_io = 1;
 
-	if (unaligned_io || mapping->nrpages || pos > ip->i_size)
+	/*
+	 * We don't need to take an exclusive lock unless the page cache needs
+	 * to be invalidated or unaligned IO is being executed. We don't need
+	 * to consider the EOF extension case here because
+	 * xfs_file_aio_write_checks() will relock the inode as necessary for
+	 * EOF zeroing cases and fill out the new inode size as appropriate.
+	 */
+	if (unaligned_io || mapping->nrpages)
 		*iolock = XFS_IOLOCK_EXCL;
 	else
 		*iolock = XFS_IOLOCK_SHARED;
 	xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
 
-	ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+	ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
 	if (ret)
 		return ret;
 
@@ -809,6 +843,7 @@ xfs_file_buffered_aio_write(
 	unsigned long		nr_segs,
 	loff_t			pos,
 	size_t			ocount,
+	xfs_fsize_t		*new_size,
 	int			*iolock)
 {
 	struct file		*file = iocb->ki_filp;
@@ -822,7 +857,7 @@ xfs_file_buffered_aio_write(
 	*iolock = XFS_IOLOCK_EXCL;
 	xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
 
-	ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+	ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock);
 	if (ret)
 		return ret;
 
@@ -862,6 +897,7 @@ xfs_file_aio_write(
 	ssize_t			ret;
 	int			iolock;
 	size_t			ocount = 0;
+	xfs_fsize_t		new_size = 0;
 
 	XFS_STATS_INC(xs_write_calls);
 
@@ -881,10 +917,10 @@ xfs_file_aio_write(
 
 	if (unlikely(file->f_flags & O_DIRECT))
 		ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
-						ocount, &iolock);
+						ocount, &new_size, &iolock);
 	else
 		ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
-						ocount, &iolock);
+						ocount, &new_size, &iolock);
 
 	xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
 
@@ -905,7 +941,7 @@ xfs_file_aio_write(
 	}
 
 out_unlock:
-	xfs_aio_write_newsize_update(ip);
+	xfs_aio_write_newsize_update(ip, new_size);
 	xfs_rw_iunlock(ip, iolock);
 	return ret;
 }