diff options
author | Brian Foster <bfoster@redhat.com> | 2015-10-12 00:34:20 -0400 |
---|---|---|
committer | Dave Chinner <david@fromorbit.com> | 2015-10-12 00:34:20 -0400 |
commit | 009c6e871e98aa23bc2e58474c3d9feb05dd1ae6 (patch) | |
tree | 5af6b1c62320e3643bfbc895f9ee9fbe93103ba4 | |
parent | 5cb13dcd0fac071b45c4bebe1801a08ff0d89cad (diff) |
xfs: add missing ilock around dio write last extent alignment
The iomap codepath (via get_blocks()) acquires and release the inode
lock in the case of a direct write that requires block allocation. This
is because xfs_iomap_write_direct() allocates a transaction, which means
the ilock must be dropped and reacquired after the transaction is
allocated and reserved.
xfs_iomap_write_direct() invokes xfs_iomap_eof_align_last_fsb() before
the transaction is created and thus before the ilock is reacquired. This
can lead to calls to xfs_iread_extents() and reads of the in-core extent
list without any synchronization (via xfs_bmap_eof() and
xfs_bmap_last_extent()). xfs_iread_extents() assert fails if the ilock
is not held, but this is not currently seen in practice as the current
callers had already invoked xfs_bmapi_read().
What has been seen in practice are reports of crashes down in the
xfs_bmap_eof() codepath on direct writes due to seemingly bogus pointer
references from xfs_iext_get_ext(). While an explicit reproducer is not
currently available to confirm the cause of the problem, crash analysis
and code inspection from David Jeffrey had identified the insufficient
locking.
xfs_iomap_eof_align_last_fsb() is called from other contexts with the
inode lock already held, so we cannot acquire it therein.
__xfs_get_blocks() acquires and drops the ilock with variable flags to
cover the event that the extent list must be read in. The common case is
that __xfs_get_blocks() acquires the shared ilock. To provide locking
around the last extent alignment call without adding more lock cycles to
the dio path, update xfs_iomap_write_direct() to expect the shared ilock
held on entry and do the extent alignment under its protection. Demote
the lock, if necessary, from __xfs_get_blocks() and push the
xfs_qm_dqattach() call outside of the shared lock critical section.
Also, add an assert to document that the extent list is always expected
to be present in this path. Otherwise, we risk a call to
xfs_iread_extents() while under the shared ilock. This is safe as all
current callers have executed an xfs_bmapi_read() call under the current
iolock context.
Reported-by: David Jeffery <djeffery@redhat.com>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
-rw-r--r-- | fs/xfs/xfs_aops.c | 10 | ||||
-rw-r--r-- | fs/xfs/xfs_iomap.c | 33 | ||||
-rw-r--r-- | fs/xfs/xfs_pnfs.c | 5 |
3 files changed, 36 insertions, 12 deletions
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e485e31813fa..e4fff5898c1c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c | |||
@@ -1408,12 +1408,12 @@ __xfs_get_blocks( | |||
1408 | imap.br_startblock == DELAYSTARTBLOCK))) { | 1408 | imap.br_startblock == DELAYSTARTBLOCK))) { |
1409 | if (direct || xfs_get_extsz_hint(ip)) { | 1409 | if (direct || xfs_get_extsz_hint(ip)) { |
1410 | /* | 1410 | /* |
1411 | * Drop the ilock in preparation for starting the block | 1411 | * xfs_iomap_write_direct() expects the shared lock. It |
1412 | * allocation transaction. It will be retaken | 1412 | * is unlocked on return. |
1413 | * exclusively inside xfs_iomap_write_direct for the | ||
1414 | * actual allocation. | ||
1415 | */ | 1413 | */ |
1416 | xfs_iunlock(ip, lockmode); | 1414 | if (lockmode == XFS_ILOCK_EXCL) |
1415 | xfs_ilock_demote(ip, lockmode); | ||
1416 | |||
1417 | error = xfs_iomap_write_direct(ip, offset, size, | 1417 | error = xfs_iomap_write_direct(ip, offset, size, |
1418 | &imap, nimaps); | 1418 | &imap, nimaps); |
1419 | if (error) | 1419 | if (error) |
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1f86033171c8..1beda331d8a7 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c | |||
@@ -131,20 +131,29 @@ xfs_iomap_write_direct( | |||
131 | uint qblocks, resblks, resrtextents; | 131 | uint qblocks, resblks, resrtextents; |
132 | int committed; | 132 | int committed; |
133 | int error; | 133 | int error; |
134 | 134 | int lockmode; | |
135 | error = xfs_qm_dqattach(ip, 0); | ||
136 | if (error) | ||
137 | return error; | ||
138 | 135 | ||
139 | rt = XFS_IS_REALTIME_INODE(ip); | 136 | rt = XFS_IS_REALTIME_INODE(ip); |
140 | extsz = xfs_get_extsz_hint(ip); | 137 | extsz = xfs_get_extsz_hint(ip); |
138 | lockmode = XFS_ILOCK_SHARED; /* locked by caller */ | ||
139 | |||
140 | ASSERT(xfs_isilocked(ip, lockmode)); | ||
141 | 141 | ||
142 | offset_fsb = XFS_B_TO_FSBT(mp, offset); | 142 | offset_fsb = XFS_B_TO_FSBT(mp, offset); |
143 | last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); | 143 | last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); |
144 | if ((offset + count) > XFS_ISIZE(ip)) { | 144 | if ((offset + count) > XFS_ISIZE(ip)) { |
145 | /* | ||
146 | * Assert that the in-core extent list is present since this can | ||
147 | * call xfs_iread_extents() and we only have the ilock shared. | ||
148 | * This should be safe because the lock was held around a bmapi | ||
149 | * call in the caller and we only need it to access the in-core | ||
150 | * list. | ||
151 | */ | ||
152 | ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags & | ||
153 | XFS_IFEXTENTS); | ||
145 | error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); | 154 | error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); |
146 | if (error) | 155 | if (error) |
147 | return error; | 156 | goto out_unlock; |
148 | } else { | 157 | } else { |
149 | if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) | 158 | if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) |
150 | last_fsb = MIN(last_fsb, (xfs_fileoff_t) | 159 | last_fsb = MIN(last_fsb, (xfs_fileoff_t) |
@@ -174,6 +183,15 @@ xfs_iomap_write_direct( | |||
174 | } | 183 | } |
175 | 184 | ||
176 | /* | 185 | /* |
186 | * Drop the shared lock acquired by the caller, attach the dquot if | ||
187 | * necessary and move on to transaction setup. | ||
188 | */ | ||
189 | xfs_iunlock(ip, lockmode); | ||
190 | error = xfs_qm_dqattach(ip, 0); | ||
191 | if (error) | ||
192 | return error; | ||
193 | |||
194 | /* | ||
177 | * Allocate and setup the transaction | 195 | * Allocate and setup the transaction |
178 | */ | 196 | */ |
179 | tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); | 197 | tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); |
@@ -187,7 +205,8 @@ xfs_iomap_write_direct( | |||
187 | return error; | 205 | return error; |
188 | } | 206 | } |
189 | 207 | ||
190 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 208 | lockmode = XFS_ILOCK_EXCL; |
209 | xfs_ilock(ip, lockmode); | ||
191 | 210 | ||
192 | error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); | 211 | error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); |
193 | if (error) | 212 | if (error) |
@@ -229,7 +248,7 @@ xfs_iomap_write_direct( | |||
229 | error = xfs_alert_fsblock_zero(ip, imap); | 248 | error = xfs_alert_fsblock_zero(ip, imap); |
230 | 249 | ||
231 | out_unlock: | 250 | out_unlock: |
232 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 251 | xfs_iunlock(ip, lockmode); |
233 | return error; | 252 | return error; |
234 | 253 | ||
235 | out_bmap_cancel: | 254 | out_bmap_cancel: |
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index ab4a6066f7ca..dc6221942b85 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c | |||
@@ -181,6 +181,11 @@ xfs_fs_map_blocks( | |||
181 | ASSERT(imap.br_startblock != DELAYSTARTBLOCK); | 181 | ASSERT(imap.br_startblock != DELAYSTARTBLOCK); |
182 | 182 | ||
183 | if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) { | 183 | if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) { |
184 | /* | ||
185 | * xfs_iomap_write_direct() expects to take ownership of | ||
186 | * the shared ilock. | ||
187 | */ | ||
188 | xfs_ilock(ip, XFS_ILOCK_SHARED); | ||
184 | error = xfs_iomap_write_direct(ip, offset, length, | 189 | error = xfs_iomap_write_direct(ip, offset, length, |
185 | &imap, nimaps); | 190 | &imap, nimaps); |
186 | if (error) | 191 | if (error) |