diff options
author | Dave Chinner <dchinner@redhat.com> | 2011-01-03 19:35:03 -0500 |
---|---|---|
committer | Dave Chinner <david@fromorbit.com> | 2011-01-03 19:35:03 -0500 |
commit | 055388a3188f56676c21e92962fc366ac8b5cb72 (patch) | |
tree | 57fb9d649956f311031f574e3f6d8f54146f0f2a /fs/xfs/xfs_iomap.c | |
parent | 622d81494fa32343a4b97b607619656c7a4a6d1a (diff) |
xfs: dynamic speculative EOF preallocation
Currently the size of the speculative preallocation during delayed
allocation is fixed by either the allocsize mount option of a
default size. We are seeing a lot of cases where we need to
recommend using the allocsize mount option to prevent fragmentation
when buffered writes land in the same AG.
Rather than using a fixed preallocation size by default (up to 64k),
make it dynamic by basing it on the current inode size. That way the
EOF preallocation will increase as the file size increases. Hence
for streaming writes we are much more likely to get large
preallocations exactly when we need it to reduce fragementation.
For default settings, the size of the initial extents is determined
by the number of parallel writers and the amount of memory in the
machine. For 4GB RAM and 4 concurrent 32GB file writes:
EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL
0: [0..1048575]: 1048672..2097247 0 (1048672..2097247) 1048576
1: [1048576..2097151]: 5242976..6291551 0 (5242976..6291551) 1048576
2: [2097152..4194303]: 12583008..14680159 0 (12583008..14680159) 2097152
3: [4194304..8388607]: 25165920..29360223 0 (25165920..29360223) 4194304
4: [8388608..16777215]: 58720352..67108959 0 (58720352..67108959) 8388608
5: [16777216..33554423]: 117440584..134217791 0 (117440584..134217791) 16777208
6: [33554424..50331511]: 184549056..201326143 0 (184549056..201326143) 16777088
7: [50331512..67108599]: 251657408..268434495 0 (251657408..268434495) 16777088
and for 16 concurrent 16GB file writes:
EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL
0: [0..262143]: 2490472..2752615 0 (2490472..2752615) 262144
1: [262144..524287]: 6291560..6553703 0 (6291560..6553703) 262144
2: [524288..1048575]: 13631592..14155879 0 (13631592..14155879) 524288
3: [1048576..2097151]: 30408808..31457383 0 (30408808..31457383) 1048576
4: [2097152..4194303]: 52428904..54526055 0 (52428904..54526055) 2097152
5: [4194304..8388607]: 104857704..109052007 0 (104857704..109052007) 4194304
6: [8388608..16777215]: 209715304..218103911 0 (209715304..218103911) 8388608
7: [16777216..33554423]: 452984848..469762055 0 (452984848..469762055) 16777208
Because it is hard to take back specualtive preallocation, cases
where there are large slow growing log files on a nearly full
filesystem may cause premature ENOSPC. Hence as the filesystem nears
full, the maximum dynamic prealloc size іs reduced according to this
table (based on 4k block size):
freespace max prealloc size
>5% full extent (8GB)
4-5% 2GB (8GB >> 2)
3-4% 1GB (8GB >> 3)
2-3% 512MB (8GB >> 4)
1-2% 256MB (8GB >> 5)
<1% 128MB (8GB >> 6)
This should reduce the amount of space held in speculative
preallocation for such cases.
The allocsize mount option turns off the dynamic behaviour and fixes
the prealloc size to whatever the mount option specifies. i.e. the
behaviour is unchanged.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Diffstat (limited to 'fs/xfs/xfs_iomap.c')
-rw-r--r-- | fs/xfs/xfs_iomap.c | 84 |
1 files changed, 74 insertions, 10 deletions
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 22b62a179e89..55582bd66659 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c | |||
@@ -267,6 +267,9 @@ error_out: | |||
267 | * If the caller is doing a write at the end of the file, then extend the | 267 | * If the caller is doing a write at the end of the file, then extend the |
268 | * allocation out to the file system's write iosize. We clean up any extra | 268 | * allocation out to the file system's write iosize. We clean up any extra |
269 | * space left over when the file is closed in xfs_inactive(). | 269 | * space left over when the file is closed in xfs_inactive(). |
270 | * | ||
271 | * If we find we already have delalloc preallocation beyond EOF, don't do more | ||
272 | * preallocation as it it not needed. | ||
270 | */ | 273 | */ |
271 | STATIC int | 274 | STATIC int |
272 | xfs_iomap_eof_want_preallocate( | 275 | xfs_iomap_eof_want_preallocate( |
@@ -282,6 +285,7 @@ xfs_iomap_eof_want_preallocate( | |||
282 | xfs_filblks_t count_fsb; | 285 | xfs_filblks_t count_fsb; |
283 | xfs_fsblock_t firstblock; | 286 | xfs_fsblock_t firstblock; |
284 | int n, error, imaps; | 287 | int n, error, imaps; |
288 | int found_delalloc = 0; | ||
285 | 289 | ||
286 | *prealloc = 0; | 290 | *prealloc = 0; |
287 | if ((offset + count) <= ip->i_size) | 291 | if ((offset + count) <= ip->i_size) |
@@ -306,12 +310,60 @@ xfs_iomap_eof_want_preallocate( | |||
306 | return 0; | 310 | return 0; |
307 | start_fsb += imap[n].br_blockcount; | 311 | start_fsb += imap[n].br_blockcount; |
308 | count_fsb -= imap[n].br_blockcount; | 312 | count_fsb -= imap[n].br_blockcount; |
313 | |||
314 | if (imap[n].br_startblock == DELAYSTARTBLOCK) | ||
315 | found_delalloc = 1; | ||
309 | } | 316 | } |
310 | } | 317 | } |
311 | *prealloc = 1; | 318 | if (!found_delalloc) |
319 | *prealloc = 1; | ||
312 | return 0; | 320 | return 0; |
313 | } | 321 | } |
314 | 322 | ||
323 | /* | ||
324 | * If we don't have a user specified preallocation size, dynamically increase | ||
325 | * the preallocation size as the size of the file grows. Cap the maximum size | ||
326 | * at a single extent or less if the filesystem is near full. The closer the | ||
327 | * filesystem is to full, the smaller the maximum prealocation. | ||
328 | */ | ||
329 | STATIC xfs_fsblock_t | ||
330 | xfs_iomap_prealloc_size( | ||
331 | struct xfs_mount *mp, | ||
332 | struct xfs_inode *ip) | ||
333 | { | ||
334 | xfs_fsblock_t alloc_blocks = 0; | ||
335 | |||
336 | if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { | ||
337 | int shift = 0; | ||
338 | int64_t freesp; | ||
339 | |||
340 | alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size); | ||
341 | alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, | ||
342 | rounddown_pow_of_two(alloc_blocks)); | ||
343 | |||
344 | xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); | ||
345 | freesp = mp->m_sb.sb_fdblocks; | ||
346 | if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { | ||
347 | shift = 2; | ||
348 | if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) | ||
349 | shift++; | ||
350 | if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) | ||
351 | shift++; | ||
352 | if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) | ||
353 | shift++; | ||
354 | if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) | ||
355 | shift++; | ||
356 | } | ||
357 | if (shift) | ||
358 | alloc_blocks >>= shift; | ||
359 | } | ||
360 | |||
361 | if (alloc_blocks < mp->m_writeio_blocks) | ||
362 | alloc_blocks = mp->m_writeio_blocks; | ||
363 | |||
364 | return alloc_blocks; | ||
365 | } | ||
366 | |||
315 | int | 367 | int |
316 | xfs_iomap_write_delay( | 368 | xfs_iomap_write_delay( |
317 | xfs_inode_t *ip, | 369 | xfs_inode_t *ip, |
@@ -344,6 +396,7 @@ xfs_iomap_write_delay( | |||
344 | extsz = xfs_get_extsz_hint(ip); | 396 | extsz = xfs_get_extsz_hint(ip); |
345 | offset_fsb = XFS_B_TO_FSBT(mp, offset); | 397 | offset_fsb = XFS_B_TO_FSBT(mp, offset); |
346 | 398 | ||
399 | |||
347 | error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, | 400 | error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, |
348 | imap, XFS_WRITE_IMAPS, &prealloc); | 401 | imap, XFS_WRITE_IMAPS, &prealloc); |
349 | if (error) | 402 | if (error) |
@@ -351,9 +404,11 @@ xfs_iomap_write_delay( | |||
351 | 404 | ||
352 | retry: | 405 | retry: |
353 | if (prealloc) { | 406 | if (prealloc) { |
407 | xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip); | ||
408 | |||
354 | aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); | 409 | aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); |
355 | ioalign = XFS_B_TO_FSBT(mp, aligned_offset); | 410 | ioalign = XFS_B_TO_FSBT(mp, aligned_offset); |
356 | last_fsb = ioalign + mp->m_writeio_blocks; | 411 | last_fsb = ioalign + alloc_blocks; |
357 | } else { | 412 | } else { |
358 | last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); | 413 | last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); |
359 | } | 414 | } |
@@ -371,22 +426,31 @@ retry: | |||
371 | XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | | 426 | XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | |
372 | XFS_BMAPI_ENTIRE, &firstblock, 1, imap, | 427 | XFS_BMAPI_ENTIRE, &firstblock, 1, imap, |
373 | &nimaps, NULL); | 428 | &nimaps, NULL); |
374 | if (error && (error != ENOSPC)) | 429 | switch (error) { |
430 | case 0: | ||
431 | case ENOSPC: | ||
432 | case EDQUOT: | ||
433 | break; | ||
434 | default: | ||
375 | return XFS_ERROR(error); | 435 | return XFS_ERROR(error); |
436 | } | ||
376 | 437 | ||
377 | /* | 438 | /* |
378 | * If bmapi returned us nothing, and if we didn't get back EDQUOT, | 439 | * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For |
379 | * then we must have run out of space - flush all other inodes with | 440 | * ENOSPC, * flush all other inodes with delalloc blocks to free up |
380 | * delalloc blocks and retry without EOF preallocation. | 441 | * some of the excess reserved metadata space. For both cases, retry |
442 | * without EOF preallocation. | ||
381 | */ | 443 | */ |
382 | if (nimaps == 0) { | 444 | if (nimaps == 0) { |
383 | trace_xfs_delalloc_enospc(ip, offset, count); | 445 | trace_xfs_delalloc_enospc(ip, offset, count); |
384 | if (flushed) | 446 | if (flushed) |
385 | return XFS_ERROR(ENOSPC); | 447 | return XFS_ERROR(error ? error : ENOSPC); |
386 | 448 | ||
387 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 449 | if (error == ENOSPC) { |
388 | xfs_flush_inodes(ip); | 450 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
389 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 451 | xfs_flush_inodes(ip); |
452 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
453 | } | ||
390 | 454 | ||
391 | flushed = 1; | 455 | flushed = 1; |
392 | error = 0; | 456 | error = 0; |