aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2011-01-03 19:35:03 -0500
committerDave Chinner <david@fromorbit.com>2011-01-03 19:35:03 -0500
commit055388a3188f56676c21e92962fc366ac8b5cb72 (patch)
tree57fb9d649956f311031f574e3f6d8f54146f0f2a /fs
parent622d81494fa32343a4b97b607619656c7a4a6d1a (diff)
xfs: dynamic speculative EOF preallocation
Currently the size of the speculative preallocation during delayed allocation is fixed by either the allocsize mount option or a default size. We are seeing a lot of cases where we need to recommend using the allocsize mount option to prevent fragmentation when buffered writes land in the same AG.

Rather than using a fixed preallocation size by default (up to 64k), make it dynamic by basing it on the current inode size. That way the EOF preallocation will increase as the file size increases. Hence for streaming writes we are much more likely to get large preallocations exactly when we need them to reduce fragmentation.

For default settings, the size of the initial extents is determined by the number of parallel writers and the amount of memory in the machine. For 4GB RAM and 4 concurrent 32GB file writes:

EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL
0: [0..1048575]: 1048672..2097247 0 (1048672..2097247) 1048576
1: [1048576..2097151]: 5242976..6291551 0 (5242976..6291551) 1048576
2: [2097152..4194303]: 12583008..14680159 0 (12583008..14680159) 2097152
3: [4194304..8388607]: 25165920..29360223 0 (25165920..29360223) 4194304
4: [8388608..16777215]: 58720352..67108959 0 (58720352..67108959) 8388608
5: [16777216..33554423]: 117440584..134217791 0 (117440584..134217791) 16777208
6: [33554424..50331511]: 184549056..201326143 0 (184549056..201326143) 16777088
7: [50331512..67108599]: 251657408..268434495 0 (251657408..268434495) 16777088

and for 16 concurrent 16GB file writes:

EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL
0: [0..262143]: 2490472..2752615 0 (2490472..2752615) 262144
1: [262144..524287]: 6291560..6553703 0 (6291560..6553703) 262144
2: [524288..1048575]: 13631592..14155879 0 (13631592..14155879) 524288
3: [1048576..2097151]: 30408808..31457383 0 (30408808..31457383) 1048576
4: [2097152..4194303]: 52428904..54526055 0 (52428904..54526055) 2097152
5: [4194304..8388607]: 104857704..109052007 0 (104857704..109052007) 4194304
6: [8388608..16777215]: 209715304..218103911 0 (209715304..218103911) 8388608
7: [16777216..33554423]: 452984848..469762055 0 (452984848..469762055) 16777208

Because it is hard to take back speculative preallocation, cases where there are large slow growing log files on a nearly full filesystem may cause premature ENOSPC. Hence as the filesystem nears full, the maximum dynamic prealloc size is reduced according to this table (based on 4k block size):

freespace    max prealloc size
>5%          full extent (8GB)
4-5%         2GB (8GB >> 2)
3-4%         1GB (8GB >> 3)
2-3%         512MB (8GB >> 4)
1-2%         256MB (8GB >> 5)
<1%          128MB (8GB >> 6)

This should reduce the amount of space held in speculative preallocation for such cases.

The allocsize mount option turns off the dynamic behaviour and fixes the prealloc size to whatever the mount option specifies. i.e. the behaviour is unchanged.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/xfs/xfs_fsops.c1
-rw-r--r--fs/xfs/xfs_iomap.c84
-rw-r--r--fs/xfs/xfs_mount.c21
-rw-r--r--fs/xfs/xfs_mount.h14
4 files changed, 110 insertions, 10 deletions
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a7c116e814af..f56d30e8040c 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -374,6 +374,7 @@ xfs_growfs_data_private(
374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog; 374 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
375 } else 375 } else
376 mp->m_maxicount = 0; 376 mp->m_maxicount = 0;
377 xfs_set_low_space_thresholds(mp);
377 378
378 /* update secondary superblocks. */ 379 /* update secondary superblocks. */
379 for (agno = 1; agno < nagcount; agno++) { 380 for (agno = 1; agno < nagcount; agno++) {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 22b62a179e89..55582bd66659 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -267,6 +267,9 @@ error_out:
267 * If the caller is doing a write at the end of the file, then extend the 267 * If the caller is doing a write at the end of the file, then extend the
268 * allocation out to the file system's write iosize. We clean up any extra 268 * allocation out to the file system's write iosize. We clean up any extra
269 * space left over when the file is closed in xfs_inactive(). 269 * space left over when the file is closed in xfs_inactive().
270 *
271 * If we find we already have delalloc preallocation beyond EOF, don't do more
272 * preallocation as it is not needed.
270 */ 273 */
271STATIC int 274STATIC int
272xfs_iomap_eof_want_preallocate( 275xfs_iomap_eof_want_preallocate(
@@ -282,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
282 xfs_filblks_t count_fsb; 285 xfs_filblks_t count_fsb;
283 xfs_fsblock_t firstblock; 286 xfs_fsblock_t firstblock;
284 int n, error, imaps; 287 int n, error, imaps;
288 int found_delalloc = 0;
285 289
286 *prealloc = 0; 290 *prealloc = 0;
287 if ((offset + count) <= ip->i_size) 291 if ((offset + count) <= ip->i_size)
@@ -306,12 +310,60 @@ xfs_iomap_eof_want_preallocate(
306 return 0; 310 return 0;
307 start_fsb += imap[n].br_blockcount; 311 start_fsb += imap[n].br_blockcount;
308 count_fsb -= imap[n].br_blockcount; 312 count_fsb -= imap[n].br_blockcount;
313
314 if (imap[n].br_startblock == DELAYSTARTBLOCK)
315 found_delalloc = 1;
309 } 316 }
310 } 317 }
311 *prealloc = 1; 318 if (!found_delalloc)
319 *prealloc = 1;
312 return 0; 320 return 0;
313} 321}
314 322
323/*
324 * If we don't have a user specified preallocation size, dynamically increase
325 * the preallocation size as the size of the file grows. Cap the maximum size
326 * at a single extent or less if the filesystem is near full. The closer the
327 * filesystem is to full, the smaller the maximum preallocation.
328 */
329STATIC xfs_fsblock_t
330xfs_iomap_prealloc_size(
331 struct xfs_mount *mp,
332 struct xfs_inode *ip)
333{
334 xfs_fsblock_t alloc_blocks = 0;
335
336 if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
337 int shift = 0;
338 int64_t freesp;
339
340 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);
341 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
342 rounddown_pow_of_two(alloc_blocks));
343
344 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
345 freesp = mp->m_sb.sb_fdblocks;
346 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
347 shift = 2;
348 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
349 shift++;
350 if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
351 shift++;
352 if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
353 shift++;
354 if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
355 shift++;
356 }
357 if (shift)
358 alloc_blocks >>= shift;
359 }
360
361 if (alloc_blocks < mp->m_writeio_blocks)
362 alloc_blocks = mp->m_writeio_blocks;
363
364 return alloc_blocks;
365}
366
315int 367int
316xfs_iomap_write_delay( 368xfs_iomap_write_delay(
317 xfs_inode_t *ip, 369 xfs_inode_t *ip,
@@ -344,6 +396,7 @@ xfs_iomap_write_delay(
344 extsz = xfs_get_extsz_hint(ip); 396 extsz = xfs_get_extsz_hint(ip);
345 offset_fsb = XFS_B_TO_FSBT(mp, offset); 397 offset_fsb = XFS_B_TO_FSBT(mp, offset);
346 398
399
347 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, 400 error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
348 imap, XFS_WRITE_IMAPS, &prealloc); 401 imap, XFS_WRITE_IMAPS, &prealloc);
349 if (error) 402 if (error)
@@ -351,9 +404,11 @@ xfs_iomap_write_delay(
351 404
352retry: 405retry:
353 if (prealloc) { 406 if (prealloc) {
407 xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
408
354 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); 409 aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
355 ioalign = XFS_B_TO_FSBT(mp, aligned_offset); 410 ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
356 last_fsb = ioalign + mp->m_writeio_blocks; 411 last_fsb = ioalign + alloc_blocks;
357 } else { 412 } else {
358 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 413 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
359 } 414 }
@@ -371,22 +426,31 @@ retry:
371 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | 426 XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
372 XFS_BMAPI_ENTIRE, &firstblock, 1, imap, 427 XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
373 &nimaps, NULL); 428 &nimaps, NULL);
374 if (error && (error != ENOSPC)) 429 switch (error) {
430 case 0:
431 case ENOSPC:
432 case EDQUOT:
433 break;
434 default:
375 return XFS_ERROR(error); 435 return XFS_ERROR(error);
436 }
376 437
377 /* 438 /*
378 * If bmapi returned us nothing, and if we didn't get back EDQUOT, 439 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
379 * then we must have run out of space - flush all other inodes with 440 * ENOSPC, flush all other inodes with delalloc blocks to free up
380 * delalloc blocks and retry without EOF preallocation. 441 * some of the excess reserved metadata space. For both cases, retry
442 * without EOF preallocation.
381 */ 443 */
382 if (nimaps == 0) { 444 if (nimaps == 0) {
383 trace_xfs_delalloc_enospc(ip, offset, count); 445 trace_xfs_delalloc_enospc(ip, offset, count);
384 if (flushed) 446 if (flushed)
385 return XFS_ERROR(ENOSPC); 447 return XFS_ERROR(error ? error : ENOSPC);
386 448
387 xfs_iunlock(ip, XFS_ILOCK_EXCL); 449 if (error == ENOSPC) {
388 xfs_flush_inodes(ip); 450 xfs_iunlock(ip, XFS_ILOCK_EXCL);
389 xfs_ilock(ip, XFS_ILOCK_EXCL); 451 xfs_flush_inodes(ip);
452 xfs_ilock(ip, XFS_ILOCK_EXCL);
453 }
390 454
391 flushed = 1; 455 flushed = 1;
392 error = 0; 456 error = 0;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 19e9dfa1c254..40579fdf0d0a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -975,6 +975,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
975} 975}
976 976
977/* 977/*
978 * precalculate the low space thresholds for dynamic speculative preallocation.
979 */
980void
981xfs_set_low_space_thresholds(
982 struct xfs_mount *mp)
983{
984 int i;
985
986 for (i = 0; i < XFS_LOWSP_MAX; i++) {
987 __uint64_t space = mp->m_sb.sb_dblocks;
988
989 do_div(space, 100);
990 mp->m_low_space[i] = space * (i + 1);
991 }
992}
993
994
995/*
978 * Set whether we're using inode alignment. 996 * Set whether we're using inode alignment.
979 */ 997 */
980STATIC void 998STATIC void
@@ -1196,6 +1214,9 @@ xfs_mountfs(
1196 */ 1214 */
1197 xfs_set_rw_sizes(mp); 1215 xfs_set_rw_sizes(mp);
1198 1216
1217 /* set the low space thresholds for dynamic preallocation */
1218 xfs_set_low_space_thresholds(mp);
1219
1199 /* 1220 /*
1200 * Set the inode cluster size. 1221 * Set the inode cluster size.
1201 * This may still be overridden by the file system 1222 * This may still be overridden by the file system
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 5861b4980740..a62e8971539d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
103 xfs_mod_incore_sb(mp, field, delta, rsvd) 103 xfs_mod_incore_sb(mp, field, delta, rsvd)
104#endif 104#endif
105 105
106/* dynamic preallocation free space thresholds, 5% down to 1% */
107enum {
108 XFS_LOWSP_1_PCNT = 0,
109 XFS_LOWSP_2_PCNT,
110 XFS_LOWSP_3_PCNT,
111 XFS_LOWSP_4_PCNT,
112 XFS_LOWSP_5_PCNT,
113 XFS_LOWSP_MAX,
114};
115
106typedef struct xfs_mount { 116typedef struct xfs_mount {
107 struct super_block *m_super; 117 struct super_block *m_super;
108 xfs_tid_t m_tid; /* next unused tid for fs */ 118 xfs_tid_t m_tid; /* next unused tid for fs */
@@ -202,6 +212,8 @@ typedef struct xfs_mount {
202 __int64_t m_update_flags; /* sb flags we need to update 212 __int64_t m_update_flags; /* sb flags we need to update
203 on the next remount,rw */ 213 on the next remount,rw */
204 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 214 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
215 int64_t m_low_space[XFS_LOWSP_MAX];
216 /* low free space thresholds */
205} xfs_mount_t; 217} xfs_mount_t;
206 218
207/* 219/*
@@ -379,6 +391,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
379 391
380extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 392extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
381 393
394extern void xfs_set_low_space_thresholds(struct xfs_mount *);
395
382#endif /* __KERNEL__ */ 396#endif /* __KERNEL__ */
383 397
384extern void xfs_mod_sb(struct xfs_trans *, __int64_t); 398extern void xfs_mod_sb(struct xfs_trans *, __int64_t);