xfs: limit speculative prealloc size on sparse files

Speculative preallocation based on the current file size works well
for contiguous files, but is sub-optimal for sparse files where the
EOF preallocation can fill holes and result in large amounts of
zeros being written when it is not necessary.

The algorithm is modified to prevent EOF speculative preallocation
from triggering larger allocations on IO patterns of
truncate-to-zero-seek-write-seek-write-..., which result in
non-sparse files for large files. This, unfortunately, is the way cp
now behaves when copying sparse files and so needs to be fixed.

What this code does is look at the existing extent adjacent to the
current EOF, and if it determines that it is a hole, we disable
speculative preallocation altogether. To prevent the next write from
doing a large prealloc, it takes the size of subsequent
preallocations from the current size of the existing EOF extent.
IOWs, if you leave a hole in the file, it resets preallocation
behaviour to the same as if it was a zero size file.

Example new behaviour:

$ xfs_io -f -c "pwrite 0 31m" \
         -c "pwrite 33m 1m" \
         -c "pwrite 128m 1m" \
         -c "fiemap -v" /mnt/scratch/blah
wrote 32505856/32505856 bytes at offset 0
31 MiB, 7936 ops; 0.0000 sec (1.608 GiB/sec and 421432.7439 ops/sec)
wrote 1048576/1048576 bytes at offset 34603008
1 MiB, 256 ops; 0.0000 sec (1.462 GiB/sec and 383233.5329 ops/sec)
wrote 1048576/1048576 bytes at offset 134217728
1 MiB, 256 ops; 0.0000 sec (1.719 GiB/sec and 450704.2254 ops/sec)
/mnt/scratch/blah:
 EXT: FILE-OFFSET        BLOCK-RANGE      TOTAL FLAGS
   0: [0..65535]:        96..65631        65536   0x0
   1: [65536..67583]:    hole              2048
   2: [67584..69631]:    67680..69727      2048   0x0
   3: [69632..262143]:   hole            192512
   4: [262144..264191]:  262240..264287    2048   0x1

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
author: Dave Chinner <dchinner@redhat.com> 2013-02-11 00:05:01 -0500
committer: Ben Myers <bpm@sgi.com> 2013-02-14 18:21:32 -0500
commit: a1e16c26660b301cc8423185924cf1b0b16ea92b (patch)
tree: b0dfc8be0dd7a91d32b3280517ca64ca3f3c5f79 /fs/xfs
parent: 311f08acde635e4e5ccea9b9d8c856cc2e0ced95 (diff)
1 files changed, 67 insertions, 10 deletions
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 364818eef40e..912d83d8860a 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -311,6 +311,62 @@ xfs_iomap_eof_want_preallocate(
 }
 /*
+ * Determine the initial size of the preallocation. We are beyond the current
+ * EOF here, but we need to take into account whether this is a sparse write or
+ * an extending write when determining the preallocation size.  Hence we need to
+ * look up the extent that ends at the current write offset and use the result
+ * to determine the preallocation size.
+ *
+ * If the extent is a hole, then preallocation is essentially disabled.
+ * Otherwise we take the size of the preceding data extent as the basis for the
+ * preallocation size. If the size of the extent is greater than half the
+ * maximum extent length, then use the current offset as the basis. This ensures
+ * that for large files the preallocation size always extends to MAXEXTLEN
+ * rather than falling short due to things like stripe unit/width alignment of
+ * real extents.
+ */
+/*
+ * Returns the basis for the preallocation size in filesystem blocks, or 0
+ * when no dynamic preallocation basis applies (fixed prealloc size mount
+ * option, extent lookup failure, or the extent before @offset is a hole).
+ *
+ * The return type is xfs_fsblock_t rather than int: the large-extent path
+ * returns XFS_B_TO_FSB(mp, offset), which is derived from a 64-bit file
+ * offset and would be truncated by a plain int.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_eof_prealloc_initial_size(
+        struct xfs_mount        *mp,
+        struct xfs_inode        *ip,
+        xfs_off_t               offset,
+        xfs_bmbt_irec_t         *imap,
+        int                     nimaps)
+{
+        xfs_fileoff_t   start_fsb;
+        int             imaps = 1;
+        int             error;
+
+        /* the caller's map array must have room for the single lookup */
+        ASSERT(nimaps >= imaps);
+
+        /* if we are using a specific prealloc size, return now */
+        if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+                return 0;
+
+        /*
+         * As we write multiple pages, the offset will always align to the
+         * start of a page and hence point to a hole at EOF. i.e. if the size is
+         * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096)
+         * will return FSB 1. Hence if there are blocks in the file, we want to
+         * point to the block prior to the EOF block and not the hole that maps
+         * directly at @offset.
+         */
+        start_fsb = XFS_B_TO_FSB(mp, offset);
+        if (start_fsb)
+                start_fsb--;
+
+        /*
+         * A failed lookup is deliberately not propagated: it is reported as
+         * "no basis" (0) so the caller simply skips dynamic preallocation.
+         */
+        error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE);
+        if (error)
+                return 0;
+
+        ASSERT(imaps == 1);
+
+        /* sparse write: the block before @offset is a hole */
+        if (imap[0].br_startblock == HOLESTARTBLOCK)
+                return 0;
+
+        /* small preceding extent: use its length as the basis */
+        if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
+                return imap[0].br_blockcount;
+
+        /* large preceding extent: use the current offset as the basis */
+        return XFS_B_TO_FSB(mp, offset);
+}
+/*
 * If we don't have a user specified preallocation size, dynamically increase
 * the preallocation size as the size of the file grows. Cap the maximum size
 * at a single extent or less if the filesystem is near full. The closer the
@@ -319,20 +375,19 @@ xfs_iomap_eof_want_preallocate(
 STATIC xfs_fsblock_t
 xfs_iomap_prealloc_size(
        struct xfs_mount        *mp,
-        struct xfs_inode        *ip)
+        struct xfs_inode        *ip,
+        xfs_off_t               offset,
+        struct xfs_bmbt_irec    *imap,
+        int                     nimaps)
 {
        xfs_fsblock_t           alloc_blocks = 0;
-        if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+        alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset,
+                                                           imap, nimaps);
+        if (alloc_blocks > 0) {
                int shift = 0;
                int64_t freesp;
-                /*
-                 * rounddown_pow_of_two() returns an undefined result
-                 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
-                 * ensure we always pass in a non-zero value.
-                 */
-                alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
                alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
                                        rounddown_pow_of_two(alloc_blocks));
@@ -399,7 +454,6 @@ xfs_iomap_write_delay(
        extsz = xfs_get_extsz_hint(ip);
        offset_fsb = XFS_B_TO_FSBT(mp, offset);
        error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
                                imap, XFS_WRITE_IMAPS, &prealloc);
        if (error)
@@ -407,7 +461,10 @@ xfs_iomap_write_delay(
 retry:
        if (prealloc) {
-                xfs_fsblock_t   alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
+                xfs_fsblock_t   alloc_blocks;
+                alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap,
+                                                       XFS_WRITE_IMAPS);
                aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
                ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
author	Dave Chinner <dchinner@redhat.com>	2013-02-11 00:05:01 -0500
committer	Ben Myers <bpm@sgi.com>	2013-02-14 18:21:32 -0500
commit	a1e16c26660b301cc8423185924cf1b0b16ea92b (patch)
tree	b0dfc8be0dd7a91d32b3280517ca64ca3f3c5f79 /fs/xfs
parent	311f08acde635e4e5ccea9b9d8c856cc2e0ced95 (diff)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 364818eef40e..912d83d8860a 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c
@@ -311,6 +311,62 @@ xfs_iomap_eof_want_preallocate(
311	}	311	}
312		312
313	/*	313	/*
		314	* Determine the initial size of the preallocation. We are beyond the current
		315	* EOF here, but we need to take into account whether this is a sparse write or
		316	* an extending write when determining the preallocation size. Hence we need to
		317	* look up the extent that ends at the current write offset and use the result
		318	* to determine the preallocation size.
		319	*
		320	* If the extent is a hole, then preallocation is essentially disabled.
		321	* Otherwise we take the size of the preceding data extent as the basis for the
		322	* preallocation size. If the size of the extent is greater than half the
		323	* maximum extent length, then use the current offset as the basis. This ensures
		324	* that for large files the preallocation size always extends to MAXEXTLEN
		325	* rather than falling short due to things like stripe unit/width alignment of
		326	* real extents.
		327	*/
		328	STATIC int
		329	xfs_iomap_eof_prealloc_initial_size(
		330	struct xfs_mount *mp,
		331	struct xfs_inode *ip,
		332	xfs_off_t offset,
		333	xfs_bmbt_irec_t *imap,
		334	int nimaps)
		335	{
		336	xfs_fileoff_t start_fsb;
		337	int imaps = 1;
		338	int error;
		339
		340	ASSERT(nimaps >= imaps);
		341
		342	/* if we are using a specific prealloc size, return now */
		343	if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
		344	return 0;
		345
		346	/*
		347	* As we write multiple pages, the offset will always align to the
		348	* start of a page and hence point to a hole at EOF. i.e. if the size is
		349	* 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096)
		350	* will return FSB 1. Hence if there are blocks in the file, we want to
		351	* point to the block prior to the EOF block and not the hole that maps
		352	* directly at @offset.
		353	*/
		354	start_fsb = XFS_B_TO_FSB(mp, offset);
		355	if (start_fsb)
		356	start_fsb--;
		357	error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE);
		358	if (error)
		359	return 0;
		360
		361	ASSERT(imaps == 1);
		362	if (imap[0].br_startblock == HOLESTARTBLOCK)
		363	return 0;
		364	if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
		365	return imap[0].br_blockcount;
		366	return XFS_B_TO_FSB(mp, offset);
		367	}
		368
		369	/*
314	* If we don't have a user specified preallocation size, dynamically increase	370	* If we don't have a user specified preallocation size, dynamically increase
315	* the preallocation size as the size of the file grows. Cap the maximum size	371	* the preallocation size as the size of the file grows. Cap the maximum size
316	* at a single extent or less if the filesystem is near full. The closer the	372	* at a single extent or less if the filesystem is near full. The closer the
@@ -319,20 +375,19 @@ xfs_iomap_eof_want_preallocate(
319	STATIC xfs_fsblock_t	375	STATIC xfs_fsblock_t
320	xfs_iomap_prealloc_size(	376	xfs_iomap_prealloc_size(
321	struct xfs_mount *mp,	377	struct xfs_mount *mp,
322	struct xfs_inode *ip)	378	struct xfs_inode *ip,
		379	xfs_off_t offset,
		380	struct xfs_bmbt_irec *imap,
		381	int nimaps)
323	{	382	{
324	xfs_fsblock_t alloc_blocks = 0;	383	xfs_fsblock_t alloc_blocks = 0;
325		384
326	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {	385	alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset,
		386	imap, nimaps);
		387	if (alloc_blocks > 0) {
327	int shift = 0;	388	int shift = 0;
328	int64_t freesp;	389	int64_t freesp;
329		390
330	/*
331	* rounddown_pow_of_two() returns an undefined result
332	* if we pass in alloc_blocks = 0. Hence the "+ 1" to
333	* ensure we always pass in a non-zero value.
334	*/
335	alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
336	alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,	391	alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
337	rounddown_pow_of_two(alloc_blocks));	392	rounddown_pow_of_two(alloc_blocks));
338		393
@@ -399,7 +454,6 @@ xfs_iomap_write_delay(
399	extsz = xfs_get_extsz_hint(ip);	454	extsz = xfs_get_extsz_hint(ip);
400	offset_fsb = XFS_B_TO_FSBT(mp, offset);	455	offset_fsb = XFS_B_TO_FSBT(mp, offset);
401		456
402
403	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,	457	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
404	imap, XFS_WRITE_IMAPS, &prealloc);	458	imap, XFS_WRITE_IMAPS, &prealloc);
405	if (error)	459	if (error)
@@ -407,7 +461,10 @@ xfs_iomap_write_delay(
407		461
408	retry:	462	retry:
409	if (prealloc) {	463	if (prealloc) {
410	xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);	464	xfs_fsblock_t alloc_blocks;
		465
		466	alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap,
		467	XFS_WRITE_IMAPS);
411		468
412	aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));	469	aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
413	ioalign = XFS_B_TO_FSBT(mp, aligned_offset);	470	ioalign = XFS_B_TO_FSBT(mp, aligned_offset);