aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/file.c
diff options
context:
space:
mode:
authorChristoph Hellwig <hch@lst.de>2011-01-14 07:07:43 -0500
committerAl Viro <viro@zeniv.linux.org.uk>2011-01-17 02:25:31 -0500
commit2fe17c1075836b66678ed2a305fd09b6773883aa (patch)
treeeb5287be8138686682eef9622872cfc7657e0664 /fs/btrfs/file.c
parent64c23e86873ee410554d6d1c76b60da47025e96f (diff)
fallocate should be a file operation
Currently all filesystems except XFS implement fallocate asynchronously, while XFS forced a commit. Both of these are suboptimal - in case of O_SYNC I/O we really want our allocation on disk, especially for the !KEEP_SIZE case where we actually grow the file with user-visible zeroes. On the other hand always committing the transaction is a bad idea for fast-path uses of fallocate like for example in recent Samba versions. Given that block allocation is a data plane operation anyway, change it from an inode operation to a file operation so that we have the file structure available that lets us check for O_SYNC. This also includes moving the code around for a few of the filesystems, and removing the already unneeded S_ISDIR checks given that we only wire up fallocate for regular files. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--fs/btrfs/file.c113
1 file changed, 113 insertions, 0 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 66836d85763b..a9e0a4eaf3d9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/backing-dev.h> 25#include <linux/backing-dev.h>
26#include <linux/mpage.h> 26#include <linux/mpage.h>
27#include <linux/falloc.h>
27#include <linux/swap.h> 28#include <linux/swap.h>
28#include <linux/writeback.h> 29#include <linux/writeback.h>
29#include <linux/statfs.h> 30#include <linux/statfs.h>
@@ -1237,6 +1238,117 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1237 return 0; 1238 return 0;
1238} 1239}
1239 1240
1241static long btrfs_fallocate(struct file *file, int mode,
1242 loff_t offset, loff_t len)
1243{
1244 struct inode *inode = file->f_path.dentry->d_inode;
1245 struct extent_state *cached_state = NULL;
1246 u64 cur_offset;
1247 u64 last_byte;
1248 u64 alloc_start;
1249 u64 alloc_end;
1250 u64 alloc_hint = 0;
1251 u64 locked_end;
1252 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1253 struct extent_map *em;
1254 int ret;
1255
1256 alloc_start = offset & ~mask;
1257 alloc_end = (offset + len + mask) & ~mask;
1258
1259 /* We only support the FALLOC_FL_KEEP_SIZE mode */
1260 if (mode & ~FALLOC_FL_KEEP_SIZE)
1261 return -EOPNOTSUPP;
1262
1263 /*
1264 * wait for ordered IO before we have any locks. We'll loop again
1265 * below with the locks held.
1266 */
1267 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
1268
1269 mutex_lock(&inode->i_mutex);
1270 ret = inode_newsize_ok(inode, alloc_end);
1271 if (ret)
1272 goto out;
1273
1274 if (alloc_start > inode->i_size) {
1275 ret = btrfs_cont_expand(inode, alloc_start);
1276 if (ret)
1277 goto out;
1278 }
1279
1280 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1281 if (ret)
1282 goto out;
1283
1284 locked_end = alloc_end - 1;
1285 while (1) {
1286 struct btrfs_ordered_extent *ordered;
1287
1288 /* the extent lock is ordered inside the running
1289 * transaction
1290 */
1291 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
1292 locked_end, 0, &cached_state, GFP_NOFS);
1293 ordered = btrfs_lookup_first_ordered_extent(inode,
1294 alloc_end - 1);
1295 if (ordered &&
1296 ordered->file_offset + ordered->len > alloc_start &&
1297 ordered->file_offset < alloc_end) {
1298 btrfs_put_ordered_extent(ordered);
1299 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1300 alloc_start, locked_end,
1301 &cached_state, GFP_NOFS);
1302 /*
1303 * we can't wait on the range with the transaction
1304 * running or with the extent lock held
1305 */
1306 btrfs_wait_ordered_range(inode, alloc_start,
1307 alloc_end - alloc_start);
1308 } else {
1309 if (ordered)
1310 btrfs_put_ordered_extent(ordered);
1311 break;
1312 }
1313 }
1314
1315 cur_offset = alloc_start;
1316 while (1) {
1317 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
1318 alloc_end - cur_offset, 0);
1319 BUG_ON(IS_ERR(em) || !em);
1320 last_byte = min(extent_map_end(em), alloc_end);
1321 last_byte = (last_byte + mask) & ~mask;
1322 if (em->block_start == EXTENT_MAP_HOLE ||
1323 (cur_offset >= inode->i_size &&
1324 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1325 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1326 last_byte - cur_offset,
1327 1 << inode->i_blkbits,
1328 offset + len,
1329 &alloc_hint);
1330 if (ret < 0) {
1331 free_extent_map(em);
1332 break;
1333 }
1334 }
1335 free_extent_map(em);
1336
1337 cur_offset = last_byte;
1338 if (cur_offset >= alloc_end) {
1339 ret = 0;
1340 break;
1341 }
1342 }
1343 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1344 &cached_state, GFP_NOFS);
1345
1346 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1347out:
1348 mutex_unlock(&inode->i_mutex);
1349 return ret;
1350}
1351
1240const struct file_operations btrfs_file_operations = { 1352const struct file_operations btrfs_file_operations = {
1241 .llseek = generic_file_llseek, 1353 .llseek = generic_file_llseek,
1242 .read = do_sync_read, 1354 .read = do_sync_read,
@@ -1248,6 +1360,7 @@ const struct file_operations btrfs_file_operations = {
1248 .open = generic_file_open, 1360 .open = generic_file_open,
1249 .release = btrfs_release_file, 1361 .release = btrfs_release_file,
1250 .fsync = btrfs_sync_file, 1362 .fsync = btrfs_sync_file,
1363 .fallocate = btrfs_fallocate,
1251 .unlocked_ioctl = btrfs_ioctl, 1364 .unlocked_ioctl = btrfs_ioctl,
1252#ifdef CONFIG_COMPAT 1365#ifdef CONFIG_COMPAT
1253 .compat_ioctl = btrfs_ioctl, 1366 .compat_ioctl = btrfs_ioctl,