xfs: implement pNFS export operations

Add operations to export pNFS block layouts from an XFS filesystem. See the previous commit, which added these export operations, for an explanation of them.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
author: Christoph Hellwig <hch@lst.de> 2015-02-15 19:49:23 -0500
committer: Dave Chinner <david@fromorbit.com> 2015-02-15 19:49:23 -0500
commit: 527851124d10f9c50b1c578e0a56fcd49922422d (patch)
tree: f16dd8e452ae24ce6802cfb8c6baa53588de91ab /fs
parent: bad962662dbc60e76ec1baae34af56b1ba2dfa5f (diff)
8 files changed, 329 insertions, 1 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d61799949580..df6828570e87 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -121,3 +121,4 @@ xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_PROC_FS)           += xfs_stats.o
 xfs-$(CONFIG_SYSCTL)            += xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)            += xfs_ioctl32.o
+xfs-$(CONFIG_NFSD_PNFS)         += xfs_pnfs.o
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 5eb4a14e0a0f..b97359ba2648 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -30,6 +30,7 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_log.h"
+#include "xfs_pnfs.h"
 /*
 * Note that we only accept fileids which are long enough rather than allow
@@ -245,4 +246,9 @@ const struct export_operations xfs_export_operations = {
        .fh_to_parent           = xfs_fs_fh_to_parent,
        .get_parent             = xfs_fs_get_parent,
        .commit_metadata        = xfs_fs_nfs_commit_metadata,
+#ifdef CONFIG_NFSD_PNFS
+        .get_uuid               = xfs_fs_get_uuid,
+        .map_blocks             = xfs_fs_map_blocks,
+        .commit_blocks          = xfs_fs_commit_blocks,
+#endif
 };
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index fba6532efba4..74efe5b760dc 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -602,6 +602,12 @@ xfs_growfs_data(
        if (!mutex_trylock(&mp->m_growlock))
                return -EWOULDBLOCK;
        error = xfs_growfs_data_private(mp, in);
+        /*
+         * Increment the generation unconditionally, the error could be from
+         * updating the secondary superblocks, in which case the new size
+         * is live already.
+         */
+        mp->m_generation++;
        mutex_unlock(&mp->m_growlock);
        return error;
 }
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ce80eeb8faa4..e5e2ea0d0b25 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -505,7 +505,7 @@ xfs_setattr_mode(
        inode->i_mode |= mode & ~S_IFMT;
 }
-static void
+void
 xfs_setattr_time(
        struct xfs_inode        *ip,
        struct iattr            *iattr)
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 1c34e4335920..ea7a98e9cb70 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -32,6 +32,7 @@ extern void xfs_setup_inode(struct xfs_inode *);
 */
 #define XFS_ATTR_NOACL          0x01    /* Don't call posix_acl_chmod */
+extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
 extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
                               int flags);
 extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a5b2ff822653..0d8abd6364d9 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -174,6 +174,17 @@ typedef struct xfs_mount {
        struct workqueue_struct *m_reclaim_workqueue;
        struct workqueue_struct *m_log_workqueue;
        struct workqueue_struct *m_eofblocks_workqueue;
+        /*
+         * Generation of the filesystem layout.  This is incremented by each
+         * growfs, and used by the pNFS server to ensure the client updates
+         * its view of the block device once it gets a layout that might
+         * reference the newly added blocks.  Does not need to be persistent
+         * as long as we only allow file system size increments, but if we
+         * ever support shrinks it would have to be persisted in addition
+         * to various other kinds of pain inflicted on the pNFS server.
+         */
+        __uint32_t              m_generation;
 } xfs_mount_t;
 /*
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
new file mode 100644
index 000000000000..89912b34f184
--- /dev/null
+++ b/fs/xfs/xfs_pnfs.c
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_iomap.h"
+#include "xfs_shared.h"
+#include "xfs_bit.h"
+#include "xfs_pnfs.h"
+/*
+ * Hand the filesystem UUID, and the byte offset of that UUID within the
+ * on-disk superblock structure, to the caller so that a pNFS client can
+ * identify the exported device.
+ *
+ * *len is checked against sizeof(uuid_t) on entry and is overwritten with
+ * sizeof(uuid_t) on success.
+ *
+ * Returns 0 on success, or -EINVAL if *len is smaller than a uuid_t.
+ */
+int
+xfs_fs_get_uuid(
+        struct super_block      *sb,
+        u8                      *buf,
+        u32                     *len,
+        u64                     *offset)
+{
+        struct xfs_mount        *mp = XFS_M(sb);
+        const size_t            uuid_len = sizeof(uuid_t);
+
+        /* Warn a single time that this code path is still experimental. */
+        printk_once(KERN_NOTICE
+"XFS (%s): using experimental pNFS feature, use at your own risk!\n",
+                mp->m_fsname);
+
+        if (*len < uuid_len)
+                return -EINVAL;
+
+        memcpy(buf, &mp->m_sb.sb_uuid, uuid_len);
+        *offset = offsetof(struct xfs_dsb, sb_uuid);
+        *len = uuid_len;
+        return 0;
+}
+/*
+ * Fill in *iomap from the XFS extent record *imap.
+ *
+ * The byte offset and length are always derived from the extent's start
+ * offset and block count.  Holes and delayed allocations get
+ * IOMAP_NULL_BLOCK as their block number; all other extents get a disk
+ * address and are typed as unwritten or mapped according to br_state.
+ */
+static void
+xfs_bmbt_to_iomap(
+        struct xfs_inode        *ip,
+        struct iomap            *iomap,
+        struct xfs_bmbt_irec    *imap)
+{
+        struct xfs_mount        *mp = ip->i_mount;
+
+        /* The byte range does not depend on the kind of extent. */
+        iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+        iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+
+        if (imap->br_startblock == HOLESTARTBLOCK ||
+            imap->br_startblock == DELAYSTARTBLOCK) {
+                /* No disk address exists for a hole or a delalloc extent. */
+                iomap->blkno = IOMAP_NULL_BLOCK;
+                iomap->type = (imap->br_startblock == HOLESTARTBLOCK) ?
+                                IOMAP_HOLE : IOMAP_DELALLOC;
+                return;
+        }
+
+        iomap->blkno = XFS_FSB_TO_DADDR(mp, imap->br_startblock);
+        iomap->type = (imap->br_state == XFS_EXT_UNWRITTEN) ?
+                        IOMAP_UNWRITTEN : IOMAP_MAPPED;
+}
+/*
+ * Get a layout for the pNFS client.
+ *
+ * Looks up the extent backing the byte range starting at @offset for up to
+ * @length bytes and describes it in @iomap.  For a @write request that hits
+ * a hole (or finds no mapping), blocks are allocated through
+ * xfs_iomap_write_direct() and the prealloc flags are then updated with
+ * XFS_PREALLOC_SET | XFS_PREALLOC_SYNC.  On success @device_generation
+ * receives the current mp->m_generation.
+ *
+ * XFS_IOLOCK_EXCL is held from the range check until the mapping has been
+ * obtained, and is dropped again on every exit path.
+ *
+ * Returns 0 on success or a negative errno: -EIO if the filesystem has been
+ * shut down, -ENXIO for inodes on the realtime device, -EINVAL if @offset is
+ * beyond the permitted limit, or the error returned by the pagecache flush,
+ * pagecache invalidation, extent lookup, allocation or prealloc-flag update.
+ */
+int
+xfs_fs_map_blocks(
+        struct inode            *inode,
+        loff_t                  offset,
+        u64                     length,
+        struct iomap            *iomap,
+        bool                    write,
+        u32                     *device_generation)
+{
+        struct xfs_inode        *ip = XFS_I(inode);
+        struct xfs_mount        *mp = ip->i_mount;
+        struct xfs_bmbt_irec    imap;
+        xfs_fileoff_t           offset_fsb, end_fsb;
+        loff_t                  limit;
+        int                     bmapi_flags = XFS_BMAPI_ENTIRE;
+        int                     nimaps = 1;
+        uint                    lock_flags;
+        int                     error = 0;
+
+        if (XFS_FORCED_SHUTDOWN(mp))
+                return -EIO;
+
+        /*
+         * We can't export inodes residing on the realtime device.  The realtime
+         * device doesn't have a UUID to identify it, so the client has no way
+         * to find it.
+         */
+        if (XFS_IS_REALTIME_INODE(ip))
+                return -ENXIO;
+
+        /*
+         * Lock out any other I/O before we flush and invalidate the pagecache,
+         * and then hand out a layout to the remote system.  This is very
+         * similar to direct I/O, except that the synchronization is much more
+         * complicated.  See the comment near xfs_break_layouts for a detailed
+         * explanation.
+         */
+        xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+        /* Reject offsets past the limit and trim the length to fit it. */
+        error = -EINVAL;
+        limit = mp->m_super->s_maxbytes;
+        if (!write)
+                limit = max(limit, round_up(i_size_read(inode),
+                                     inode->i_sb->s_blocksize));
+        if (offset > limit)
+                goto out_unlock;
+        if (offset > limit - length)
+                length = limit - offset;
+
+        error = filemap_write_and_wait(inode->i_mapping);
+        if (error)
+                goto out_unlock;
+        error = invalidate_inode_pages2(inode->i_mapping);
+        /*
+         * Leave through out_unlock so that XFS_IOLOCK_EXCL is released; a
+         * bare return here would exit with the lock still held.
+         */
+        if (WARN_ON_ONCE(error))
+                goto out_unlock;
+
+        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
+        offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+        lock_flags = xfs_ilock_data_map_shared(ip);
+        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+                                &imap, &nimaps, bmapi_flags);
+        xfs_iunlock(ip, lock_flags);
+
+        if (error)
+                goto out_unlock;
+
+        if (write) {
+                enum xfs_prealloc_flags flags = 0;
+
+                ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+
+                if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
+                        error = xfs_iomap_write_direct(ip, offset, length,
+                                                       &imap, nimaps);
+                        if (error)
+                                goto out_unlock;
+
+                        /*
+                         * Ensure the next transaction is committed
+                         * synchronously so that the blocks allocated and
+                         * handed out to the client are guaranteed to be
+                         * present even after a server crash.
+                         */
+                        flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC;
+                }
+
+                error = xfs_update_prealloc_flags(ip, flags);
+                if (error)
+                        goto out_unlock;
+        }
+        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+
+        xfs_bmbt_to_iomap(ip, iomap, &imap);
+        *device_generation = mp->m_generation;
+        return error;
+out_unlock:
+        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+        return error;
+}
+/*
+ * Ensure the size update falls into a valid allocated block.
+ *
+ * Looks up the block containing byte (isize - 1) under XFS_ILOCK_SHARED.
+ * Returns the lookup error if there is one, -EIO if that block is a hole,
+ * a delayed allocation or an unwritten extent, and 0 otherwise.
+ */
+static int
+xfs_pnfs_validate_isize(
+        struct xfs_inode        *ip,
+        xfs_off_t               isize)
+{
+        struct xfs_bmbt_irec    last;
+        int                     nmaps = 1;
+        int                     error;
+
+        /* Map exactly one block: the one holding the last byte of the file. */
+        xfs_ilock(ip, XFS_ILOCK_SHARED);
+        error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1,
+                                &last, &nmaps, 0);
+        xfs_iunlock(ip, XFS_ILOCK_SHARED);
+        if (error)
+                return error;
+
+        /* A hole or a delayed allocation has no real block behind it. */
+        if (last.br_startblock == HOLESTARTBLOCK)
+                return -EIO;
+        if (last.br_startblock == DELAYSTARTBLOCK)
+                return -EIO;
+
+        /* An unwritten extent is rejected as well. */
+        if (last.br_state == XFS_EXT_UNWRITTEN)
+                return -EIO;
+
+        return 0;
+}
+/*
+ * Make sure the blocks described by maps are stable on disk.  This includes
+ * converting any unwritten extents, flushing the disk cache and updating the
+ * time stamps.
+ *
+ * Each of the @nr_maps entries in @maps is clamped to the effective file
+ * size, which is the larger of the current i_size and @iattr->ia_size when
+ * ATTR_SIZE is set.  The matching pagecache range is invalidated and the
+ * range is passed to xfs_iomap_write_unwritten().  The timestamps from
+ * @iattr, and the new size if the file grew, are then logged in a
+ * synchronous transaction.
+ *
+ * Note that we rely on the caller to always send us a timestamp update so that
+ * we always commit a transaction here.  If that stops being true we will have
+ * to manually flush the cache here similar to what the fsync code path does
+ * for datasyncs on files that have no dirty metadata.
+ *
+ * Returns 0 on success or a negative errno from extent conversion, size
+ * validation, transaction reservation or transaction commit.
+ */
+int
+xfs_fs_commit_blocks(
+        struct inode            *inode,
+        struct iomap            *maps,
+        int                     nr_maps,
+        struct iattr            *iattr)
+{
+        struct xfs_inode        *ip = XFS_I(inode);
+        struct xfs_mount        *mp = ip->i_mount;
+        struct xfs_trans        *tp;
+        bool                    update_isize = false;
+        int                     error, i;
+        loff_t                  size;
+
+        ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME));
+
+        xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+        /* Only ever grow the file here; a smaller ia_size is ignored. */
+        size = i_size_read(inode);
+        if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) {
+                update_isize = true;
+                size = iattr->ia_size;
+        }
+
+        for (i = 0; i < nr_maps; i++) {
+                u64 start, length, end;
+
+                /* Skip maps entirely beyond the size, trim those crossing it. */
+                start = maps[i].offset;
+                if (start > size)
+                        continue;
+
+                end = start + maps[i].length;
+                if (end > size)
+                        end = size;
+
+                length = end - start;
+                if (!length)
+                        continue;
+
+                /*
+                 * Make sure reads through the pagecache see the new data.
+                 */
+                error = invalidate_inode_pages2_range(inode->i_mapping,
+                                        start >> PAGE_CACHE_SHIFT,
+                                        (end - 1) >> PAGE_CACHE_SHIFT);
+                WARN_ON_ONCE(error);
+
+                error = xfs_iomap_write_unwritten(ip, start, length);
+                if (error)
+                        goto out_drop_iolock;
+        }
+
+        if (update_isize) {
+                error = xfs_pnfs_validate_isize(ip, size);
+                if (error)
+                        goto out_drop_iolock;
+        }
+
+        tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+        if (error) {
+                /*
+                 * The transaction was allocated above; cancel it so that it
+                 * is not leaked when the reservation fails.
+                 */
+                xfs_trans_cancel(tp, 0);
+                goto out_drop_iolock;
+        }
+
+        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+        xfs_setattr_time(ip, iattr);
+        if (update_isize) {
+                i_size_write(inode, iattr->ia_size);
+                ip->i_d.di_size = iattr->ia_size;
+        }
+
+        xfs_trans_set_sync(tp);
+        error = xfs_trans_commit(tp, 0);
+
+out_drop_iolock:
+        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+        return error;
+}
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
new file mode 100644
index 000000000000..0d91255a89ae
--- /dev/null
+++ b/fs/xfs/xfs_pnfs.h
@@ -0,0 +1,11 @@
+#ifndef _XFS_PNFS_H
+#define _XFS_PNFS_H 1
+#ifdef CONFIG_NFSD_PNFS
+int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
+int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
+                struct iomap *iomap, bool write, u32 *device_generation);
+int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
+                struct iattr *iattr);
+#endif /* CONFIG_NFSD_PNFS */
+#endif /* _XFS_PNFS_H */
author	Christoph Hellwig <hch@lst.de>	2015-02-15 19:49:23 -0500
committer	Dave Chinner <david@fromorbit.com>	2015-02-15 19:49:23 -0500
commit	527851124d10f9c50b1c578e0a56fcd49922422d (patch)
tree	f16dd8e452ae24ce6802cfb8c6baa53588de91ab /fs
parent	bad962662dbc60e76ec1baae34af56b1ba2dfa5f (diff)

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index d61799949580..df6828570e87 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile
@@ -121,3 +121,4 @@ xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
121	xfs-$(CONFIG_PROC_FS) += xfs_stats.o	121	xfs-$(CONFIG_PROC_FS) += xfs_stats.o
122	xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o	122	xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
123	xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o	123	xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
		124	xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o


diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 5eb4a14e0a0f..b97359ba2648 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c
@@ -30,6 +30,7 @@
30	#include "xfs_trace.h"	30	#include "xfs_trace.h"
31	#include "xfs_icache.h"	31	#include "xfs_icache.h"
32	#include "xfs_log.h"	32	#include "xfs_log.h"
		33	#include "xfs_pnfs.h"
33		34
34	/*	35	/*
35	* Note that we only accept fileids which are long enough rather than allow	36	* Note that we only accept fileids which are long enough rather than allow
@@ -245,4 +246,9 @@ const struct export_operations xfs_export_operations = {
245	.fh_to_parent = xfs_fs_fh_to_parent,	246	.fh_to_parent = xfs_fs_fh_to_parent,
246	.get_parent = xfs_fs_get_parent,	247	.get_parent = xfs_fs_get_parent,
247	.commit_metadata = xfs_fs_nfs_commit_metadata,	248	.commit_metadata = xfs_fs_nfs_commit_metadata,
		249	#ifdef CONFIG_NFSD_PNFS
		250	.get_uuid = xfs_fs_get_uuid,
		251	.map_blocks = xfs_fs_map_blocks,
		252	.commit_blocks = xfs_fs_commit_blocks,
		253	#endif
248	};	254	};


diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index fba6532efba4..74efe5b760dc 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c
@@ -602,6 +602,12 @@ xfs_growfs_data(
602	if (!mutex_trylock(&mp->m_growlock))	602	if (!mutex_trylock(&mp->m_growlock))
603	return -EWOULDBLOCK;	603	return -EWOULDBLOCK;
604	error = xfs_growfs_data_private(mp, in);	604	error = xfs_growfs_data_private(mp, in);
		605	/*
		606	* Increment the generation unconditionally, the error could be from
		607	* updating the secondary superblocks, in which case the new size
		608	* is live already.
		609	*/
		610	mp->m_generation++;
605	mutex_unlock(&mp->m_growlock);	611	mutex_unlock(&mp->m_growlock);
606	return error;	612	return error;
607	}	613	}


diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ce80eeb8faa4..e5e2ea0d0b25 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c
@@ -505,7 +505,7 @@ xfs_setattr_mode(
505	inode->i_mode |= mode & ~S_IFMT;	505	inode->i_mode |= mode & ~S_IFMT;
506	}	506	}
507		507
508	static void	508	void
509	xfs_setattr_time(	509	xfs_setattr_time(
510	struct xfs_inode *ip,	510	struct xfs_inode *ip,
511	struct iattr *iattr)	511	struct iattr *iattr)


diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 1c34e4335920..ea7a98e9cb70 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h
@@ -32,6 +32,7 @@ extern void xfs_setup_inode(struct xfs_inode *);
32	*/	32	*/
33	#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */	33	#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */
34		34
		35	extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
35	extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,	36	extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
36	int flags);	37	int flags);
37	extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);	38	extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);


diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a5b2ff822653..0d8abd6364d9 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h
@@ -174,6 +174,17 @@ typedef struct xfs_mount {
174	struct workqueue_struct *m_reclaim_workqueue;	174	struct workqueue_struct *m_reclaim_workqueue;
175	struct workqueue_struct *m_log_workqueue;	175	struct workqueue_struct *m_log_workqueue;
176	struct workqueue_struct *m_eofblocks_workqueue;	176	struct workqueue_struct *m_eofblocks_workqueue;
		177
		178	/*
		179	* Generation of the filesystem layout. This is incremented by each
		180	* growfs, and used by the pNFS server to ensure the client updates
		181	* its view of the block device once it gets a layout that might
		182	* reference the newly added blocks. Does not need to be persistent
		183	* as long as we only allow file system size increments, but if we
		184	* ever support shrinks it would have to be persisted in addition
		185	* to various other kinds of pain inflicted on the pNFS server.
		186	*/
		187	__uint32_t m_generation;
177	} xfs_mount_t;	188	} xfs_mount_t;
178		189
179	/*	190	/*


diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c new file mode 100644 index 000000000000..89912b34f184 --- /dev/null +++ b/fs/xfs/xfs_pnfs.c
@@ -0,0 +1,292 @@
		1	/*
		2	* Copyright (c) 2014 Christoph Hellwig.
		3	*/
		4	#include "xfs.h"
		5	#include "xfs_format.h"
		6	#include "xfs_log_format.h"
		7	#include "xfs_trans_resv.h"
		8	#include "xfs_sb.h"
		9	#include "xfs_mount.h"
		10	#include "xfs_inode.h"
		11	#include "xfs_trans.h"
		12	#include "xfs_log.h"
		13	#include "xfs_bmap.h"
		14	#include "xfs_bmap_util.h"
		15	#include "xfs_error.h"
		16	#include "xfs_iomap.h"
		17	#include "xfs_shared.h"
		18	#include "xfs_bit.h"
		19	#include "xfs_pnfs.h"
		20
		21	/*
		22	* Get a unique ID including its location so that the client can identify
		23	* the exported device.
		24	*/
		25	int
		26	xfs_fs_get_uuid(
		27	struct super_block *sb,
		28	u8 *buf,
		29	u32 *len,
		30	u64 *offset)
		31	{
		32	struct xfs_mount *mp = XFS_M(sb);
		33
		34	printk_once(KERN_NOTICE
		35	"XFS (%s): using experimental pNFS feature, use at your own risk!\n",
		36	mp->m_fsname);
		37
		38	if (*len < sizeof(uuid_t))
		39	return -EINVAL;
		40
		41	memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t));
		42	*len = sizeof(uuid_t);
		43	*offset = offsetof(struct xfs_dsb, sb_uuid);
		44	return 0;
		45	}
		46
		47	static void
		48	xfs_bmbt_to_iomap(
		49	struct xfs_inode *ip,
		50	struct iomap *iomap,
		51	struct xfs_bmbt_irec *imap)
		52	{
		53	struct xfs_mount *mp = ip->i_mount;
		54
		55	if (imap->br_startblock == HOLESTARTBLOCK) {
		56	iomap->blkno = IOMAP_NULL_BLOCK;
		57	iomap->type = IOMAP_HOLE;
		58	} else if (imap->br_startblock == DELAYSTARTBLOCK) {
		59	iomap->blkno = IOMAP_NULL_BLOCK;
		60	iomap->type = IOMAP_DELALLOC;
		61	} else {
		62	iomap->blkno =
		63	XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
		64	if (imap->br_state == XFS_EXT_UNWRITTEN)
		65	iomap->type = IOMAP_UNWRITTEN;
		66	else
		67	iomap->type = IOMAP_MAPPED;
		68	}
		69	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
		70	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
		71	}
		72
		73	/*
		74	* Get a layout for the pNFS client.
		75	*/
		76	int
		77	xfs_fs_map_blocks(
		78	struct inode *inode,
		79	loff_t offset,
		80	u64 length,
		81	struct iomap *iomap,
		82	bool write,
		83	u32 *device_generation)
		84	{
		85	struct xfs_inode *ip = XFS_I(inode);
		86	struct xfs_mount *mp = ip->i_mount;
		87	struct xfs_bmbt_irec imap;
		88	xfs_fileoff_t offset_fsb, end_fsb;
		89	loff_t limit;
		90	int bmapi_flags = XFS_BMAPI_ENTIRE;
		91	int nimaps = 1;
		92	uint lock_flags;
		93	int error = 0;
		94
		95	if (XFS_FORCED_SHUTDOWN(mp))
		96	return -EIO;
		97
		98	/*
		99	* We can't export inodes residing on the realtime device. The realtime
		100	* device doesn't have a UUID to identify it, so the client has no way
		101	* to find it.
		102	*/
		103	if (XFS_IS_REALTIME_INODE(ip))
		104	return -ENXIO;
		105
		106	/*
		107	* Lock out any other I/O before we flush and invalidate the pagecache,
		108	* and then hand out a layout to the remote system. This is very
		109	* similar to direct I/O, except that the synchronization is much more
		110	* complicated. See the comment near xfs_break_layouts for a detailed
		111	* explanation.
		112	*/
		113	xfs_ilock(ip, XFS_IOLOCK_EXCL);
		114
		115	error = -EINVAL;
		116	limit = mp->m_super->s_maxbytes;
		117	if (!write)
		118	limit = max(limit, round_up(i_size_read(inode),
		119	inode->i_sb->s_blocksize));
		120	if (offset > limit)
		121	goto out_unlock;
		122	if (offset > limit - length)
		123	length = limit - offset;
		124
		125	error = filemap_write_and_wait(inode->i_mapping);
		126	if (error)
		127	goto out_unlock;
		128	error = invalidate_inode_pages2(inode->i_mapping);
		129	if (WARN_ON_ONCE(error))
		130	return error;
		131
		132	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
		133	offset_fsb = XFS_B_TO_FSBT(mp, offset);
		134
		135	lock_flags = xfs_ilock_data_map_shared(ip);
		136	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
		137	&imap, &nimaps, bmapi_flags);
		138	xfs_iunlock(ip, lock_flags);
		139
		140	if (error)
		141	goto out_unlock;
		142
		143	if (write) {
		144	enum xfs_prealloc_flags flags = 0;
		145
		146	ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
		147
		148	if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
		149	error = xfs_iomap_write_direct(ip, offset, length,
		150	&imap, nimaps);
		151	if (error)
		152	goto out_unlock;
		153
		154	/*
		155	* Ensure the next transaction is committed
		156	* synchronously so that the blocks allocated and
		157	* handed out to the client are guaranteed to be
		158	* present even after a server crash.
		159	*/
		160	flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC;
		161	}
		162
		163	error = xfs_update_prealloc_flags(ip, flags);
		164	if (error)
		165	goto out_unlock;
		166	}
		167	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
		168
		169	xfs_bmbt_to_iomap(ip, iomap, &imap);
		170	*device_generation = mp->m_generation;
		171	return error;
		172	out_unlock:
		173	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
		174	return error;
		175	}
		176
		177	/*
		178	* Ensure the size update falls into a valid allocated block.
		179	*/
		180	static int
		181	xfs_pnfs_validate_isize(
		182	struct xfs_inode *ip,
		183	xfs_off_t isize)
		184	{
		185	struct xfs_bmbt_irec imap;
		186	int nimaps = 1;
		187	int error = 0;
		188
		189	xfs_ilock(ip, XFS_ILOCK_SHARED);
		190	error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1,
		191	&imap, &nimaps, 0);
		192	xfs_iunlock(ip, XFS_ILOCK_SHARED);
		193	if (error)
		194	return error;
		195
		196	if (imap.br_startblock == HOLESTARTBLOCK ||
		197	imap.br_startblock == DELAYSTARTBLOCK ||
		198	imap.br_state == XFS_EXT_UNWRITTEN)
		199	return -EIO;
		200	return 0;
		201	}
		202
		203	/*
		204	* Make sure the blocks described by maps are stable on disk. This includes
		205	* converting any unwritten extents, flushing the disk cache and updating the
		206	* time stamps.
		207	*
		208	* Note that we rely on the caller to always send us a timestamp update so that
		209	* we always commit a transaction here. If that stops being true we will have
		210	* to manually flush the cache here similar to what the fsync code path does
		211	* for datasyncs on files that have no dirty metadata.
		212	*/
		213	int
		214	xfs_fs_commit_blocks(
		215	struct inode *inode,
		216	struct iomap *maps,
		217	int nr_maps,
		218	struct iattr *iattr)
		219	{
		220	struct xfs_inode *ip = XFS_I(inode);
		221	struct xfs_mount *mp = ip->i_mount;
		222	struct xfs_trans *tp;
		223	bool update_isize = false;
		224	int error, i;
		225	loff_t size;
		226
		227	ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME));
		228
		229	xfs_ilock(ip, XFS_IOLOCK_EXCL);
		230
		231	size = i_size_read(inode);
		232	if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) {
		233	update_isize = true;
		234	size = iattr->ia_size;
		235	}
		236
		237	for (i = 0; i < nr_maps; i++) {
		238	u64 start, length, end;
		239
		240	start = maps[i].offset;
		241	if (start > size)
		242	continue;
		243
		244	end = start + maps[i].length;
		245	if (end > size)
		246	end = size;
		247
		248	length = end - start;
		249	if (!length)
		250	continue;
		251
		252	/*
		253	* Make sure reads through the pagecache see the new data.
		254	*/
		255	error = invalidate_inode_pages2_range(inode->i_mapping,
		256	start >> PAGE_CACHE_SHIFT,
		257	(end - 1) >> PAGE_CACHE_SHIFT);
		258	WARN_ON_ONCE(error);
		259
		260	error = xfs_iomap_write_unwritten(ip, start, length);
		261	if (error)
		262	goto out_drop_iolock;
		263	}
		264
		265	if (update_isize) {
		266	error = xfs_pnfs_validate_isize(ip, size);
		267	if (error)
		268	goto out_drop_iolock;
		269	}
		270
		271	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
		272	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
		273	if (error)
		274	goto out_drop_iolock;
		275
		276	xfs_ilock(ip, XFS_ILOCK_EXCL);
		277	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
		278	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		279
		280	xfs_setattr_time(ip, iattr);
		281	if (update_isize) {
		282	i_size_write(inode, iattr->ia_size);
		283	ip->i_d.di_size = iattr->ia_size;
		284	}
		285
		286	xfs_trans_set_sync(tp);
		287	error = xfs_trans_commit(tp, 0);
		288
		289	out_drop_iolock:
		290	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
		291	return error;
		292	}


diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h new file mode 100644 index 000000000000..0d91255a89ae --- /dev/null +++ b/fs/xfs/xfs_pnfs.h
@@ -0,0 +1,11 @@
		1	#ifndef _XFS_PNFS_H
		2	#define _XFS_PNFS_H 1
		3
		4	#ifdef CONFIG_NFSD_PNFS
		5	int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
		6	int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
		7	struct iomap *iomap, bool write, u32 *device_generation);
		8	int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
		9	struct iattr *iattr);
		10	#endif /* CONFIG_NFSD_PNFS */
		11	#endif /* _XFS_PNFS_H */