aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2013-08-12 06:49:48 -0400
committerBen Myers <bpm@sgi.com>2013-08-12 17:56:06 -0400
commita133d952b44cef278d2da664d742d51ef95f4dd3 (patch)
tree7a6f2d8bfc9be9d29246f2ed94582186331cbd45
parente546cb79ef7ebe53060369dae665fa449a544353 (diff)
xfs: consolidate extent swap code
So we don't need xfs_dfrag.h in userspace anymore, move the extent swap ioctl structure definition to xfs_fs.h where most of the other ioctl structure definitions are. Now that we don't need separate files for extent swapping, separate the basic file descriptor checking code to xfs_ioctl.c, and the code that does the extent swap operation to xfs_bmap_util.c. This cleanly separates the user interface code from the physical mechanism used to do the extent swap. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/xfs_bmap_util.c343
-rw-r--r--fs/xfs/xfs_bmap_util.h3
-rw-r--r--fs/xfs/xfs_dfrag.c459
-rw-r--r--fs/xfs/xfs_dfrag.h53
-rw-r--r--fs/xfs/xfs_fs.h15
-rw-r--r--fs/xfs/xfs_ioctl.c72
-rw-r--r--fs/xfs/xfs_ioctl.h4
-rw-r--r--fs/xfs/xfs_ioctl32.c3
9 files changed, 436 insertions, 517 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 201c61df3c45..d6ccf5742d18 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -32,7 +32,6 @@ xfs-y += xfs_aops.o \
32 xfs_bit.o \ 32 xfs_bit.o \
33 xfs_bmap_util.o \ 33 xfs_bmap_util.o \
34 xfs_buf.o \ 34 xfs_buf.o \
35 xfs_dfrag.o \
36 xfs_dir2_readdir.o \ 35 xfs_dir2_readdir.o \
37 xfs_discard.o \ 36 xfs_discard.o \
38 xfs_error.o \ 37 xfs_error.o \
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index f557022bd0e7..b5232d094418 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1693,3 +1693,346 @@ xfs_change_file_space(
1693 xfs_trans_set_sync(tp); 1693 xfs_trans_set_sync(tp);
1694 return xfs_trans_commit(tp, 0); 1694 return xfs_trans_commit(tp, 0);
1695} 1695}
1696
1697/*
1698 * We need to check that the format of the data fork in the temporary inode is
1699 * valid for the target inode before doing the swap. This is not a problem with
1700 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
1701 * data fork depending on the space the attribute fork is taking so we can get
1702 * invalid formats on the target inode.
1703 *
1704 * E.g. target has space for 7 extents in extent format, temp inode only has
1705 * space for 6. If we defragment down to 7 extents, then the tmp format is a
1706 * btree, but when swapped it needs to be in extent format. Hence we can't just
1707 * blindly swap data forks on attr2 filesystems.
1708 *
1709 * Note that we check the swap in both directions so that we don't end up with
1710 * a corrupt temporary inode, either.
1711 *
1712 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
1713 * inode will prevent this situation from occurring, so all we do here is
1714 * reject and log the attempt. basically we are putting the responsibility on
1715 * userspace to get this right.
1716 */
1717static int
1718xfs_swap_extents_check_format(
1719 xfs_inode_t *ip, /* target inode */
1720 xfs_inode_t *tip) /* tmp inode */
1721{
1722
1723 /* Should never get a local format */
1724 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
1725 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
1726 return EINVAL;
1727
1728 /*
1729 * if the target inode has less extents that then temporary inode then
1730 * why did userspace call us?
1731 */
1732 if (ip->i_d.di_nextents < tip->i_d.di_nextents)
1733 return EINVAL;
1734
1735 /*
1736 * if the target inode is in extent form and the temp inode is in btree
1737 * form then we will end up with the target inode in the wrong format
1738 * as we already know there are less extents in the temp inode.
1739 */
1740 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1741 tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
1742 return EINVAL;
1743
1744 /* Check temp in extent form to max in target */
1745 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1746 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
1747 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1748 return EINVAL;
1749
1750 /* Check target in extent form to max in temp */
1751 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1752 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
1753 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1754 return EINVAL;
1755
1756 /*
1757 * If we are in a btree format, check that the temp root block will fit
1758 * in the target and that it has enough extents to be in btree format
1759 * in the target.
1760 *
1761 * Note that we have to be careful to allow btree->extent conversions
1762 * (a common defrag case) which will occur when the temp inode is in
1763 * extent format...
1764 */
1765 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1766 if (XFS_IFORK_BOFF(ip) &&
1767 XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
1768 return EINVAL;
1769 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
1770 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1771 return EINVAL;
1772 }
1773
1774 /* Reciprocal target->temp btree format checks */
1775 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1776 if (XFS_IFORK_BOFF(tip) &&
1777 XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
1778 return EINVAL;
1779 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
1780 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1781 return EINVAL;
1782 }
1783
1784 return 0;
1785}
1786
/*
 * Exchange the data forks of the target inode @ip and the temporary inode
 * @tip inside a single XFS_TRANS_SWAPEXT transaction.
 *
 * @sxp carries the userspace request: the range to swap (which must cover
 * the whole of both files) and the stat of the target taken before the copy,
 * whose ctime/mtime are compared with the inode to detect modification.
 *
 * Returns 0 on success or a positive errno value. CRC enabled filesystems
 * are rejected with EINVAL. On the error paths that run before the inodes
 * are joined to the transaction, the locks taken here are dropped explicitly.
 */
1787int
1788xfs_swap_extents(
1789 xfs_inode_t *ip, /* target inode */
1790 xfs_inode_t *tip, /* tmp inode */
1791 xfs_swapext_t *sxp)
1792{
1793 xfs_mount_t *mp = ip->i_mount;
1794 xfs_trans_t *tp;
1795 xfs_bstat_t *sbp = &sxp->sx_stat;
1796 xfs_ifork_t *tempifp, *ifp, *tifp;
1797 int src_log_flags, target_log_flags;
1798 int error = 0;
1799 int aforkblks = 0;
1800 int taforkblks = 0;
1801 __uint64_t tmp;
1802
1803 /*
1804 * We have no way of updating owner information in the BMBT blocks for
1805 * each inode on CRC enabled filesystems, so to avoid corrupting this
1806 * metadata we simply don't allow extent swaps to occur.
1807 */
1808 if (xfs_sb_version_hascrc(&mp->m_sb))
1809 return XFS_ERROR(EINVAL);
1810
/* scratch fork used as the temporary for the three-way struct copy below */
1811 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
1812 if (!tempifp) {
1813 error = XFS_ERROR(ENOMEM);
1814 goto out;
1815 }
1816
1817 /*
1818 * we have to do two separate lock calls here to keep lockdep
1819 * happy. If we try to get all the locks in one call, lockdep will
1820 * report false positives when we drop the ILOCK and regain them
1821 * below.
1822 */
1823 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
1824 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1825
1826 /* Verify that both files are of the same type (S_IFMT bits) */
1827 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
1828 error = XFS_ERROR(EINVAL);
1829 goto out_unlock;
1830 }
1831
1832 /* Verify both files are either real-time or non-realtime */
1833 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
1834 error = XFS_ERROR(EINVAL);
1835 goto out_unlock;
1836 }
1837
/*
 * NOTE(review): this flush runs with XFS_ILOCK_EXCL held on both inodes.
 * If writeback of the temp inode ever needs to take its ILOCK this could
 * deadlock - confirm against the writeback path.
 */
1838 error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
1839 if (error)
1840 goto out_unlock;
1841 truncate_pagecache_range(VFS_I(tip), 0, -1);
1842
1843 /* Verify O_DIRECT for ftmp */
1844 if (VN_CACHED(VFS_I(tip)) != 0) {
1845 error = XFS_ERROR(EINVAL);
1846 goto out_unlock;
1847 }
1848
1849 /* Verify all data are being swapped */
1850 if (sxp->sx_offset != 0 ||
1851 sxp->sx_length != ip->i_d.di_size ||
1852 sxp->sx_length != tip->i_d.di_size) {
1853 error = XFS_ERROR(EFAULT);
1854 goto out_unlock;
1855 }
1856
1857 trace_xfs_swap_extent_before(ip, 0);
1858 trace_xfs_swap_extent_before(tip, 1);
1859
1860 /* check inode formats now that data is flushed */
1861 error = xfs_swap_extents_check_format(ip, tip);
1862 if (error) {
1863 xfs_notice(mp,
1864 "%s: inode 0x%llx format is incompatible for exchanging.",
1865 __func__, ip->i_ino);
1866 goto out_unlock;
1867 }
1868
1869 /*
1870 * Compare the current change & modify times with that
1871 * passed in. If they differ, we abort this swap.
1872 * This is the mechanism used to ensure the calling
1873 * process that the file was not changed out from
1874 * under it.
1875 */
1876 if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
1877 (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
1878 (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
1879 (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
1880 error = XFS_ERROR(EBUSY);
1881 goto out_unlock;
1882 }
1883
1884 /* We need to fail if the file is memory mapped. Once we have tossed
1885 * all existing pages, the page fault will have no option
1886 * but to go to the filesystem for pages. By making the page fault call
1887 * vop_read (or write in the case of autogrow) they block on the iolock
1888 * until we have switched the extents.
1889 */
1890 if (VN_MAPPED(VFS_I(ip))) {
1891 error = XFS_ERROR(EBUSY);
1892 goto out_unlock;
1893 }
1894
/* drop only the ILOCKs; both IOLOCKs stay held from here on */
1895 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1896 xfs_iunlock(tip, XFS_ILOCK_EXCL);
1897
1898 /*
1899 * There is a race condition here since we gave up the
1900 * ilock. However, the data fork will not change since
1901 * we have the iolock (locked for truncation too) so we
1902 * are safe. We don't really care if non-io related
1903 * fields change.
1904 */
1905 truncate_pagecache_range(VFS_I(ip), 0, -1);
1906
1907 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
1908 if ((error = xfs_trans_reserve(tp, 0,
1909 XFS_ICHANGE_LOG_RES(mp), 0,
1910 0, 0))) {
/* only the IOLOCKs are held at this point, so out_unlock cannot be used */
1911 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1912 xfs_iunlock(tip, XFS_IOLOCK_EXCL);
1913 xfs_trans_cancel(tp, 0);
1914 goto out;
1915 }
1916 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1917
1918 /*
1919 * Count the number of extended attribute blocks
1920 */
1921 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
1922 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
1923 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
1924 if (error)
1925 goto out_trans_cancel;
1926 }
1927 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
1928 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
1929 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
1930 &taforkblks);
1931 if (error)
1932 goto out_trans_cancel;
1933 }
1934
1935 /*
1936 * Swap the data forks of the inodes
1937 */
1938 ifp = &ip->i_df;
1939 tifp = &tip->i_df;
1940 *tempifp = *ifp; /* struct copy */
1941 *ifp = *tifp; /* struct copy */
1942 *tifp = *tempifp; /* struct copy */
1943
1944 /*
1945 * Fix the on-disk inode values
1946 */
/*
 * di_nblocks is exchanged, then adjusted by the attr fork block counts
 * gathered above because only the data forks change hands.
 */
1947 tmp = (__uint64_t)ip->i_d.di_nblocks;
1948 ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
1949 tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
1950
1951 tmp = (__uint64_t) ip->i_d.di_nextents;
1952 ip->i_d.di_nextents = tip->i_d.di_nextents;
1953 tip->i_d.di_nextents = tmp;
1954
1955 tmp = (__uint64_t) ip->i_d.di_format;
1956 ip->i_d.di_format = tip->i_d.di_format;
1957 tip->i_d.di_format = tmp;
1958
1959 /*
1960 * The extents in the source inode could still contain speculative
1961 * preallocation beyond EOF (e.g. the file is open but not modified
1962 * while defrag is in progress). In that case, we need to copy over the
1963 * number of delalloc blocks the data fork in the source inode is
1964 * tracking beyond EOF so that when the fork is truncated away when the
1965 * temporary inode is unlinked we don't underrun the i_delayed_blks
1966 * counter on that inode.
1967 */
1968 ASSERT(tip->i_delayed_blks == 0);
1969 tip->i_delayed_blks = ip->i_delayed_blks;
1970 ip->i_delayed_blks = 0;
1971
/* pick the log flags for each inode from the format it holds after the swap */
1972 src_log_flags = XFS_ILOG_CORE;
1973 switch (ip->i_d.di_format) {
1974 case XFS_DINODE_FMT_EXTENTS:
1975 /* If the extents fit in the inode, fix the
1976 * pointer. Otherwise it's already NULL or
1977 * pointing to the extent.
1978 */
1979 if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
1980 ifp->if_u1.if_extents =
1981 ifp->if_u2.if_inline_ext;
1982 }
1983 src_log_flags |= XFS_ILOG_DEXT;
1984 break;
1985 case XFS_DINODE_FMT_BTREE:
1986 src_log_flags |= XFS_ILOG_DBROOT;
1987 break;
1988 }
1989
1990 target_log_flags = XFS_ILOG_CORE;
1991 switch (tip->i_d.di_format) {
1992 case XFS_DINODE_FMT_EXTENTS:
1993 /* If the extents fit in the inode, fix the
1994 * pointer. Otherwise it's already NULL or
1995 * pointing to the extent.
1996 */
1997 if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
1998 tifp->if_u1.if_extents =
1999 tifp->if_u2.if_inline_ext;
2000 }
2001 target_log_flags |= XFS_ILOG_DEXT;
2002 break;
2003 case XFS_DINODE_FMT_BTREE:
2004 target_log_flags |= XFS_ILOG_DBROOT;
2005 break;
2006 }
2007
2008
/*
 * Both inodes are joined with their lock flags and no explicit unlock
 * follows - presumably the transaction drops the locks at commit; confirm
 * against xfs_trans_ijoin().
 */
2009 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
2010 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
2011
2012 xfs_trans_log_inode(tp, ip, src_log_flags);
2013 xfs_trans_log_inode(tp, tip, target_log_flags);
2014
2015 /*
2016 * If this is a synchronous mount, make sure that the
2017 * transaction goes to disk before returning to the user.
2018 */
2019 if (mp->m_flags & XFS_MOUNT_WSYNC)
2020 xfs_trans_set_sync(tp);
2021
2022 error = xfs_trans_commit(tp, 0);
2023
2024 trace_xfs_swap_extent_after(ip, 0);
2025 trace_xfs_swap_extent_after(tip, 1);
2026out:
2027 kmem_free(tempifp);
2028 return error;
2029
/* error exit while both the ILOCK and the IOLOCK are held on both inodes */
2030out_unlock:
2031 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
2032 xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
2033 goto out;
2034
/* error exit once the transaction is reserved and the ILOCKs are retaken */
2035out_trans_cancel:
2036 xfs_trans_cancel(tp, 0);
2037 goto out_unlock;
2038}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index a6d207769dee..061260946f7a 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -102,6 +102,9 @@ bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
102int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, 102int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
103 bool need_iolock); 103 bool need_iolock);
104 104
105int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
106 struct xfs_swapext *sx);
107
105xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); 108xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
106 109
107#endif /* __XFS_BMAP_UTIL_H__ */ 110#endif /* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
deleted file mode 100644
index b83d45f8cb12..000000000000
--- a/fs/xfs/xfs_dfrag.c
+++ /dev/null
@@ -1,459 +0,0 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_trans.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h"
26#include "xfs_bmap_btree.h"
27#include "xfs_alloc_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
30#include "xfs_dinode.h"
31#include "xfs_inode.h"
32#include "xfs_inode_item.h"
33#include "xfs_bmap.h"
34#include "xfs_bmap_util.h"
35#include "xfs_itable.h"
36#include "xfs_dfrag.h"
37#include "xfs_error.h"
38#include "xfs_trace.h"
39
40
41static int xfs_swap_extents(
42 xfs_inode_t *ip, /* target inode */
43 xfs_inode_t *tip, /* tmp inode */
44 xfs_swapext_t *sxp);
45
46/*
47 * ioctl interface for swapext
48 */
49int
50xfs_swapext(
51 xfs_swapext_t *sxp)
52{
53 xfs_inode_t *ip, *tip;
54 struct fd f, tmp;
55 int error = 0;
56
57 /* Pull information for the target fd */
58 f = fdget((int)sxp->sx_fdtarget);
59 if (!f.file) {
60 error = XFS_ERROR(EINVAL);
61 goto out;
62 }
63
64 if (!(f.file->f_mode & FMODE_WRITE) ||
65 !(f.file->f_mode & FMODE_READ) ||
66 (f.file->f_flags & O_APPEND)) {
67 error = XFS_ERROR(EBADF);
68 goto out_put_file;
69 }
70
71 tmp = fdget((int)sxp->sx_fdtmp);
72 if (!tmp.file) {
73 error = XFS_ERROR(EINVAL);
74 goto out_put_file;
75 }
76
77 if (!(tmp.file->f_mode & FMODE_WRITE) ||
78 !(tmp.file->f_mode & FMODE_READ) ||
79 (tmp.file->f_flags & O_APPEND)) {
80 error = XFS_ERROR(EBADF);
81 goto out_put_tmp_file;
82 }
83
84 if (IS_SWAPFILE(file_inode(f.file)) ||
85 IS_SWAPFILE(file_inode(tmp.file))) {
86 error = XFS_ERROR(EINVAL);
87 goto out_put_tmp_file;
88 }
89
90 ip = XFS_I(file_inode(f.file));
91 tip = XFS_I(file_inode(tmp.file));
92
93 if (ip->i_mount != tip->i_mount) {
94 error = XFS_ERROR(EINVAL);
95 goto out_put_tmp_file;
96 }
97
98 if (ip->i_ino == tip->i_ino) {
99 error = XFS_ERROR(EINVAL);
100 goto out_put_tmp_file;
101 }
102
103 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
104 error = XFS_ERROR(EIO);
105 goto out_put_tmp_file;
106 }
107
108 error = xfs_swap_extents(ip, tip, sxp);
109
110 out_put_tmp_file:
111 fdput(tmp);
112 out_put_file:
113 fdput(f);
114 out:
115 return error;
116}
117
118/*
119 * We need to check that the format of the data fork in the temporary inode is
120 * valid for the target inode before doing the swap. This is not a problem with
121 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
122 * data fork depending on the space the attribute fork is taking so we can get
123 * invalid formats on the target inode.
124 *
125 * E.g. target has space for 7 extents in extent format, temp inode only has
126 * space for 6. If we defragment down to 7 extents, then the tmp format is a
127 * btree, but when swapped it needs to be in extent format. Hence we can't just
128 * blindly swap data forks on attr2 filesystems.
129 *
130 * Note that we check the swap in both directions so that we don't end up with
131 * a corrupt temporary inode, either.
132 *
133 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
134 * inode will prevent this situation from occurring, so all we do here is
135 * reject and log the attempt. basically we are putting the responsibility on
136 * userspace to get this right.
137 */
138static int
139xfs_swap_extents_check_format(
140 xfs_inode_t *ip, /* target inode */
141 xfs_inode_t *tip) /* tmp inode */
142{
143
144 /* Should never get a local format */
145 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
146 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
147 return EINVAL;
148
149 /*
150 * if the target inode has less extents that then temporary inode then
151 * why did userspace call us?
152 */
153 if (ip->i_d.di_nextents < tip->i_d.di_nextents)
154 return EINVAL;
155
156 /*
157 * if the target inode is in extent form and the temp inode is in btree
158 * form then we will end up with the target inode in the wrong format
159 * as we already know there are less extents in the temp inode.
160 */
161 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
162 tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
163 return EINVAL;
164
165 /* Check temp in extent form to max in target */
166 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
167 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
168 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
169 return EINVAL;
170
171 /* Check target in extent form to max in temp */
172 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
173 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
174 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
175 return EINVAL;
176
177 /*
178 * If we are in a btree format, check that the temp root block will fit
179 * in the target and that it has enough extents to be in btree format
180 * in the target.
181 *
182 * Note that we have to be careful to allow btree->extent conversions
183 * (a common defrag case) which will occur when the temp inode is in
184 * extent format...
185 */
186 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
187 if (XFS_IFORK_BOFF(ip) &&
188 XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
189 return EINVAL;
190 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
191 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
192 return EINVAL;
193 }
194
195 /* Reciprocal target->temp btree format checks */
196 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
197 if (XFS_IFORK_BOFF(tip) &&
198 XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
199 return EINVAL;
200 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
201 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
202 return EINVAL;
203 }
204
205 return 0;
206}
207
208static int
209xfs_swap_extents(
210 xfs_inode_t *ip, /* target inode */
211 xfs_inode_t *tip, /* tmp inode */
212 xfs_swapext_t *sxp)
213{
214 xfs_mount_t *mp = ip->i_mount;
215 xfs_trans_t *tp;
216 xfs_bstat_t *sbp = &sxp->sx_stat;
217 xfs_ifork_t *tempifp, *ifp, *tifp;
218 int src_log_flags, target_log_flags;
219 int error = 0;
220 int aforkblks = 0;
221 int taforkblks = 0;
222 __uint64_t tmp;
223
224 /*
225 * We have no way of updating owner information in the BMBT blocks for
226 * each inode on CRC enabled filesystems, so to avoid corrupting the
227 * this metadata we simply don't allow extent swaps to occur.
228 */
229 if (xfs_sb_version_hascrc(&mp->m_sb))
230 return XFS_ERROR(EINVAL);
231
232 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
233 if (!tempifp) {
234 error = XFS_ERROR(ENOMEM);
235 goto out;
236 }
237
238 /*
239 * we have to do two separate lock calls here to keep lockdep
240 * happy. If we try to get all the locks in one call, lock will
241 * report false positives when we drop the ILOCK and regain them
242 * below.
243 */
244 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
245 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
246
247 /* Verify that both files have the same format */
248 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
249 error = XFS_ERROR(EINVAL);
250 goto out_unlock;
251 }
252
253 /* Verify both files are either real-time or non-realtime */
254 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
255 error = XFS_ERROR(EINVAL);
256 goto out_unlock;
257 }
258
259 error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
260 if (error)
261 goto out_unlock;
262 truncate_pagecache_range(VFS_I(tip), 0, -1);
263
264 /* Verify O_DIRECT for ftmp */
265 if (VN_CACHED(VFS_I(tip)) != 0) {
266 error = XFS_ERROR(EINVAL);
267 goto out_unlock;
268 }
269
270 /* Verify all data are being swapped */
271 if (sxp->sx_offset != 0 ||
272 sxp->sx_length != ip->i_d.di_size ||
273 sxp->sx_length != tip->i_d.di_size) {
274 error = XFS_ERROR(EFAULT);
275 goto out_unlock;
276 }
277
278 trace_xfs_swap_extent_before(ip, 0);
279 trace_xfs_swap_extent_before(tip, 1);
280
281 /* check inode formats now that data is flushed */
282 error = xfs_swap_extents_check_format(ip, tip);
283 if (error) {
284 xfs_notice(mp,
285 "%s: inode 0x%llx format is incompatible for exchanging.",
286 __func__, ip->i_ino);
287 goto out_unlock;
288 }
289
290 /*
291 * Compare the current change & modify times with that
292 * passed in. If they differ, we abort this swap.
293 * This is the mechanism used to ensure the calling
294 * process that the file was not changed out from
295 * under it.
296 */
297 if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
298 (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
299 (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
300 (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
301 error = XFS_ERROR(EBUSY);
302 goto out_unlock;
303 }
304
305 /* We need to fail if the file is memory mapped. Once we have tossed
306 * all existing pages, the page fault will have no option
307 * but to go to the filesystem for pages. By making the page fault call
308 * vop_read (or write in the case of autogrow) they block on the iolock
309 * until we have switched the extents.
310 */
311 if (VN_MAPPED(VFS_I(ip))) {
312 error = XFS_ERROR(EBUSY);
313 goto out_unlock;
314 }
315
316 xfs_iunlock(ip, XFS_ILOCK_EXCL);
317 xfs_iunlock(tip, XFS_ILOCK_EXCL);
318
319 /*
320 * There is a race condition here since we gave up the
321 * ilock. However, the data fork will not change since
322 * we have the iolock (locked for truncation too) so we
323 * are safe. We don't really care if non-io related
324 * fields change.
325 */
326 truncate_pagecache_range(VFS_I(ip), 0, -1);
327
328 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
329 if ((error = xfs_trans_reserve(tp, 0,
330 XFS_ICHANGE_LOG_RES(mp), 0,
331 0, 0))) {
332 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
333 xfs_iunlock(tip, XFS_IOLOCK_EXCL);
334 xfs_trans_cancel(tp, 0);
335 goto out;
336 }
337 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
338
339 /*
340 * Count the number of extended attribute blocks
341 */
342 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
343 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
344 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
345 if (error)
346 goto out_trans_cancel;
347 }
348 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
349 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
350 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
351 &taforkblks);
352 if (error)
353 goto out_trans_cancel;
354 }
355
356 /*
357 * Swap the data forks of the inodes
358 */
359 ifp = &ip->i_df;
360 tifp = &tip->i_df;
361 *tempifp = *ifp; /* struct copy */
362 *ifp = *tifp; /* struct copy */
363 *tifp = *tempifp; /* struct copy */
364
365 /*
366 * Fix the on-disk inode values
367 */
368 tmp = (__uint64_t)ip->i_d.di_nblocks;
369 ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
370 tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
371
372 tmp = (__uint64_t) ip->i_d.di_nextents;
373 ip->i_d.di_nextents = tip->i_d.di_nextents;
374 tip->i_d.di_nextents = tmp;
375
376 tmp = (__uint64_t) ip->i_d.di_format;
377 ip->i_d.di_format = tip->i_d.di_format;
378 tip->i_d.di_format = tmp;
379
380 /*
381 * The extents in the source inode could still contain speculative
382 * preallocation beyond EOF (e.g. the file is open but not modified
383 * while defrag is in progress). In that case, we need to copy over the
384 * number of delalloc blocks the data fork in the source inode is
385 * tracking beyond EOF so that when the fork is truncated away when the
386 * temporary inode is unlinked we don't underrun the i_delayed_blks
387 * counter on that inode.
388 */
389 ASSERT(tip->i_delayed_blks == 0);
390 tip->i_delayed_blks = ip->i_delayed_blks;
391 ip->i_delayed_blks = 0;
392
393 src_log_flags = XFS_ILOG_CORE;
394 switch (ip->i_d.di_format) {
395 case XFS_DINODE_FMT_EXTENTS:
396 /* If the extents fit in the inode, fix the
397 * pointer. Otherwise it's already NULL or
398 * pointing to the extent.
399 */
400 if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
401 ifp->if_u1.if_extents =
402 ifp->if_u2.if_inline_ext;
403 }
404 src_log_flags |= XFS_ILOG_DEXT;
405 break;
406 case XFS_DINODE_FMT_BTREE:
407 src_log_flags |= XFS_ILOG_DBROOT;
408 break;
409 }
410
411 target_log_flags = XFS_ILOG_CORE;
412 switch (tip->i_d.di_format) {
413 case XFS_DINODE_FMT_EXTENTS:
414 /* If the extents fit in the inode, fix the
415 * pointer. Otherwise it's already NULL or
416 * pointing to the extent.
417 */
418 if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
419 tifp->if_u1.if_extents =
420 tifp->if_u2.if_inline_ext;
421 }
422 target_log_flags |= XFS_ILOG_DEXT;
423 break;
424 case XFS_DINODE_FMT_BTREE:
425 target_log_flags |= XFS_ILOG_DBROOT;
426 break;
427 }
428
429
430 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
431 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
432
433 xfs_trans_log_inode(tp, ip, src_log_flags);
434 xfs_trans_log_inode(tp, tip, target_log_flags);
435
436 /*
437 * If this is a synchronous mount, make sure that the
438 * transaction goes to disk before returning to the user.
439 */
440 if (mp->m_flags & XFS_MOUNT_WSYNC)
441 xfs_trans_set_sync(tp);
442
443 error = xfs_trans_commit(tp, 0);
444
445 trace_xfs_swap_extent_after(ip, 0);
446 trace_xfs_swap_extent_after(tip, 1);
447out:
448 kmem_free(tempifp);
449 return error;
450
451out_unlock:
452 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
453 xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
454 goto out;
455
456out_trans_cancel:
457 xfs_trans_cancel(tp, 0);
458 goto out_unlock;
459}
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
deleted file mode 100644
index 20bdd935c121..000000000000
--- a/fs/xfs/xfs_dfrag.h
+++ /dev/null
@@ -1,53 +0,0 @@
1/*
2 * Copyright (c) 2000,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_DFRAG_H__
19#define __XFS_DFRAG_H__
20
21/*
22 * Structure passed to xfs_swapext
23 */
24
25typedef struct xfs_swapext
26{
27 __int64_t sx_version; /* version */
28 __int64_t sx_fdtarget; /* fd of target file */
29 __int64_t sx_fdtmp; /* fd of tmp file */
30 xfs_off_t sx_offset; /* offset into file */
31 xfs_off_t sx_length; /* leng from offset */
32 char sx_pad[16]; /* pad space, unused */
33 xfs_bstat_t sx_stat; /* stat of target b4 copy */
34} xfs_swapext_t;
35
36/*
37 * Version flag
38 */
39#define XFS_SX_VERSION 0
40
41#ifdef __KERNEL__
42/*
43 * Prototypes for visible xfs_dfrag.c routines.
44 */
45
46/*
47 * Syscall interface for xfs_swapext
48 */
49int xfs_swapext(struct xfs_swapext *sx);
50
51#endif /* __KERNEL__ */
52
53#endif /* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 68c2e18f7e07..74b24b2ecd07 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -461,6 +461,21 @@ typedef struct xfs_handle {
461 + (handle).ha_fid.fid_len) 461 + (handle).ha_fid.fid_len)
462 462
463/* 463/*
464 * Structure passed to XFS_IOC_SWAPEXT
465 */
466typedef struct xfs_swapext
467{
468 __int64_t sx_version; /* structure version, see XFS_SX_VERSION */
469#define XFS_SX_VERSION 0
470 __int64_t sx_fdtarget; /* fd of target file */
471 __int64_t sx_fdtmp; /* fd of tmp file */
472 xfs_off_t sx_offset; /* offset into file; xfs_swap_extents requires 0 */
473 xfs_off_t sx_length; /* length from offset; must equal the size of both files */
474 char sx_pad[16]; /* pad space, unused */
475 xfs_bstat_t sx_stat; /* stat of target before copy; ctime/mtime are compared at swap time */
476} xfs_swapext_t;
477
478/*
464 * Flags for going down operation 479 * Flags for going down operation
465 */ 480 */
466#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ 481#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 557c7b8b2425..efb216de5f69 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -35,7 +35,6 @@
35#include "xfs_bmap.h" 35#include "xfs_bmap.h"
36#include "xfs_bmap_util.h" 36#include "xfs_bmap_util.h"
37#include "xfs_buf_item.h" 37#include "xfs_buf_item.h"
38#include "xfs_dfrag.h"
39#include "xfs_fsops.h" 38#include "xfs_fsops.h"
40#include "xfs_discard.h" 39#include "xfs_discard.h"
41#include "xfs_quota.h" 40#include "xfs_quota.h"
@@ -1363,6 +1362,75 @@ xfs_ioc_getbmapx(
1363 return 0; 1362 return 0;
1364} 1363}
1365 1364
1365int
1366xfs_ioc_swapext(
1367 xfs_swapext_t *sxp)
1368{
1369 xfs_inode_t *ip, *tip;
1370 struct fd f, tmp;
1371 int error = 0;
1372
1373 /* Pull information for the target fd */
1374 f = fdget((int)sxp->sx_fdtarget);
1375 if (!f.file) {
1376 error = XFS_ERROR(EINVAL);
1377 goto out;
1378 }
1379
1380 if (!(f.file->f_mode & FMODE_WRITE) ||
1381 !(f.file->f_mode & FMODE_READ) ||
1382 (f.file->f_flags & O_APPEND)) {
1383 error = XFS_ERROR(EBADF);
1384 goto out_put_file;
1385 }
1386
1387 tmp = fdget((int)sxp->sx_fdtmp);
1388 if (!tmp.file) {
1389 error = XFS_ERROR(EINVAL);
1390 goto out_put_file;
1391 }
1392
1393 if (!(tmp.file->f_mode & FMODE_WRITE) ||
1394 !(tmp.file->f_mode & FMODE_READ) ||
1395 (tmp.file->f_flags & O_APPEND)) {
1396 error = XFS_ERROR(EBADF);
1397 goto out_put_tmp_file;
1398 }
1399
1400 if (IS_SWAPFILE(file_inode(f.file)) ||
1401 IS_SWAPFILE(file_inode(tmp.file))) {
1402 error = XFS_ERROR(EINVAL);
1403 goto out_put_tmp_file;
1404 }
1405
1406 ip = XFS_I(file_inode(f.file));
1407 tip = XFS_I(file_inode(tmp.file));
1408
1409 if (ip->i_mount != tip->i_mount) {
1410 error = XFS_ERROR(EINVAL);
1411 goto out_put_tmp_file;
1412 }
1413
1414 if (ip->i_ino == tip->i_ino) {
1415 error = XFS_ERROR(EINVAL);
1416 goto out_put_tmp_file;
1417 }
1418
1419 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1420 error = XFS_ERROR(EIO);
1421 goto out_put_tmp_file;
1422 }
1423
1424 error = xfs_swap_extents(ip, tip, sxp);
1425
1426 out_put_tmp_file:
1427 fdput(tmp);
1428 out_put_file:
1429 fdput(f);
1430 out:
1431 return error;
1432}
1433
1366/* 1434/*
1367 * Note: some of the ioctl's return positive numbers as a 1435 * Note: some of the ioctl's return positive numbers as a
1368 * byte count indicating success, such as readlink_by_handle. 1436 * byte count indicating success, such as readlink_by_handle.
@@ -1507,7 +1575,7 @@ xfs_file_ioctl(
1507 error = mnt_want_write_file(filp); 1575 error = mnt_want_write_file(filp);
1508 if (error) 1576 if (error)
1509 return error; 1577 return error;
1510 error = xfs_swapext(&sxp); 1578 error = xfs_ioc_swapext(&sxp);
1511 mnt_drop_write_file(filp); 1579 mnt_drop_write_file(filp);
1512 return -error; 1580 return -error;
1513 } 1581 }
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 1233dee4fef0..77c02c7900b6 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -27,6 +27,10 @@ xfs_ioc_space(
27 unsigned int cmd, 27 unsigned int cmd,
28 xfs_flock64_t *bf); 28 xfs_flock64_t *bf);
29 29
30int
31xfs_ioc_swapext(
32 xfs_swapext_t *sxp);
33
30extern int 34extern int
31xfs_find_handle( 35xfs_find_handle(
32 unsigned int cmd, 36 unsigned int cmd,
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 55a3072e7f56..d3ab9534307f 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -33,7 +33,6 @@
33#include "xfs_inode.h" 33#include "xfs_inode.h"
34#include "xfs_itable.h" 34#include "xfs_itable.h"
35#include "xfs_error.h" 35#include "xfs_error.h"
36#include "xfs_dfrag.h"
37#include "xfs_fsops.h" 36#include "xfs_fsops.h"
38#include "xfs_alloc.h" 37#include "xfs_alloc.h"
39#include "xfs_rtalloc.h" 38#include "xfs_rtalloc.h"
@@ -643,7 +642,7 @@ xfs_file_compat_ioctl(
643 error = mnt_want_write_file(filp); 642 error = mnt_want_write_file(filp);
644 if (error) 643 if (error)
645 return error; 644 return error;
646 error = xfs_swapext(&sxp); 645 error = xfs_ioc_swapext(&sxp);
647 mnt_drop_write_file(filp); 646 mnt_drop_write_file(filp);
648 return -error; 647 return -error;
649 } 648 }