aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-09-09 14:19:09 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-09-09 14:19:09 -0400
commit300893b08f3bc7057a7a5f84074090ba66c8b5ca (patch)
tree5fc5aef0b9dbab8e47e161303d57e631786c7d17 /fs/xfs
parent45150c43b1b0c16e665fd0a5cdcca128b8192db1 (diff)
parent1d03c6fa88af35e55047a1f2ab116f0fdf2f55aa (diff)
Merge tag 'xfs-for-linus-v3.12-rc1' of git://oss.sgi.com/xfs/xfs
Pull xfs updates from Ben Myers: "For 3.12-rc1 there are a number of bugfixes in addition to work to ease usage of shared code between libxfs and the kernel, the rest of the work to enable project and group quotas to be used simultaneously, performance optimisations in the log and the CIL, directory entry file type support, fixes for log space reservations, some spelling/grammar cleanups, and the addition of user namespace support. - introduce readahead to log recovery - add directory entry file type support - fix a number of spelling errors in comments - introduce new Q_XGETQSTATV quotactl for project quotas - add USER_NS support - log space reservation rework - CIL optimisations - kernel/userspace libxfs rework" * tag 'xfs-for-linus-v3.12-rc1' of git://oss.sgi.com/xfs/xfs: (112 commits) xfs: XFS_MOUNT_QUOTA_ALL needed by userspace xfs: dtype changed xfs_dir2_sfe_put_ino to xfs_dir3_sfe_put_ino Fix wrong flag ASSERT in xfs_attr_shortform_getvalue xfs: finish removing IOP_* macros. xfs: inode log reservations are too small xfs: check correct status variable for xfs_inobt_get_rec() call xfs: inode buffers may not be valid during recovery readahead xfs: check LSN ordering for v5 superblocks during recovery xfs: btree block LSN escaping to disk uninitialised XFS: Assertion failed: first <= last && last < BBTOB(bp->b_length), file: fs/xfs/xfs_trans_buf.c, line: 568 xfs: fix bad dquot buffer size in log recovery readahead xfs: don't account buffer cancellation during log recovery readahead xfs: check for underflow in xfs_iformat_fork() xfs: xfs_dir3_sfe_put_ino can be static xfs: introduce object readahead to log recovery xfs: Simplify xfs_ail_min() with list_first_entry_or_null() xfs: Register hotcpu notifier after initialization xfs: add xfs sb v4 support for dirent filetype field xfs: Add write support for dirent filetype field xfs: Add read-only support for dirent filetype field ...
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/Makefile20
-rw-r--r--fs/xfs/xfs_acl.c24
-rw-r--r--fs/xfs/xfs_ag.h53
-rw-r--r--fs/xfs/xfs_alloc.c6
-rw-r--r--fs/xfs/xfs_aops.c23
-rw-r--r--fs/xfs/xfs_attr.c427
-rw-r--r--fs/xfs/xfs_attr.h9
-rw-r--r--fs/xfs/xfs_attr_inactive.c453
-rw-r--r--fs/xfs/xfs_attr_leaf.c657
-rw-r--r--fs/xfs/xfs_attr_leaf.h2
-rw-r--r--fs/xfs/xfs_attr_list.c655
-rw-r--r--fs/xfs/xfs_attr_remote.c18
-rw-r--r--fs/xfs/xfs_bmap.c823
-rw-r--r--fs/xfs/xfs_bmap.h56
-rw-r--r--fs/xfs/xfs_bmap_btree.c6
-rw-r--r--fs/xfs/xfs_bmap_util.c2026
-rw-r--r--fs/xfs/xfs_bmap_util.h110
-rw-r--r--fs/xfs/xfs_btree.c7
-rw-r--r--fs/xfs/xfs_btree.h2
-rw-r--r--fs/xfs/xfs_buf.c5
-rw-r--r--fs/xfs/xfs_buf_item.c58
-rw-r--r--fs/xfs/xfs_buf_item.h100
-rw-r--r--fs/xfs/xfs_da_btree.c8
-rw-r--r--fs/xfs/xfs_da_btree.h12
-rw-r--r--fs/xfs/xfs_dfrag.c459
-rw-r--r--fs/xfs/xfs_dfrag.h53
-rw-r--r--fs/xfs/xfs_dir2.c58
-rw-r--r--fs/xfs/xfs_dir2.h46
-rw-r--r--fs/xfs/xfs_dir2_block.c122
-rw-r--r--fs/xfs/xfs_dir2_data.c25
-rw-r--r--fs/xfs/xfs_dir2_format.h186
-rw-r--r--fs/xfs/xfs_dir2_leaf.c404
-rw-r--r--fs/xfs/xfs_dir2_node.c14
-rw-r--r--fs/xfs/xfs_dir2_priv.h49
-rw-r--r--fs/xfs/xfs_dir2_readdir.c695
-rw-r--r--fs/xfs/xfs_dir2_sf.c240
-rw-r--r--fs/xfs/xfs_discard.c5
-rw-r--r--fs/xfs/xfs_dquot.c8
-rw-r--r--fs/xfs/xfs_dquot_item.c23
-rw-r--r--fs/xfs/xfs_error.c1
-rw-r--r--fs/xfs/xfs_export.c5
-rw-r--r--fs/xfs/xfs_extent_busy.c2
-rw-r--r--fs/xfs/xfs_extfree_item.c50
-rw-r--r--fs/xfs/xfs_extfree_item.h88
-rw-r--r--fs/xfs/xfs_file.c3
-rw-r--r--fs/xfs/xfs_filestream.c8
-rw-r--r--fs/xfs/xfs_filestream.h4
-rw-r--r--fs/xfs/xfs_format.h169
-rw-r--r--fs/xfs/xfs_fs.h38
-rw-r--r--fs/xfs/xfs_fsops.c8
-rw-r--r--fs/xfs/xfs_ialloc.c7
-rw-r--r--fs/xfs/xfs_icache.c15
-rw-r--r--fs/xfs/xfs_icache.h50
-rw-r--r--fs/xfs/xfs_icreate_item.c21
-rw-r--r--fs/xfs/xfs_icreate_item.h18
-rw-r--r--fs/xfs/xfs_inode.c3749
-rw-r--r--fs/xfs/xfs_inode.h312
-rw-r--r--fs/xfs/xfs_inode_buf.c483
-rw-r--r--fs/xfs/xfs_inode_buf.h53
-rw-r--r--fs/xfs/xfs_inode_fork.c1920
-rw-r--r--fs/xfs/xfs_inode_fork.h171
-rw-r--r--fs/xfs/xfs_inode_item.c53
-rw-r--r--fs/xfs/xfs_inode_item.h115
-rw-r--r--fs/xfs/xfs_ioctl.c148
-rw-r--r--fs/xfs/xfs_ioctl.h10
-rw-r--r--fs/xfs/xfs_ioctl32.c4
-rw-r--r--fs/xfs/xfs_iomap.c21
-rw-r--r--fs/xfs/xfs_iops.c78
-rw-r--r--fs/xfs/xfs_iops.h13
-rw-r--r--fs/xfs/xfs_linux.h60
-rw-r--r--fs/xfs/xfs_log.c113
-rw-r--r--fs/xfs/xfs_log.h90
-rw-r--r--fs/xfs/xfs_log_cil.c371
-rw-r--r--fs/xfs/xfs_log_format.h852
-rw-r--r--fs/xfs/xfs_log_priv.h155
-rw-r--r--fs/xfs/xfs_log_recover.c407
-rw-r--r--fs/xfs/xfs_log_rlimit.c147
-rw-r--r--fs/xfs/xfs_mount.c755
-rw-r--r--fs/xfs/xfs_mount.h113
-rw-r--r--fs/xfs/xfs_qm.c95
-rw-r--r--fs/xfs/xfs_qm.h2
-rw-r--r--fs/xfs/xfs_qm_bhv.c1
-rw-r--r--fs/xfs/xfs_qm_syscalls.c126
-rw-r--r--fs/xfs/xfs_quota.h278
-rw-r--r--fs/xfs/xfs_quota_defs.h157
-rw-r--r--fs/xfs/xfs_quotaops.c17
-rw-r--r--fs/xfs/xfs_rename.c346
-rw-r--r--fs/xfs/xfs_rtalloc.c28
-rw-r--r--fs/xfs/xfs_rtalloc.h53
-rw-r--r--fs/xfs/xfs_sb.c834
-rw-r--r--fs/xfs/xfs_sb.h72
-rw-r--r--fs/xfs/xfs_super.c31
-rw-r--r--fs/xfs/xfs_symlink.c196
-rw-r--r--fs/xfs/xfs_symlink.h41
-rw-r--r--fs/xfs/xfs_symlink_remote.c200
-rw-r--r--fs/xfs/xfs_trace.c1
-rw-r--r--fs/xfs/xfs_trans.c732
-rw-r--r--fs/xfs/xfs_trans.h301
-rw-r--r--fs/xfs/xfs_trans_ail.c18
-rw-r--r--fs/xfs/xfs_trans_buf.c2
-rw-r--r--fs/xfs/xfs_trans_dquot.c1
-rw-r--r--fs/xfs/xfs_trans_priv.h15
-rw-r--r--fs/xfs/xfs_trans_resv.c803
-rw-r--r--fs/xfs/xfs_trans_resv.h116
-rw-r--r--fs/xfs/xfs_types.h60
-rw-r--r--fs/xfs/xfs_utils.c314
-rw-r--r--fs/xfs/xfs_utils.h27
-rw-r--r--fs/xfs/xfs_vnodeops.c1870
-rw-r--r--fs/xfs/xfs_vnodeops.h55
-rw-r--r--fs/xfs/xfs_xattr.c2
110 files changed, 13527 insertions, 11894 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 4a4508023a3c..0719e4db93f2 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -27,9 +27,12 @@ xfs-y += xfs_trace.o
27 27
28# highlevel code 28# highlevel code
29xfs-y += xfs_aops.o \ 29xfs-y += xfs_aops.o \
30 xfs_attr_inactive.o \
31 xfs_attr_list.o \
30 xfs_bit.o \ 32 xfs_bit.o \
33 xfs_bmap_util.o \
31 xfs_buf.o \ 34 xfs_buf.o \
32 xfs_dfrag.o \ 35 xfs_dir2_readdir.o \
33 xfs_discard.o \ 36 xfs_discard.o \
34 xfs_error.o \ 37 xfs_error.o \
35 xfs_export.o \ 38 xfs_export.o \
@@ -44,11 +47,11 @@ xfs-y += xfs_aops.o \
44 xfs_iops.o \ 47 xfs_iops.o \
45 xfs_itable.o \ 48 xfs_itable.o \
46 xfs_message.o \ 49 xfs_message.o \
50 xfs_mount.o \
47 xfs_mru_cache.o \ 51 xfs_mru_cache.o \
48 xfs_rename.o \
49 xfs_super.o \ 52 xfs_super.o \
50 xfs_utils.o \ 53 xfs_symlink.o \
51 xfs_vnodeops.o \ 54 xfs_trans.o \
52 xfs_xattr.o \ 55 xfs_xattr.o \
53 kmem.o \ 56 kmem.o \
54 uuid.o 57 uuid.o
@@ -73,10 +76,13 @@ xfs-y += xfs_alloc.o \
73 xfs_ialloc_btree.o \ 76 xfs_ialloc_btree.o \
74 xfs_icreate_item.o \ 77 xfs_icreate_item.o \
75 xfs_inode.o \ 78 xfs_inode.o \
79 xfs_inode_fork.o \
80 xfs_inode_buf.o \
76 xfs_log_recover.o \ 81 xfs_log_recover.o \
77 xfs_mount.o \ 82 xfs_log_rlimit.o \
78 xfs_symlink.o \ 83 xfs_sb.o \
79 xfs_trans.o 84 xfs_symlink_remote.o \
85 xfs_trans_resv.o
80 86
81# low-level transaction/log code 87# low-level transaction/log code
82xfs-y += xfs_log.o \ 88xfs-y += xfs_log.o \
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 306d883d89bc..69518960b2ba 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -16,11 +16,13 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_log_format.h"
20#include "xfs_trans_resv.h"
19#include "xfs_acl.h" 21#include "xfs_acl.h"
20#include "xfs_attr.h" 22#include "xfs_attr.h"
21#include "xfs_bmap_btree.h" 23#include "xfs_bmap_btree.h"
22#include "xfs_inode.h" 24#include "xfs_inode.h"
23#include "xfs_vnodeops.h" 25#include "xfs_ag.h"
24#include "xfs_sb.h" 26#include "xfs_sb.h"
25#include "xfs_mount.h" 27#include "xfs_mount.h"
26#include "xfs_trace.h" 28#include "xfs_trace.h"
@@ -68,14 +70,15 @@ xfs_acl_from_disk(
68 70
69 switch (acl_e->e_tag) { 71 switch (acl_e->e_tag) {
70 case ACL_USER: 72 case ACL_USER:
73 acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
74 break;
71 case ACL_GROUP: 75 case ACL_GROUP:
72 acl_e->e_id = be32_to_cpu(ace->ae_id); 76 acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));
73 break; 77 break;
74 case ACL_USER_OBJ: 78 case ACL_USER_OBJ:
75 case ACL_GROUP_OBJ: 79 case ACL_GROUP_OBJ:
76 case ACL_MASK: 80 case ACL_MASK:
77 case ACL_OTHER: 81 case ACL_OTHER:
78 acl_e->e_id = ACL_UNDEFINED_ID;
79 break; 82 break;
80 default: 83 default:
81 goto fail; 84 goto fail;
@@ -101,7 +104,18 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
101 acl_e = &acl->a_entries[i]; 104 acl_e = &acl->a_entries[i];
102 105
103 ace->ae_tag = cpu_to_be32(acl_e->e_tag); 106 ace->ae_tag = cpu_to_be32(acl_e->e_tag);
104 ace->ae_id = cpu_to_be32(acl_e->e_id); 107 switch (acl_e->e_tag) {
108 case ACL_USER:
109 ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid));
110 break;
111 case ACL_GROUP:
112 ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid));
113 break;
114 default:
115 ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
116 break;
117 }
118
105 ace->ae_perm = cpu_to_be16(acl_e->e_perm); 119 ace->ae_perm = cpu_to_be16(acl_e->e_perm);
106 } 120 }
107} 121}
@@ -360,7 +374,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
360 return -EINVAL; 374 return -EINVAL;
361 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) 375 if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
362 return value ? -EACCES : 0; 376 return value ? -EACCES : 0;
363 if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER)) 377 if (!inode_owner_or_capable(inode))
364 return -EPERM; 378 return -EPERM;
365 379
366 if (!value) 380 if (!value)
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 317aa86d96ea..1cb740afd674 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,59 +227,6 @@ typedef struct xfs_agfl {
227} xfs_agfl_t; 227} xfs_agfl_t;
228 228
229/* 229/*
230 * Per-ag incore structure, copies of information in agf and agi,
231 * to improve the performance of allocation group selection.
232 */
233#define XFS_PAGB_NUM_SLOTS 128
234
235typedef struct xfs_perag {
236 struct xfs_mount *pag_mount; /* owner filesystem */
237 xfs_agnumber_t pag_agno; /* AG this structure belongs to */
238 atomic_t pag_ref; /* perag reference count */
239 char pagf_init; /* this agf's entry is initialized */
240 char pagi_init; /* this agi's entry is initialized */
241 char pagf_metadata; /* the agf is preferred to be metadata */
242 char pagi_inodeok; /* The agi is ok for inodes */
243 __uint8_t pagf_levels[XFS_BTNUM_AGF];
244 /* # of levels in bno & cnt btree */
245 __uint32_t pagf_flcount; /* count of blocks in freelist */
246 xfs_extlen_t pagf_freeblks; /* total free blocks */
247 xfs_extlen_t pagf_longest; /* longest free space */
248 __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
249 xfs_agino_t pagi_freecount; /* number of free inodes */
250 xfs_agino_t pagi_count; /* number of allocated inodes */
251
252 /*
253 * Inode allocation search lookup optimisation.
254 * If the pagino matches, the search for new inodes
255 * doesn't need to search the near ones again straight away
256 */
257 xfs_agino_t pagl_pagino;
258 xfs_agino_t pagl_leftrec;
259 xfs_agino_t pagl_rightrec;
260#ifdef __KERNEL__
261 spinlock_t pagb_lock; /* lock for pagb_tree */
262 struct rb_root pagb_tree; /* ordered tree of busy extents */
263
264 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
265
266 spinlock_t pag_ici_lock; /* incore inode cache lock */
267 struct radix_tree_root pag_ici_root; /* incore inode cache root */
268 int pag_ici_reclaimable; /* reclaimable inodes */
269 struct mutex pag_ici_reclaim_lock; /* serialisation point */
270 unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
271
272 /* buffer cache index */
273 spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
274 struct rb_root pag_buf_tree; /* ordered tree of active buffers */
275
276 /* for rcu-safe freeing */
277 struct rcu_head rcu_head;
278#endif
279 int pagb_count; /* pagb slots in use */
280} xfs_perag_t;
281
282/*
283 * tags for inode radix tree 230 * tags for inode radix tree
284 */ 231 */
285#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup 232#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 71596e57283a..5a1393f5e020 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -878,7 +878,7 @@ xfs_alloc_ag_vextent_near(
878 xfs_agblock_t ltnew; /* useful start bno of left side */ 878 xfs_agblock_t ltnew; /* useful start bno of left side */
879 xfs_extlen_t rlen; /* length of returned extent */ 879 xfs_extlen_t rlen; /* length of returned extent */
880 int forced = 0; 880 int forced = 0;
881#if defined(DEBUG) && defined(__KERNEL__) 881#ifdef DEBUG
882 /* 882 /*
883 * Randomly don't execute the first algorithm. 883 * Randomly don't execute the first algorithm.
884 */ 884 */
@@ -938,8 +938,8 @@ restart:
938 xfs_extlen_t blen=0; 938 xfs_extlen_t blen=0;
939 xfs_agblock_t bnew=0; 939 xfs_agblock_t bnew=0;
940 940
941#if defined(DEBUG) && defined(__KERNEL__) 941#ifdef DEBUG
942 if (!dofirst) 942 if (dofirst)
943 break; 943 break;
944#endif 944#endif
945 /* 945 /*
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e11d654af786..977da0ec6604 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -28,9 +28,9 @@
28#include "xfs_alloc.h" 28#include "xfs_alloc.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30#include "xfs_iomap.h" 30#include "xfs_iomap.h"
31#include "xfs_vnodeops.h"
32#include "xfs_trace.h" 31#include "xfs_trace.h"
33#include "xfs_bmap.h" 32#include "xfs_bmap.h"
33#include "xfs_bmap_util.h"
34#include <linux/aio.h> 34#include <linux/aio.h>
35#include <linux/gfp.h> 35#include <linux/gfp.h>
36#include <linux/mpage.h> 36#include <linux/mpage.h>
@@ -108,7 +108,7 @@ xfs_setfilesize_trans_alloc(
108 108
109 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); 109 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
110 110
111 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); 111 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
112 if (error) { 112 if (error) {
113 xfs_trans_cancel(tp, 0); 113 xfs_trans_cancel(tp, 0);
114 return error; 114 return error;
@@ -440,7 +440,7 @@ xfs_start_page_writeback(
440 end_page_writeback(page); 440 end_page_writeback(page);
441} 441}
442 442
443static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) 443static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
444{ 444{
445 return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); 445 return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
446} 446}
@@ -514,7 +514,7 @@ xfs_submit_ioend(
514 goto retry; 514 goto retry;
515 } 515 }
516 516
517 if (bio_add_buffer(bio, bh) != bh->b_size) { 517 if (xfs_bio_add_buffer(bio, bh) != bh->b_size) {
518 xfs_submit_ioend_bio(wbc, ioend, bio); 518 xfs_submit_ioend_bio(wbc, ioend, bio);
519 goto retry; 519 goto retry;
520 } 520 }
@@ -1498,13 +1498,26 @@ xfs_vm_write_failed(
1498 loff_t pos, 1498 loff_t pos,
1499 unsigned len) 1499 unsigned len)
1500{ 1500{
1501 loff_t block_offset = pos & PAGE_MASK; 1501 loff_t block_offset;
1502 loff_t block_start; 1502 loff_t block_start;
1503 loff_t block_end; 1503 loff_t block_end;
1504 loff_t from = pos & (PAGE_CACHE_SIZE - 1); 1504 loff_t from = pos & (PAGE_CACHE_SIZE - 1);
1505 loff_t to = from + len; 1505 loff_t to = from + len;
1506 struct buffer_head *bh, *head; 1506 struct buffer_head *bh, *head;
1507 1507
1508 /*
1509 * The request pos offset might be 32 or 64 bit, this is all fine
1510 * on 64-bit platform. However, for 64-bit pos request on 32-bit
1511 * platform, the high 32-bit will be masked off if we evaluate the
1512 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
1513 * 0xfffff000 as an unsigned long, hence the result is incorrect
1514 * which could cause the following ASSERT failed in most cases.
1515 * In order to avoid this, we can evaluate the block_offset of the
1516 * start of the page by using shifts rather than masks the mismatch
1517 * problem.
1518 */
1519 block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
1520
1508 ASSERT(block_offset + from == pos); 1521 ASSERT(block_offset + from == pos);
1509 1522
1510 head = page_buffers(page); 1523 head = page_buffers(page);
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 20fe3fe9d341..ddcf2267ffa6 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -17,10 +17,11 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_format.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_trans_priv.h"
24#include "xfs_sb.h" 25#include "xfs_sb.h"
25#include "xfs_ag.h" 26#include "xfs_ag.h"
26#include "xfs_mount.h" 27#include "xfs_mount.h"
@@ -32,13 +33,13 @@
32#include "xfs_alloc.h" 33#include "xfs_alloc.h"
33#include "xfs_inode_item.h" 34#include "xfs_inode_item.h"
34#include "xfs_bmap.h" 35#include "xfs_bmap.h"
36#include "xfs_bmap_util.h"
35#include "xfs_attr.h" 37#include "xfs_attr.h"
36#include "xfs_attr_leaf.h" 38#include "xfs_attr_leaf.h"
37#include "xfs_attr_remote.h" 39#include "xfs_attr_remote.h"
38#include "xfs_error.h" 40#include "xfs_error.h"
39#include "xfs_quota.h" 41#include "xfs_quota.h"
40#include "xfs_trans_space.h" 42#include "xfs_trans_space.h"
41#include "xfs_vnodeops.h"
42#include "xfs_trace.h" 43#include "xfs_trace.h"
43 44
44/* 45/*
@@ -62,7 +63,6 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args);
62STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); 63STATIC int xfs_attr_leaf_get(xfs_da_args_t *args);
63STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); 64STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args);
64STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); 65STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args);
65STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
66 66
67/* 67/*
68 * Internal routines when attribute list is more than one block. 68 * Internal routines when attribute list is more than one block.
@@ -70,7 +70,6 @@ STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context);
70STATIC int xfs_attr_node_get(xfs_da_args_t *args); 70STATIC int xfs_attr_node_get(xfs_da_args_t *args);
71STATIC int xfs_attr_node_addname(xfs_da_args_t *args); 71STATIC int xfs_attr_node_addname(xfs_da_args_t *args);
72STATIC int xfs_attr_node_removename(xfs_da_args_t *args); 72STATIC int xfs_attr_node_removename(xfs_da_args_t *args);
73STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context);
74STATIC int xfs_attr_fillstate(xfs_da_state_t *state); 73STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
75STATIC int xfs_attr_refillstate(xfs_da_state_t *state); 74STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
76 75
@@ -90,7 +89,7 @@ xfs_attr_name_to_xname(
90 return 0; 89 return 0;
91} 90}
92 91
93STATIC int 92int
94xfs_inode_hasattr( 93xfs_inode_hasattr(
95 struct xfs_inode *ip) 94 struct xfs_inode *ip)
96{ 95{
@@ -227,13 +226,14 @@ xfs_attr_set_int(
227 int valuelen, 226 int valuelen,
228 int flags) 227 int flags)
229{ 228{
230 xfs_da_args_t args; 229 xfs_da_args_t args;
231 xfs_fsblock_t firstblock; 230 xfs_fsblock_t firstblock;
232 xfs_bmap_free_t flist; 231 xfs_bmap_free_t flist;
233 int error, err2, committed; 232 int error, err2, committed;
234 xfs_mount_t *mp = dp->i_mount; 233 struct xfs_mount *mp = dp->i_mount;
235 int rsvd = (flags & ATTR_ROOT) != 0; 234 struct xfs_trans_res tres;
236 int local; 235 int rsvd = (flags & ATTR_ROOT) != 0;
236 int local;
237 237
238 /* 238 /*
239 * Attach the dquots to the inode. 239 * Attach the dquots to the inode.
@@ -293,11 +293,11 @@ xfs_attr_set_int(
293 if (rsvd) 293 if (rsvd)
294 args.trans->t_flags |= XFS_TRANS_RESERVE; 294 args.trans->t_flags |= XFS_TRANS_RESERVE;
295 295
296 error = xfs_trans_reserve(args.trans, args.total, 296 tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
297 XFS_ATTRSETM_LOG_RES(mp) + 297 M_RES(mp)->tr_attrsetrt.tr_logres * args.total;
298 XFS_ATTRSETRT_LOG_RES(mp) * args.total, 298 tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
299 0, XFS_TRANS_PERM_LOG_RES, 299 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
300 XFS_ATTRSET_LOG_COUNT); 300 error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
301 if (error) { 301 if (error) {
302 xfs_trans_cancel(args.trans, 0); 302 xfs_trans_cancel(args.trans, 0);
303 return(error); 303 return(error);
@@ -517,11 +517,9 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
517 if (flags & ATTR_ROOT) 517 if (flags & ATTR_ROOT)
518 args.trans->t_flags |= XFS_TRANS_RESERVE; 518 args.trans->t_flags |= XFS_TRANS_RESERVE;
519 519
520 if ((error = xfs_trans_reserve(args.trans, 520 error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm,
521 XFS_ATTRRM_SPACE_RES(mp), 521 XFS_ATTRRM_SPACE_RES(mp), 0);
522 XFS_ATTRRM_LOG_RES(mp), 522 if (error) {
523 0, XFS_TRANS_PERM_LOG_RES,
524 XFS_ATTRRM_LOG_COUNT))) {
525 xfs_trans_cancel(args.trans, 0); 523 xfs_trans_cancel(args.trans, 0);
526 return(error); 524 return(error);
527 } 525 }
@@ -611,228 +609,6 @@ xfs_attr_remove(
611 return xfs_attr_remove_int(dp, &xname, flags); 609 return xfs_attr_remove_int(dp, &xname, flags);
612} 610}
613 611
614int
615xfs_attr_list_int(xfs_attr_list_context_t *context)
616{
617 int error;
618 xfs_inode_t *dp = context->dp;
619
620 XFS_STATS_INC(xs_attr_list);
621
622 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
623 return EIO;
624
625 xfs_ilock(dp, XFS_ILOCK_SHARED);
626
627 /*
628 * Decide on what work routines to call based on the inode size.
629 */
630 if (!xfs_inode_hasattr(dp)) {
631 error = 0;
632 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
633 error = xfs_attr_shortform_list(context);
634 } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
635 error = xfs_attr_leaf_list(context);
636 } else {
637 error = xfs_attr_node_list(context);
638 }
639
640 xfs_iunlock(dp, XFS_ILOCK_SHARED);
641
642 return error;
643}
644
645#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
646 (((struct attrlist_ent *) 0)->a_name - (char *) 0)
647#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
648 ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
649 & ~(sizeof(u_int32_t)-1))
650
651/*
652 * Format an attribute and copy it out to the user's buffer.
653 * Take care to check values and protect against them changing later,
654 * we may be reading them directly out of a user buffer.
655 */
656/*ARGSUSED*/
657STATIC int
658xfs_attr_put_listent(
659 xfs_attr_list_context_t *context,
660 int flags,
661 unsigned char *name,
662 int namelen,
663 int valuelen,
664 unsigned char *value)
665{
666 struct attrlist *alist = (struct attrlist *)context->alist;
667 attrlist_ent_t *aep;
668 int arraytop;
669
670 ASSERT(!(context->flags & ATTR_KERNOVAL));
671 ASSERT(context->count >= 0);
672 ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
673 ASSERT(context->firstu >= sizeof(*alist));
674 ASSERT(context->firstu <= context->bufsize);
675
676 /*
677 * Only list entries in the right namespace.
678 */
679 if (((context->flags & ATTR_SECURE) == 0) !=
680 ((flags & XFS_ATTR_SECURE) == 0))
681 return 0;
682 if (((context->flags & ATTR_ROOT) == 0) !=
683 ((flags & XFS_ATTR_ROOT) == 0))
684 return 0;
685
686 arraytop = sizeof(*alist) +
687 context->count * sizeof(alist->al_offset[0]);
688 context->firstu -= ATTR_ENTSIZE(namelen);
689 if (context->firstu < arraytop) {
690 trace_xfs_attr_list_full(context);
691 alist->al_more = 1;
692 context->seen_enough = 1;
693 return 1;
694 }
695
696 aep = (attrlist_ent_t *)&context->alist[context->firstu];
697 aep->a_valuelen = valuelen;
698 memcpy(aep->a_name, name, namelen);
699 aep->a_name[namelen] = 0;
700 alist->al_offset[context->count++] = context->firstu;
701 alist->al_count = context->count;
702 trace_xfs_attr_list_add(context);
703 return 0;
704}
705
706/*
707 * Generate a list of extended attribute names and optionally
708 * also value lengths. Positive return value follows the XFS
709 * convention of being an error, zero or negative return code
710 * is the length of the buffer returned (negated), indicating
711 * success.
712 */
713int
714xfs_attr_list(
715 xfs_inode_t *dp,
716 char *buffer,
717 int bufsize,
718 int flags,
719 attrlist_cursor_kern_t *cursor)
720{
721 xfs_attr_list_context_t context;
722 struct attrlist *alist;
723 int error;
724
725 /*
726 * Validate the cursor.
727 */
728 if (cursor->pad1 || cursor->pad2)
729 return(XFS_ERROR(EINVAL));
730 if ((cursor->initted == 0) &&
731 (cursor->hashval || cursor->blkno || cursor->offset))
732 return XFS_ERROR(EINVAL);
733
734 /*
735 * Check for a properly aligned buffer.
736 */
737 if (((long)buffer) & (sizeof(int)-1))
738 return XFS_ERROR(EFAULT);
739 if (flags & ATTR_KERNOVAL)
740 bufsize = 0;
741
742 /*
743 * Initialize the output buffer.
744 */
745 memset(&context, 0, sizeof(context));
746 context.dp = dp;
747 context.cursor = cursor;
748 context.resynch = 1;
749 context.flags = flags;
750 context.alist = buffer;
751 context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
752 context.firstu = context.bufsize;
753 context.put_listent = xfs_attr_put_listent;
754
755 alist = (struct attrlist *)context.alist;
756 alist->al_count = 0;
757 alist->al_more = 0;
758 alist->al_offset[0] = context.bufsize;
759
760 error = xfs_attr_list_int(&context);
761 ASSERT(error >= 0);
762 return error;
763}
764
765int /* error */
766xfs_attr_inactive(xfs_inode_t *dp)
767{
768 xfs_trans_t *trans;
769 xfs_mount_t *mp;
770 int error;
771
772 mp = dp->i_mount;
773 ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
774
775 xfs_ilock(dp, XFS_ILOCK_SHARED);
776 if (!xfs_inode_hasattr(dp) ||
777 dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
778 xfs_iunlock(dp, XFS_ILOCK_SHARED);
779 return 0;
780 }
781 xfs_iunlock(dp, XFS_ILOCK_SHARED);
782
783 /*
784 * Start our first transaction of the day.
785 *
786 * All future transactions during this code must be "chained" off
787 * this one via the trans_dup() call. All transactions will contain
788 * the inode, and the inode will always be marked with trans_ihold().
789 * Since the inode will be locked in all transactions, we must log
790 * the inode in every transaction to let it float upward through
791 * the log.
792 */
793 trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
794 if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0,
795 XFS_TRANS_PERM_LOG_RES,
796 XFS_ATTRINVAL_LOG_COUNT))) {
797 xfs_trans_cancel(trans, 0);
798 return(error);
799 }
800 xfs_ilock(dp, XFS_ILOCK_EXCL);
801
802 /*
803 * No need to make quota reservations here. We expect to release some
804 * blocks, not allocate, in the common case.
805 */
806 xfs_trans_ijoin(trans, dp, 0);
807
808 /*
809 * Decide on what work routines to call based on the inode size.
810 */
811 if (!xfs_inode_hasattr(dp) ||
812 dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
813 error = 0;
814 goto out;
815 }
816 error = xfs_attr3_root_inactive(&trans, dp);
817 if (error)
818 goto out;
819
820 error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
821 if (error)
822 goto out;
823
824 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
825 xfs_iunlock(dp, XFS_ILOCK_EXCL);
826
827 return(error);
828
829out:
830 xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
831 xfs_iunlock(dp, XFS_ILOCK_EXCL);
832 return(error);
833}
834
835
836 612
837/*======================================================================== 613/*========================================================================
838 * External routines when attribute list is inside the inode 614 * External routines when attribute list is inside the inode
@@ -1166,28 +942,6 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
1166 return error; 942 return error;
1167} 943}
1168 944
1169/*
1170 * Copy out attribute entries for attr_list(), for leaf attribute lists.
1171 */
1172STATIC int
1173xfs_attr_leaf_list(xfs_attr_list_context_t *context)
1174{
1175 int error;
1176 struct xfs_buf *bp;
1177
1178 trace_xfs_attr_leaf_list(context);
1179
1180 context->cursor->blkno = 0;
1181 error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
1182 if (error)
1183 return XFS_ERROR(error);
1184
1185 error = xfs_attr3_leaf_list_int(bp, context);
1186 xfs_trans_brelse(NULL, bp);
1187 return XFS_ERROR(error);
1188}
1189
1190
1191/*======================================================================== 945/*========================================================================
1192 * External routines when attribute list size > XFS_LBSIZE(mp). 946 * External routines when attribute list size > XFS_LBSIZE(mp).
1193 *========================================================================*/ 947 *========================================================================*/
@@ -1260,6 +1014,7 @@ restart:
1260 * have been a b-tree. 1014 * have been a b-tree.
1261 */ 1015 */
1262 xfs_da_state_free(state); 1016 xfs_da_state_free(state);
1017 state = NULL;
1263 xfs_bmap_init(args->flist, args->firstblock); 1018 xfs_bmap_init(args->flist, args->firstblock);
1264 error = xfs_attr3_leaf_to_node(args); 1019 error = xfs_attr3_leaf_to_node(args);
1265 if (!error) { 1020 if (!error) {
@@ -1780,143 +1535,3 @@ xfs_attr_node_get(xfs_da_args_t *args)
1780 xfs_da_state_free(state); 1535 xfs_da_state_free(state);
1781 return(retval); 1536 return(retval);
1782} 1537}
1783
1784STATIC int /* error */
1785xfs_attr_node_list(xfs_attr_list_context_t *context)
1786{
1787 attrlist_cursor_kern_t *cursor;
1788 xfs_attr_leafblock_t *leaf;
1789 xfs_da_intnode_t *node;
1790 struct xfs_attr3_icleaf_hdr leafhdr;
1791 struct xfs_da3_icnode_hdr nodehdr;
1792 struct xfs_da_node_entry *btree;
1793 int error, i;
1794 struct xfs_buf *bp;
1795
1796 trace_xfs_attr_node_list(context);
1797
1798 cursor = context->cursor;
1799 cursor->initted = 1;
1800
1801 /*
1802 * Do all sorts of validation on the passed-in cursor structure.
1803 * If anything is amiss, ignore the cursor and look up the hashval
1804 * starting from the btree root.
1805 */
1806 bp = NULL;
1807 if (cursor->blkno > 0) {
1808 error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1,
1809 &bp, XFS_ATTR_FORK);
1810 if ((error != 0) && (error != EFSCORRUPTED))
1811 return(error);
1812 if (bp) {
1813 struct xfs_attr_leaf_entry *entries;
1814
1815 node = bp->b_addr;
1816 switch (be16_to_cpu(node->hdr.info.magic)) {
1817 case XFS_DA_NODE_MAGIC:
1818 case XFS_DA3_NODE_MAGIC:
1819 trace_xfs_attr_list_wrong_blk(context);
1820 xfs_trans_brelse(NULL, bp);
1821 bp = NULL;
1822 break;
1823 case XFS_ATTR_LEAF_MAGIC:
1824 case XFS_ATTR3_LEAF_MAGIC:
1825 leaf = bp->b_addr;
1826 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
1827 entries = xfs_attr3_leaf_entryp(leaf);
1828 if (cursor->hashval > be32_to_cpu(
1829 entries[leafhdr.count - 1].hashval)) {
1830 trace_xfs_attr_list_wrong_blk(context);
1831 xfs_trans_brelse(NULL, bp);
1832 bp = NULL;
1833 } else if (cursor->hashval <= be32_to_cpu(
1834 entries[0].hashval)) {
1835 trace_xfs_attr_list_wrong_blk(context);
1836 xfs_trans_brelse(NULL, bp);
1837 bp = NULL;
1838 }
1839 break;
1840 default:
1841 trace_xfs_attr_list_wrong_blk(context);
1842 xfs_trans_brelse(NULL, bp);
1843 bp = NULL;
1844 }
1845 }
1846 }
1847
1848 /*
1849 * We did not find what we expected given the cursor's contents,
1850 * so we start from the top and work down based on the hash value.
1851 * Note that start of node block is same as start of leaf block.
1852 */
1853 if (bp == NULL) {
1854 cursor->blkno = 0;
1855 for (;;) {
1856 __uint16_t magic;
1857
1858 error = xfs_da3_node_read(NULL, context->dp,
1859 cursor->blkno, -1, &bp,
1860 XFS_ATTR_FORK);
1861 if (error)
1862 return(error);
1863 node = bp->b_addr;
1864 magic = be16_to_cpu(node->hdr.info.magic);
1865 if (magic == XFS_ATTR_LEAF_MAGIC ||
1866 magic == XFS_ATTR3_LEAF_MAGIC)
1867 break;
1868 if (magic != XFS_DA_NODE_MAGIC &&
1869 magic != XFS_DA3_NODE_MAGIC) {
1870 XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
1871 XFS_ERRLEVEL_LOW,
1872 context->dp->i_mount,
1873 node);
1874 xfs_trans_brelse(NULL, bp);
1875 return XFS_ERROR(EFSCORRUPTED);
1876 }
1877
1878 xfs_da3_node_hdr_from_disk(&nodehdr, node);
1879 btree = xfs_da3_node_tree_p(node);
1880 for (i = 0; i < nodehdr.count; btree++, i++) {
1881 if (cursor->hashval
1882 <= be32_to_cpu(btree->hashval)) {
1883 cursor->blkno = be32_to_cpu(btree->before);
1884 trace_xfs_attr_list_node_descend(context,
1885 btree);
1886 break;
1887 }
1888 }
1889 if (i == nodehdr.count) {
1890 xfs_trans_brelse(NULL, bp);
1891 return 0;
1892 }
1893 xfs_trans_brelse(NULL, bp);
1894 }
1895 }
1896 ASSERT(bp != NULL);
1897
1898 /*
1899 * Roll upward through the blocks, processing each leaf block in
1900 * order. As long as there is space in the result buffer, keep
1901 * adding the information.
1902 */
1903 for (;;) {
1904 leaf = bp->b_addr;
1905 error = xfs_attr3_leaf_list_int(bp, context);
1906 if (error) {
1907 xfs_trans_brelse(NULL, bp);
1908 return error;
1909 }
1910 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
1911 if (context->seen_enough || leafhdr.forw == 0)
1912 break;
1913 cursor->blkno = leafhdr.forw;
1914 xfs_trans_brelse(NULL, bp);
1915 error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1,
1916 &bp);
1917 if (error)
1918 return error;
1919 }
1920 xfs_trans_brelse(NULL, bp);
1921 return 0;
1922}
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index de8dd58da46c..dd4824589470 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -141,5 +141,14 @@ typedef struct xfs_attr_list_context {
141 */ 141 */
142int xfs_attr_inactive(struct xfs_inode *dp); 142int xfs_attr_inactive(struct xfs_inode *dp);
143int xfs_attr_list_int(struct xfs_attr_list_context *); 143int xfs_attr_list_int(struct xfs_attr_list_context *);
144int xfs_inode_hasattr(struct xfs_inode *ip);
145int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
146 unsigned char *value, int *valuelenp, int flags);
147int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
148 unsigned char *value, int valuelen, int flags);
149int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
150int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
151 int flags, struct attrlist_cursor_kern *cursor);
152
144 153
145#endif /* __XFS_ATTR_H__ */ 154#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
new file mode 100644
index 000000000000..bb24b07cbedb
--- /dev/null
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -0,0 +1,453 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_format.h"
22#include "xfs_bit.h"
23#include "xfs_log.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_da_btree.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_alloc_btree.h"
31#include "xfs_ialloc_btree.h"
32#include "xfs_alloc.h"
33#include "xfs_btree.h"
34#include "xfs_attr_remote.h"
35#include "xfs_dinode.h"
36#include "xfs_inode.h"
37#include "xfs_inode_item.h"
38#include "xfs_bmap.h"
39#include "xfs_attr.h"
40#include "xfs_attr_leaf.h"
41#include "xfs_error.h"
42#include "xfs_quota.h"
43#include "xfs_trace.h"
44#include "xfs_trans_priv.h"
45
46/*
47 * Look at all the extents for this logical region,
48 * invalidate any buffers that are incore/in transactions.
49 */
50STATIC int
51xfs_attr3_leaf_freextent(
52 struct xfs_trans **trans,
53 struct xfs_inode *dp,
54 xfs_dablk_t blkno,
55 int blkcnt)
56{
57 struct xfs_bmbt_irec map;
58 struct xfs_buf *bp;
59 xfs_dablk_t tblkno;
60 xfs_daddr_t dblkno;
61 int tblkcnt;
62 int dblkcnt;
63 int nmap;
64 int error;
65
66 /*
67 * Roll through the "value", invalidating the attribute value's
68 * blocks.
69 */
70 tblkno = blkno;
71 tblkcnt = blkcnt;
72 while (tblkcnt > 0) {
73 /*
74 * Try to remember where we decided to put the value.
75 */
76 nmap = 1;
77 error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
78 &map, &nmap, XFS_BMAPI_ATTRFORK);
79 if (error) {
80 return(error);
81 }
82 ASSERT(nmap == 1);
83 ASSERT(map.br_startblock != DELAYSTARTBLOCK);
84
85 /*
86 * If it's a hole, these are already unmapped
87 * so there's nothing to invalidate.
88 */
89 if (map.br_startblock != HOLESTARTBLOCK) {
90
91 dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
92 map.br_startblock);
93 dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
94 map.br_blockcount);
95 bp = xfs_trans_get_buf(*trans,
96 dp->i_mount->m_ddev_targp,
97 dblkno, dblkcnt, 0);
98 if (!bp)
99 return ENOMEM;
100 xfs_trans_binval(*trans, bp);
101 /*
102 * Roll to next transaction.
103 */
104 error = xfs_trans_roll(trans, dp);
105 if (error)
106 return (error);
107 }
108
109 tblkno += map.br_blockcount;
110 tblkcnt -= map.br_blockcount;
111 }
112
113 return(0);
114}
115
116/*
117 * Invalidate all of the "remote" value regions pointed to by a particular
118 * leaf block.
119 * Note that we must release the lock on the buffer so that we are not
120 * caught holding something that the logging code wants to flush to disk.
121 */
122STATIC int
123xfs_attr3_leaf_inactive(
124 struct xfs_trans **trans,
125 struct xfs_inode *dp,
126 struct xfs_buf *bp)
127{
128 struct xfs_attr_leafblock *leaf;
129 struct xfs_attr3_icleaf_hdr ichdr;
130 struct xfs_attr_leaf_entry *entry;
131 struct xfs_attr_leaf_name_remote *name_rmt;
132 struct xfs_attr_inactive_list *list;
133 struct xfs_attr_inactive_list *lp;
134 int error;
135 int count;
136 int size;
137 int tmp;
138 int i;
139
140 leaf = bp->b_addr;
141 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
142
143 /*
144 * Count the number of "remote" value extents.
145 */
146 count = 0;
147 entry = xfs_attr3_leaf_entryp(leaf);
148 for (i = 0; i < ichdr.count; entry++, i++) {
149 if (be16_to_cpu(entry->nameidx) &&
150 ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
151 name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
152 if (name_rmt->valueblk)
153 count++;
154 }
155 }
156
157 /*
158 * If there are no "remote" values, we're done.
159 */
160 if (count == 0) {
161 xfs_trans_brelse(*trans, bp);
162 return 0;
163 }
164
165 /*
166 * Allocate storage for a list of all the "remote" value extents.
167 */
168 size = count * sizeof(xfs_attr_inactive_list_t);
169 list = kmem_alloc(size, KM_SLEEP);
170
171 /*
172 * Identify each of the "remote" value extents.
173 */
174 lp = list;
175 entry = xfs_attr3_leaf_entryp(leaf);
176 for (i = 0; i < ichdr.count; entry++, i++) {
177 if (be16_to_cpu(entry->nameidx) &&
178 ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
179 name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
180 if (name_rmt->valueblk) {
181 lp->valueblk = be32_to_cpu(name_rmt->valueblk);
182 lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
183 be32_to_cpu(name_rmt->valuelen));
184 lp++;
185 }
186 }
187 }
188 xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */
189
190 /*
191 * Invalidate each of the "remote" value extents.
192 */
193 error = 0;
194 for (lp = list, i = 0; i < count; i++, lp++) {
195 tmp = xfs_attr3_leaf_freextent(trans, dp,
196 lp->valueblk, lp->valuelen);
197
198 if (error == 0)
199 error = tmp; /* save only the 1st errno */
200 }
201
202 kmem_free(list);
203 return error;
204}
205
206/*
207 * Recurse (gasp!) through the attribute nodes until we find leaves.
208 * We're doing a depth-first traversal in order to invalidate everything.
209 */
210STATIC int
211xfs_attr3_node_inactive(
212 struct xfs_trans **trans,
213 struct xfs_inode *dp,
214 struct xfs_buf *bp,
215 int level)
216{
217 xfs_da_blkinfo_t *info;
218 xfs_da_intnode_t *node;
219 xfs_dablk_t child_fsb;
220 xfs_daddr_t parent_blkno, child_blkno;
221 int error, i;
222 struct xfs_buf *child_bp;
223 struct xfs_da_node_entry *btree;
224 struct xfs_da3_icnode_hdr ichdr;
225
226 /*
227 * Since this code is recursive (gasp!) we must protect ourselves.
228 */
229 if (level > XFS_DA_NODE_MAXDEPTH) {
230 xfs_trans_brelse(*trans, bp); /* no locks for later trans */
231 return XFS_ERROR(EIO);
232 }
233
234 node = bp->b_addr;
235 xfs_da3_node_hdr_from_disk(&ichdr, node);
236 parent_blkno = bp->b_bn;
237 if (!ichdr.count) {
238 xfs_trans_brelse(*trans, bp);
239 return 0;
240 }
241 btree = xfs_da3_node_tree_p(node);
242 child_fsb = be32_to_cpu(btree[0].before);
243 xfs_trans_brelse(*trans, bp); /* no locks for later trans */
244
245 /*
246 * If this is the node level just above the leaves, simply loop
247 * over the leaves removing all of them. If this is higher up
248 * in the tree, recurse downward.
249 */
250 for (i = 0; i < ichdr.count; i++) {
251 /*
252 * Read the subsidiary block to see what we have to work with.
253 * Don't do this in a transaction. This is a depth-first
254 * traversal of the tree so we may deal with many blocks
255 * before we come back to this one.
256 */
257 error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
258 XFS_ATTR_FORK);
259 if (error)
260 return(error);
261 if (child_bp) {
262 /* save for re-read later */
263 child_blkno = XFS_BUF_ADDR(child_bp);
264
265 /*
266 * Invalidate the subtree, however we have to.
267 */
268 info = child_bp->b_addr;
269 switch (info->magic) {
270 case cpu_to_be16(XFS_DA_NODE_MAGIC):
271 case cpu_to_be16(XFS_DA3_NODE_MAGIC):
272 error = xfs_attr3_node_inactive(trans, dp,
273 child_bp, level + 1);
274 break;
275 case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
276 case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
277 error = xfs_attr3_leaf_inactive(trans, dp,
278 child_bp);
279 break;
280 default:
281 error = XFS_ERROR(EIO);
282 xfs_trans_brelse(*trans, child_bp);
283 break;
284 }
285 if (error)
286 return error;
287
288 /*
289 * Remove the subsidiary block from the cache
290 * and from the log.
291 */
292 error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
293 &child_bp, XFS_ATTR_FORK);
294 if (error)
295 return error;
296 xfs_trans_binval(*trans, child_bp);
297 }
298
299 /*
300 * If we're not done, re-read the parent to get the next
301 * child block number.
302 */
303 if (i + 1 < ichdr.count) {
304 error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
305 &bp, XFS_ATTR_FORK);
306 if (error)
307 return error;
308 child_fsb = be32_to_cpu(btree[i + 1].before);
309 xfs_trans_brelse(*trans, bp);
310 }
311 /*
312 * Atomically commit the whole invalidate stuff.
313 */
314 error = xfs_trans_roll(trans, dp);
315 if (error)
316 return error;
317 }
318
319 return 0;
320}
321
322/*
323 * Indiscriminately delete the entire attribute fork
324 *
325 * Recurse (gasp!) through the attribute nodes until we find leaves.
326 * We're doing a depth-first traversal in order to invalidate everything.
327 */
328int
329xfs_attr3_root_inactive(
330 struct xfs_trans **trans,
331 struct xfs_inode *dp)
332{
333 struct xfs_da_blkinfo *info;
334 struct xfs_buf *bp;
335 xfs_daddr_t blkno;
336 int error;
337
338 /*
339 * Read block 0 to see what we have to work with.
340 * We only get here if we have extents, since we remove
341 * the extents in reverse order the extent containing
342 * block 0 must still be there.
343 */
344 error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
345 if (error)
346 return error;
347 blkno = bp->b_bn;
348
349 /*
350 * Invalidate the tree, even if the "tree" is only a single leaf block.
351 * This is a depth-first traversal!
352 */
353 info = bp->b_addr;
354 switch (info->magic) {
355 case cpu_to_be16(XFS_DA_NODE_MAGIC):
356 case cpu_to_be16(XFS_DA3_NODE_MAGIC):
357 error = xfs_attr3_node_inactive(trans, dp, bp, 1);
358 break;
359 case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
360 case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
361 error = xfs_attr3_leaf_inactive(trans, dp, bp);
362 break;
363 default:
364 error = XFS_ERROR(EIO);
365 xfs_trans_brelse(*trans, bp);
366 break;
367 }
368 if (error)
369 return error;
370
371 /*
372 * Invalidate the incore copy of the root block.
373 */
374 error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
375 if (error)
376 return error;
377 xfs_trans_binval(*trans, bp); /* remove from cache */
378 /*
379 * Commit the invalidate and start the next transaction.
380 */
381 error = xfs_trans_roll(trans, dp);
382
383 return error;
384}
385
386int
387xfs_attr_inactive(xfs_inode_t *dp)
388{
389 xfs_trans_t *trans;
390 xfs_mount_t *mp;
391 int error;
392
393 mp = dp->i_mount;
394 ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
395
396 xfs_ilock(dp, XFS_ILOCK_SHARED);
397 if (!xfs_inode_hasattr(dp) ||
398 dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
399 xfs_iunlock(dp, XFS_ILOCK_SHARED);
400 return 0;
401 }
402 xfs_iunlock(dp, XFS_ILOCK_SHARED);
403
404 /*
405 * Start our first transaction of the day.
406 *
407 * All future transactions during this code must be "chained" off
408 * this one via the trans_dup() call. All transactions will contain
409 * the inode, and the inode will always be marked with trans_ihold().
410 * Since the inode will be locked in all transactions, we must log
411 * the inode in every transaction to let it float upward through
412 * the log.
413 */
414 trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL);
415 error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
416 if (error) {
417 xfs_trans_cancel(trans, 0);
418 return(error);
419 }
420 xfs_ilock(dp, XFS_ILOCK_EXCL);
421
422 /*
423 * No need to make quota reservations here. We expect to release some
424 * blocks, not allocate, in the common case.
425 */
426 xfs_trans_ijoin(trans, dp, 0);
427
428 /*
429 * Decide on what work routines to call based on the inode size.
430 */
431 if (!xfs_inode_hasattr(dp) ||
432 dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
433 error = 0;
434 goto out;
435 }
436 error = xfs_attr3_root_inactive(&trans, dp);
437 if (error)
438 goto out;
439
440 error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0);
441 if (error)
442 goto out;
443
444 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
445 xfs_iunlock(dp, XFS_ILOCK_EXCL);
446
447 return(error);
448
449out:
450 xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
451 xfs_iunlock(dp, XFS_ILOCK_EXCL);
452 return(error);
453}
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index b800fbcafc7f..86db20a9cc02 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -22,6 +22,7 @@
22#include "xfs_bit.h" 22#include "xfs_bit.h"
23#include "xfs_log.h" 23#include "xfs_log.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_mount.h" 28#include "xfs_mount.h"
@@ -78,16 +79,6 @@ STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
78 int *number_usedbytes_in_blk1); 79 int *number_usedbytes_in_blk1);
79 80
80/* 81/*
81 * Routines used for shrinking the Btree.
82 */
83STATIC int xfs_attr3_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
84 struct xfs_buf *bp, int level);
85STATIC int xfs_attr3_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp,
86 struct xfs_buf *bp);
87STATIC int xfs_attr3_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
88 xfs_dablk_t blkno, int blkcnt);
89
90/*
91 * Utility routines. 82 * Utility routines.
92 */ 83 */
93STATIC void xfs_attr3_leaf_moveents(struct xfs_attr_leafblock *src_leaf, 84STATIC void xfs_attr3_leaf_moveents(struct xfs_attr_leafblock *src_leaf,
@@ -635,7 +626,7 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
635 xfs_attr_sf_entry_t *sfe; 626 xfs_attr_sf_entry_t *sfe;
636 int i; 627 int i;
637 628
638 ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE); 629 ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE);
639 sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; 630 sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data;
640 sfe = &sf->list[0]; 631 sfe = &sf->list[0];
641 for (i = 0; i < sf->hdr.count; 632 for (i = 0; i < sf->hdr.count;
@@ -751,182 +742,6 @@ out:
751 return(error); 742 return(error);
752} 743}
753 744
754STATIC int
755xfs_attr_shortform_compare(const void *a, const void *b)
756{
757 xfs_attr_sf_sort_t *sa, *sb;
758
759 sa = (xfs_attr_sf_sort_t *)a;
760 sb = (xfs_attr_sf_sort_t *)b;
761 if (sa->hash < sb->hash) {
762 return(-1);
763 } else if (sa->hash > sb->hash) {
764 return(1);
765 } else {
766 return(sa->entno - sb->entno);
767 }
768}
769
770
771#define XFS_ISRESET_CURSOR(cursor) \
772 (!((cursor)->initted) && !((cursor)->hashval) && \
773 !((cursor)->blkno) && !((cursor)->offset))
774/*
775 * Copy out entries of shortform attribute lists for attr_list().
776 * Shortform attribute lists are not stored in hashval sorted order.
777 * If the output buffer is not large enough to hold them all, then we
778 * we have to calculate each entries' hashvalue and sort them before
779 * we can begin returning them to the user.
780 */
781/*ARGSUSED*/
782int
783xfs_attr_shortform_list(xfs_attr_list_context_t *context)
784{
785 attrlist_cursor_kern_t *cursor;
786 xfs_attr_sf_sort_t *sbuf, *sbp;
787 xfs_attr_shortform_t *sf;
788 xfs_attr_sf_entry_t *sfe;
789 xfs_inode_t *dp;
790 int sbsize, nsbuf, count, i;
791 int error;
792
793 ASSERT(context != NULL);
794 dp = context->dp;
795 ASSERT(dp != NULL);
796 ASSERT(dp->i_afp != NULL);
797 sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
798 ASSERT(sf != NULL);
799 if (!sf->hdr.count)
800 return(0);
801 cursor = context->cursor;
802 ASSERT(cursor != NULL);
803
804 trace_xfs_attr_list_sf(context);
805
806 /*
807 * If the buffer is large enough and the cursor is at the start,
808 * do not bother with sorting since we will return everything in
809 * one buffer and another call using the cursor won't need to be
810 * made.
811 * Note the generous fudge factor of 16 overhead bytes per entry.
812 * If bufsize is zero then put_listent must be a search function
813 * and can just scan through what we have.
814 */
815 if (context->bufsize == 0 ||
816 (XFS_ISRESET_CURSOR(cursor) &&
817 (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
818 for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
819 error = context->put_listent(context,
820 sfe->flags,
821 sfe->nameval,
822 (int)sfe->namelen,
823 (int)sfe->valuelen,
824 &sfe->nameval[sfe->namelen]);
825
826 /*
827 * Either search callback finished early or
828 * didn't fit it all in the buffer after all.
829 */
830 if (context->seen_enough)
831 break;
832
833 if (error)
834 return error;
835 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
836 }
837 trace_xfs_attr_list_sf_all(context);
838 return(0);
839 }
840
841 /* do no more for a search callback */
842 if (context->bufsize == 0)
843 return 0;
844
845 /*
846 * It didn't all fit, so we have to sort everything on hashval.
847 */
848 sbsize = sf->hdr.count * sizeof(*sbuf);
849 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
850
851 /*
852 * Scan the attribute list for the rest of the entries, storing
853 * the relevant info from only those that match into a buffer.
854 */
855 nsbuf = 0;
856 for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
857 if (unlikely(
858 ((char *)sfe < (char *)sf) ||
859 ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
860 XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
861 XFS_ERRLEVEL_LOW,
862 context->dp->i_mount, sfe);
863 kmem_free(sbuf);
864 return XFS_ERROR(EFSCORRUPTED);
865 }
866
867 sbp->entno = i;
868 sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
869 sbp->name = sfe->nameval;
870 sbp->namelen = sfe->namelen;
871 /* These are bytes, and both on-disk, don't endian-flip */
872 sbp->valuelen = sfe->valuelen;
873 sbp->flags = sfe->flags;
874 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
875 sbp++;
876 nsbuf++;
877 }
878
879 /*
880 * Sort the entries on hash then entno.
881 */
882 xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
883
884 /*
885 * Re-find our place IN THE SORTED LIST.
886 */
887 count = 0;
888 cursor->initted = 1;
889 cursor->blkno = 0;
890 for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
891 if (sbp->hash == cursor->hashval) {
892 if (cursor->offset == count) {
893 break;
894 }
895 count++;
896 } else if (sbp->hash > cursor->hashval) {
897 break;
898 }
899 }
900 if (i == nsbuf) {
901 kmem_free(sbuf);
902 return(0);
903 }
904
905 /*
906 * Loop putting entries into the user buffer.
907 */
908 for ( ; i < nsbuf; i++, sbp++) {
909 if (cursor->hashval != sbp->hash) {
910 cursor->hashval = sbp->hash;
911 cursor->offset = 0;
912 }
913 error = context->put_listent(context,
914 sbp->flags,
915 sbp->name,
916 sbp->namelen,
917 sbp->valuelen,
918 &sbp->name[sbp->namelen]);
919 if (error)
920 return error;
921 if (context->seen_enough)
922 break;
923 cursor->offset++;
924 }
925
926 kmem_free(sbuf);
927 return(0);
928}
929
930/* 745/*
931 * Check a leaf attribute block to see if all the entries would fit into 746 * Check a leaf attribute block to see if all the entries would fit into
932 * a shortform attribute list. 747 * a shortform attribute list.
@@ -1121,7 +936,6 @@ out:
1121 return error; 936 return error;
1122} 937}
1123 938
1124
1125/*======================================================================== 939/*========================================================================
1126 * Routines used for growing the Btree. 940 * Routines used for growing the Btree.
1127 *========================================================================*/ 941 *========================================================================*/
@@ -1482,7 +1296,6 @@ xfs_attr3_leaf_compact(
1482 ichdr_dst->freemap[0].size = ichdr_dst->firstused - 1296 ichdr_dst->freemap[0].size = ichdr_dst->firstused -
1483 ichdr_dst->freemap[0].base; 1297 ichdr_dst->freemap[0].base;
1484 1298
1485
1486 /* write the header back to initialise the underlying buffer */ 1299 /* write the header back to initialise the underlying buffer */
1487 xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); 1300 xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst);
1488 1301
@@ -2643,130 +2456,6 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
2643 return size; 2456 return size;
2644} 2457}
2645 2458
2646/*
2647 * Copy out attribute list entries for attr_list(), for leaf attribute lists.
2648 */
2649int
2650xfs_attr3_leaf_list_int(
2651 struct xfs_buf *bp,
2652 struct xfs_attr_list_context *context)
2653{
2654 struct attrlist_cursor_kern *cursor;
2655 struct xfs_attr_leafblock *leaf;
2656 struct xfs_attr3_icleaf_hdr ichdr;
2657 struct xfs_attr_leaf_entry *entries;
2658 struct xfs_attr_leaf_entry *entry;
2659 int retval;
2660 int i;
2661
2662 trace_xfs_attr_list_leaf(context);
2663
2664 leaf = bp->b_addr;
2665 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
2666 entries = xfs_attr3_leaf_entryp(leaf);
2667
2668 cursor = context->cursor;
2669 cursor->initted = 1;
2670
2671 /*
2672 * Re-find our place in the leaf block if this is a new syscall.
2673 */
2674 if (context->resynch) {
2675 entry = &entries[0];
2676 for (i = 0; i < ichdr.count; entry++, i++) {
2677 if (be32_to_cpu(entry->hashval) == cursor->hashval) {
2678 if (cursor->offset == context->dupcnt) {
2679 context->dupcnt = 0;
2680 break;
2681 }
2682 context->dupcnt++;
2683 } else if (be32_to_cpu(entry->hashval) >
2684 cursor->hashval) {
2685 context->dupcnt = 0;
2686 break;
2687 }
2688 }
2689 if (i == ichdr.count) {
2690 trace_xfs_attr_list_notfound(context);
2691 return 0;
2692 }
2693 } else {
2694 entry = &entries[0];
2695 i = 0;
2696 }
2697 context->resynch = 0;
2698
2699 /*
2700 * We have found our place, start copying out the new attributes.
2701 */
2702 retval = 0;
2703 for (; i < ichdr.count; entry++, i++) {
2704 if (be32_to_cpu(entry->hashval) != cursor->hashval) {
2705 cursor->hashval = be32_to_cpu(entry->hashval);
2706 cursor->offset = 0;
2707 }
2708
2709 if (entry->flags & XFS_ATTR_INCOMPLETE)
2710 continue; /* skip incomplete entries */
2711
2712 if (entry->flags & XFS_ATTR_LOCAL) {
2713 xfs_attr_leaf_name_local_t *name_loc =
2714 xfs_attr3_leaf_name_local(leaf, i);
2715
2716 retval = context->put_listent(context,
2717 entry->flags,
2718 name_loc->nameval,
2719 (int)name_loc->namelen,
2720 be16_to_cpu(name_loc->valuelen),
2721 &name_loc->nameval[name_loc->namelen]);
2722 if (retval)
2723 return retval;
2724 } else {
2725 xfs_attr_leaf_name_remote_t *name_rmt =
2726 xfs_attr3_leaf_name_remote(leaf, i);
2727
2728 int valuelen = be32_to_cpu(name_rmt->valuelen);
2729
2730 if (context->put_value) {
2731 xfs_da_args_t args;
2732
2733 memset((char *)&args, 0, sizeof(args));
2734 args.dp = context->dp;
2735 args.whichfork = XFS_ATTR_FORK;
2736 args.valuelen = valuelen;
2737 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
2738 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
2739 args.rmtblkcnt = xfs_attr3_rmt_blocks(
2740 args.dp->i_mount, valuelen);
2741 retval = xfs_attr_rmtval_get(&args);
2742 if (retval)
2743 return retval;
2744 retval = context->put_listent(context,
2745 entry->flags,
2746 name_rmt->name,
2747 (int)name_rmt->namelen,
2748 valuelen,
2749 args.value);
2750 kmem_free(args.value);
2751 } else {
2752 retval = context->put_listent(context,
2753 entry->flags,
2754 name_rmt->name,
2755 (int)name_rmt->namelen,
2756 valuelen,
2757 NULL);
2758 }
2759 if (retval)
2760 return retval;
2761 }
2762 if (context->seen_enough)
2763 break;
2764 cursor->offset++;
2765 }
2766 trace_xfs_attr_list_leaf_end(context);
2767 return retval;
2768}
2769
2770 2459
2771/*======================================================================== 2460/*========================================================================
2772 * Manage the INCOMPLETE flag in a leaf entry 2461 * Manage the INCOMPLETE flag in a leaf entry
@@ -3011,345 +2700,3 @@ xfs_attr3_leaf_flipflags(
3011 2700
3012 return error; 2701 return error;
3013} 2702}
3014
3015/*========================================================================
3016 * Indiscriminately delete the entire attribute fork
3017 *========================================================================*/
3018
3019/*
3020 * Recurse (gasp!) through the attribute nodes until we find leaves.
3021 * We're doing a depth-first traversal in order to invalidate everything.
3022 */
/*
 * Tear down the entire attribute fork btree: invalidate every block
 * below the root depth-first, then invalidate the root block itself
 * and roll the transaction.  On return *trans may point to a newly
 * started transaction.  Returns 0 or an XFS errno.
 */
3023int
3024xfs_attr3_root_inactive(
3025 struct xfs_trans **trans,
3026 struct xfs_inode *dp)
3027{
3028 struct xfs_da_blkinfo *info;
3029 struct xfs_buf *bp;
3030 xfs_daddr_t blkno;
3031 int error;
3032
3033 /*
3034 * Read block 0 to see what we have to work with.
3035 * We only get here if we have extents, since we remove
3036 * the extents in reverse order the extent containing
3037 * block 0 must still be there.
3038 */
3039 error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
3040 if (error)
3041 return error;
/* remember the root's disk address: the buffer is consumed by the calls below */
3042 blkno = bp->b_bn;
3043
3044 /*
3045 * Invalidate the tree, even if the "tree" is only a single leaf block.
3046 * This is a depth-first traversal!
3047 */
3048 info = bp->b_addr;
3049 switch (info->magic) {
3050 case cpu_to_be16(XFS_DA_NODE_MAGIC):
3051 case cpu_to_be16(XFS_DA3_NODE_MAGIC):
3052 error = xfs_attr3_node_inactive(trans, dp, bp, 1);
3053 break;
3054 case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
3055 case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
3056 error = xfs_attr3_leaf_inactive(trans, dp, bp);
3057 break;
3058 default:
/* unrecognised block magic: corrupt fork, release the buffer and bail */
3059 error = XFS_ERROR(EIO);
3060 xfs_trans_brelse(*trans, bp);
3061 break;
3062 }
3063 if (error)
3064 return error;
3065
3066 /*
3067 * Invalidate the incore copy of the root block.
3068 */
3069 error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK);
3070 if (error)
3071 return error;
3072 xfs_trans_binval(*trans, bp); /* remove from cache */
3073 /*
3074 * Commit the invalidate and start the next transaction.
3075 */
3076 error = xfs_trans_roll(trans, dp);
3077
3078 return error;
3079}
3080
3081/*
3082 * Recurse (gasp!) through the attribute nodes until we find leaves.
3083 * We're doing a depth-first traversal in order to invalidate everything.
3084 */
3085STATIC int
3086xfs_attr3_node_inactive(
3087 struct xfs_trans **trans,
3088 struct xfs_inode *dp,
3089 struct xfs_buf *bp,
3090 int level)
3091{
3092 xfs_da_blkinfo_t *info;
3093 xfs_da_intnode_t *node;
3094 xfs_dablk_t child_fsb;
3095 xfs_daddr_t parent_blkno, child_blkno;
3096 int error, i;
3097 struct xfs_buf *child_bp;
3098 struct xfs_da_node_entry *btree;
3099 struct xfs_da3_icnode_hdr ichdr;
3100
3101 /*
3102 * Since this code is recursive (gasp!) we must protect ourselves.
3103 */
3104 if (level > XFS_DA_NODE_MAXDEPTH) {
3105 xfs_trans_brelse(*trans, bp); /* no locks for later trans */
3106 return XFS_ERROR(EIO);
3107 }
3108
3109 node = bp->b_addr;
3110 xfs_da3_node_hdr_from_disk(&ichdr, node);
3111 parent_blkno = bp->b_bn;
3112 if (!ichdr.count) {
3113 xfs_trans_brelse(*trans, bp);
3114 return 0;
3115 }
3116 btree = xfs_da3_node_tree_p(node);
3117 child_fsb = be32_to_cpu(btree[0].before);
3118 xfs_trans_brelse(*trans, bp); /* no locks for later trans */
3119
3120 /*
3121 * If this is the node level just above the leaves, simply loop
3122 * over the leaves removing all of them. If this is higher up
3123 * in the tree, recurse downward.
3124 */
3125 for (i = 0; i < ichdr.count; i++) {
3126 /*
3127 * Read the subsidiary block to see what we have to work with.
3128 * Don't do this in a transaction. This is a depth-first
3129 * traversal of the tree so we may deal with many blocks
3130 * before we come back to this one.
3131 */
3132 error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
3133 XFS_ATTR_FORK);
3134 if (error)
3135 return(error);
3136 if (child_bp) {
3137 /* save for re-read later */
3138 child_blkno = XFS_BUF_ADDR(child_bp);
3139
3140 /*
3141 * Invalidate the subtree, however we have to.
3142 */
3143 info = child_bp->b_addr;
3144 switch (info->magic) {
3145 case cpu_to_be16(XFS_DA_NODE_MAGIC):
3146 case cpu_to_be16(XFS_DA3_NODE_MAGIC):
3147 error = xfs_attr3_node_inactive(trans, dp,
3148 child_bp, level + 1);
3149 break;
3150 case cpu_to_be16(XFS_ATTR_LEAF_MAGIC):
3151 case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC):
3152 error = xfs_attr3_leaf_inactive(trans, dp,
3153 child_bp);
3154 break;
3155 default:
3156 error = XFS_ERROR(EIO);
3157 xfs_trans_brelse(*trans, child_bp);
3158 break;
3159 }
3160 if (error)
3161 return error;
3162
3163 /*
3164 * Remove the subsidiary block from the cache
3165 * and from the log.
3166 */
3167 error = xfs_da_get_buf(*trans, dp, 0, child_blkno,
3168 &child_bp, XFS_ATTR_FORK);
3169 if (error)
3170 return error;
3171 xfs_trans_binval(*trans, child_bp);
3172 }
3173
3174 /*
3175 * If we're not done, re-read the parent to get the next
3176 * child block number.
3177 */
3178 if (i + 1 < ichdr.count) {
3179 error = xfs_da3_node_read(*trans, dp, 0, parent_blkno,
3180 &bp, XFS_ATTR_FORK);
3181 if (error)
3182 return error;
3183 child_fsb = be32_to_cpu(btree[i + 1].before);
3184 xfs_trans_brelse(*trans, bp);
3185 }
3186 /*
3187 * Atomically commit the whole invalidate stuff.
3188 */
3189 error = xfs_trans_roll(trans, dp);
3190 if (error)
3191 return error;
3192 }
3193
3194 return 0;
3195}
3196
3197/*
3198 * Invalidate all of the "remote" value regions pointed to by a particular
3199 * leaf block.
3200 * Note that we must release the lock on the buffer so that we are not
3201 * caught holding something that the logging code wants to flush to disk.
3202 */
/*
 * Invalidate all "remote" attribute value extents referenced from one
 * leaf block.  Works in two passes while the leaf buffer is held: first
 * count the remote values, then record each one's (block, length) into
 * a temporary list.  Only after the buffer is released are the extents
 * invalidated, rolling the transaction as needed; the first errno seen
 * is returned.
 */
3203STATIC int
3204xfs_attr3_leaf_inactive(
3205 struct xfs_trans **trans,
3206 struct xfs_inode *dp,
3207 struct xfs_buf *bp)
3208{
3209 struct xfs_attr_leafblock *leaf;
3210 struct xfs_attr3_icleaf_hdr ichdr;
3211 struct xfs_attr_leaf_entry *entry;
3212 struct xfs_attr_leaf_name_remote *name_rmt;
3213 struct xfs_attr_inactive_list *list;
3214 struct xfs_attr_inactive_list *lp;
3215 int error;
3216 int count;
3217 int size;
3218 int tmp;
3219 int i;
3220
3221 leaf = bp->b_addr;
3222 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
3223
3224 /*
3225 * Count the number of "remote" value extents.
3226 */
3227 count = 0;
3228 entry = xfs_attr3_leaf_entryp(leaf);
3229 for (i = 0; i < ichdr.count; entry++, i++) {
/* nameidx == 0 marks an unused slot; skip local (inline) values too */
3230 if (be16_to_cpu(entry->nameidx) &&
3231 ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
3232 name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
3233 if (name_rmt->valueblk)
3234 count++;
3235 }
3236 }
3237
3238 /*
3239 * If there are no "remote" values, we're done.
3240 */
3241 if (count == 0) {
3242 xfs_trans_brelse(*trans, bp);
3243 return 0;
3244 }
3245
3246 /*
3247 * Allocate storage for a list of all the "remote" value extents.
3248 */
3249 size = count * sizeof(xfs_attr_inactive_list_t);
3250 list = kmem_alloc(size, KM_SLEEP);
3251
3252 /*
3253 * Identify each of the "remote" value extents.
3254 */
3255 lp = list;
3256 entry = xfs_attr3_leaf_entryp(leaf);
3257 for (i = 0; i < ichdr.count; entry++, i++) {
3258 if (be16_to_cpu(entry->nameidx) &&
3259 ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
3260 name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
3261 if (name_rmt->valueblk) {
3262 lp->valueblk = be32_to_cpu(name_rmt->valueblk);
/* valuelen is stored as a block count, not bytes, for freextent below */
3263 lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
3264 be32_to_cpu(name_rmt->valuelen));
3265 lp++;
3266 }
3267 }
3268 }
3269 xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */
3270
3271 /*
3272 * Invalidate each of the "remote" value extents.
3273 */
3274 error = 0;
3275 for (lp = list, i = 0; i < count; i++, lp++) {
3276 tmp = xfs_attr3_leaf_freextent(trans, dp,
3277 lp->valueblk, lp->valuelen);
3278
3279 if (error == 0)
3280 error = tmp; /* save only the 1st errno */
3281 }
3282
3283 kmem_free(list);
3284 return error;
3285}
3286
3287/*
3288 * Look at all the extents for this logical region,
3289 * invalidate any buffers that are incore/in transactions.
3290 */
/*
 * Walk the attr-fork logical range [blkno, blkno + blkcnt), map it to
 * disk extents with xfs_bmapi_read, and invalidate any incore buffers
 * covering mapped blocks so stale remote-value data never reaches the
 * log or disk.  The transaction is rolled after each invalidation, so
 * *trans may be replaced on return.
 */
3291STATIC int
3292xfs_attr3_leaf_freextent(
3293 struct xfs_trans **trans,
3294 struct xfs_inode *dp,
3295 xfs_dablk_t blkno,
3296 int blkcnt)
3297{
3298 struct xfs_bmbt_irec map;
3299 struct xfs_buf *bp;
3300 xfs_dablk_t tblkno;
3301 xfs_daddr_t dblkno;
3302 int tblkcnt;
3303 int dblkcnt;
3304 int nmap;
3305 int error;
3306
3307 /*
3308 * Roll through the "value", invalidating the attribute value's
3309 * blocks.
3310 */
3311 tblkno = blkno;
3312 tblkcnt = blkcnt;
3313 while (tblkcnt > 0) {
3314 /*
3315 * Try to remember where we decided to put the value.
3316 */
3317 nmap = 1;
3318 error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
3319 &map, &nmap, XFS_BMAPI_ATTRFORK);
3320 if (error) {
3321 return(error);
3322 }
3323 ASSERT(nmap == 1);
3324 ASSERT(map.br_startblock != DELAYSTARTBLOCK);
3325
3326 /*
3327 * If it's a hole, these are already unmapped
3328 * so there's nothing to invalidate.
3329 */
3330 if (map.br_startblock != HOLESTARTBLOCK) {
3331
/* convert the fsblock extent to daddr/BB units for the buffer lookup */
3332 dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
3333 map.br_startblock);
3334 dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
3335 map.br_blockcount);
3336 bp = xfs_trans_get_buf(*trans,
3337 dp->i_mount->m_ddev_targp,
3338 dblkno, dblkcnt, 0);
3339 if (!bp)
3340 return ENOMEM;
3341 xfs_trans_binval(*trans, bp);
3342 /*
3343 * Roll to next transaction.
3344 */
3345 error = xfs_trans_roll(trans, dp);
3346 if (error)
3347 return (error);
3348 }
3349
/* advance past this extent (mapped or hole) and keep going */
3350 tblkno += map.br_blockcount;
3351 tblkcnt -= map.br_blockcount;
3352 }
3353
3354 return(0);
3355}
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 444a7704596c..c1022138c7e6 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -333,6 +333,8 @@ int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
333 struct xfs_buf **bpp); 333 struct xfs_buf **bpp);
334void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, 334void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
335 struct xfs_attr_leafblock *from); 335 struct xfs_attr_leafblock *from);
336void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
337 struct xfs_attr3_icleaf_hdr *from);
336 338
337extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; 339extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
338 340
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
new file mode 100644
index 000000000000..cbc80d485177
--- /dev/null
+++ b/fs/xfs/xfs_attr_list.c
@@ -0,0 +1,655 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_types.h"
22#include "xfs_bit.h"
23#include "xfs_log.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_da_btree.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_alloc_btree.h"
31#include "xfs_ialloc_btree.h"
32#include "xfs_alloc.h"
33#include "xfs_btree.h"
34#include "xfs_attr_sf.h"
35#include "xfs_attr_remote.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h"
38#include "xfs_inode_item.h"
39#include "xfs_bmap.h"
40#include "xfs_attr.h"
41#include "xfs_attr_leaf.h"
42#include "xfs_error.h"
43#include "xfs_trace.h"
44#include "xfs_buf_item.h"
45#include "xfs_cksum.h"
46
47STATIC int
48xfs_attr_shortform_compare(const void *a, const void *b)
49{
50 xfs_attr_sf_sort_t *sa, *sb;
51
52 sa = (xfs_attr_sf_sort_t *)a;
53 sb = (xfs_attr_sf_sort_t *)b;
54 if (sa->hash < sb->hash) {
55 return(-1);
56 } else if (sa->hash > sb->hash) {
57 return(1);
58 } else {
59 return(sa->entno - sb->entno);
60 }
61}
62
63#define XFS_ISRESET_CURSOR(cursor) \
64 (!((cursor)->initted) && !((cursor)->hashval) && \
65 !((cursor)->blkno) && !((cursor)->offset))
66/*
67 * Copy out entries of shortform attribute lists for attr_list().
68 * Shortform attribute lists are not stored in hashval sorted order.
69 * If the output buffer is not large enough to hold them all, then we
70 * we have to calculate each entries' hashvalue and sort them before
71 * we can begin returning them to the user.
72 */
73int
74xfs_attr_shortform_list(xfs_attr_list_context_t *context)
75{
76 attrlist_cursor_kern_t *cursor;
77 xfs_attr_sf_sort_t *sbuf, *sbp;
78 xfs_attr_shortform_t *sf;
79 xfs_attr_sf_entry_t *sfe;
80 xfs_inode_t *dp;
81 int sbsize, nsbuf, count, i;
82 int error;
83
84 ASSERT(context != NULL);
85 dp = context->dp;
86 ASSERT(dp != NULL);
87 ASSERT(dp->i_afp != NULL);
88 sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
89 ASSERT(sf != NULL);
90 if (!sf->hdr.count)
91 return(0);
92 cursor = context->cursor;
93 ASSERT(cursor != NULL);
94
95 trace_xfs_attr_list_sf(context);
96
97 /*
98 * If the buffer is large enough and the cursor is at the start,
99 * do not bother with sorting since we will return everything in
100 * one buffer and another call using the cursor won't need to be
101 * made.
102 * Note the generous fudge factor of 16 overhead bytes per entry.
103 * If bufsize is zero then put_listent must be a search function
104 * and can just scan through what we have.
105 */
106 if (context->bufsize == 0 ||
107 (XFS_ISRESET_CURSOR(cursor) &&
108 (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
109 for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
110 error = context->put_listent(context,
111 sfe->flags,
112 sfe->nameval,
113 (int)sfe->namelen,
114 (int)sfe->valuelen,
115 &sfe->nameval[sfe->namelen]);
116
117 /*
118 * Either search callback finished early or
119 * didn't fit it all in the buffer after all.
120 */
121 if (context->seen_enough)
122 break;
123
124 if (error)
125 return error;
126 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
127 }
128 trace_xfs_attr_list_sf_all(context);
129 return(0);
130 }
131
132 /* do no more for a search callback */
133 if (context->bufsize == 0)
134 return 0;
135
136 /*
137 * It didn't all fit, so we have to sort everything on hashval.
138 */
139 sbsize = sf->hdr.count * sizeof(*sbuf);
140 sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
141
142 /*
143 * Scan the attribute list for the rest of the entries, storing
144 * the relevant info from only those that match into a buffer.
145 */
146 nsbuf = 0;
147 for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
148 if (unlikely(
149 ((char *)sfe < (char *)sf) ||
150 ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) {
151 XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
152 XFS_ERRLEVEL_LOW,
153 context->dp->i_mount, sfe);
154 kmem_free(sbuf);
155 return XFS_ERROR(EFSCORRUPTED);
156 }
157
158 sbp->entno = i;
159 sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen);
160 sbp->name = sfe->nameval;
161 sbp->namelen = sfe->namelen;
162 /* These are bytes, and both on-disk, don't endian-flip */
163 sbp->valuelen = sfe->valuelen;
164 sbp->flags = sfe->flags;
165 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
166 sbp++;
167 nsbuf++;
168 }
169
170 /*
171 * Sort the entries on hash then entno.
172 */
173 xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare);
174
175 /*
176 * Re-find our place IN THE SORTED LIST.
177 */
178 count = 0;
179 cursor->initted = 1;
180 cursor->blkno = 0;
181 for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) {
182 if (sbp->hash == cursor->hashval) {
183 if (cursor->offset == count) {
184 break;
185 }
186 count++;
187 } else if (sbp->hash > cursor->hashval) {
188 break;
189 }
190 }
191 if (i == nsbuf) {
192 kmem_free(sbuf);
193 return(0);
194 }
195
196 /*
197 * Loop putting entries into the user buffer.
198 */
199 for ( ; i < nsbuf; i++, sbp++) {
200 if (cursor->hashval != sbp->hash) {
201 cursor->hashval = sbp->hash;
202 cursor->offset = 0;
203 }
204 error = context->put_listent(context,
205 sbp->flags,
206 sbp->name,
207 sbp->namelen,
208 sbp->valuelen,
209 &sbp->name[sbp->namelen]);
210 if (error)
211 return error;
212 if (context->seen_enough)
213 break;
214 cursor->offset++;
215 }
216
217 kmem_free(sbuf);
218 return(0);
219}
220
/*
 * List attributes from a node-format (multi-block) attribute tree via
 * the context's put_listent callback.  First try to resume directly at
 * the leaf block recorded in the cursor; if the cursor looks stale or
 * out of range, descend from the root by hash value.  Then walk forward
 * along the leaf sibling chain until the output is full or the chain
 * ends.  Returns 0 or an XFS errno.
 */
221STATIC int
222xfs_attr_node_list(xfs_attr_list_context_t *context)
223{
224 attrlist_cursor_kern_t *cursor;
225 xfs_attr_leafblock_t *leaf;
226 xfs_da_intnode_t *node;
227 struct xfs_attr3_icleaf_hdr leafhdr;
228 struct xfs_da3_icnode_hdr nodehdr;
229 struct xfs_da_node_entry *btree;
230 int error, i;
231 struct xfs_buf *bp;
232
233 trace_xfs_attr_node_list(context);
234
235 cursor = context->cursor;
236 cursor->initted = 1;
237
238 /*
239 * Do all sorts of validation on the passed-in cursor structure.
240 * If anything is amiss, ignore the cursor and look up the hashval
241 * starting from the btree root.
242 */
243 bp = NULL;
244 if (cursor->blkno > 0) {
/* reads are done outside any transaction (NULL tp) for the whole listing */
245 error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1,
246 &bp, XFS_ATTR_FORK);
247 if ((error != 0) && (error != EFSCORRUPTED))
248 return(error);
249 if (bp) {
250 struct xfs_attr_leaf_entry *entries;
251
252 node = bp->b_addr;
253 switch (be16_to_cpu(node->hdr.info.magic)) {
254 case XFS_DA_NODE_MAGIC:
255 case XFS_DA3_NODE_MAGIC:
256 trace_xfs_attr_list_wrong_blk(context);
257 xfs_trans_brelse(NULL, bp);
258 bp = NULL;
259 break;
260 case XFS_ATTR_LEAF_MAGIC:
261 case XFS_ATTR3_LEAF_MAGIC:
262 leaf = bp->b_addr;
263 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
264 entries = xfs_attr3_leaf_entryp(leaf);
265 if (cursor->hashval > be32_to_cpu(
266 entries[leafhdr.count - 1].hashval)) {
267 trace_xfs_attr_list_wrong_blk(context);
268 xfs_trans_brelse(NULL, bp);
269 bp = NULL;
/* NOTE(review): cursor at/below the first hash presumably means duplicate
 * hashes may spill into an earlier leaf, hence the fallback to a fresh
 * root descend -- confirm against xfs_da_btree hash-collision handling. */
270 } else if (cursor->hashval <= be32_to_cpu(
271 entries[0].hashval)) {
272 trace_xfs_attr_list_wrong_blk(context);
273 xfs_trans_brelse(NULL, bp);
274 bp = NULL;
275 }
276 break;
277 default:
278 trace_xfs_attr_list_wrong_blk(context);
279 xfs_trans_brelse(NULL, bp);
280 bp = NULL;
281 }
282 }
283 }
284
285 /*
286 * We did not find what we expected given the cursor's contents,
287 * so we start from the top and work down based on the hash value.
288 * Note that start of node block is same as start of leaf block.
289 */
290 if (bp == NULL) {
291 cursor->blkno = 0;
292 for (;;) {
293 __uint16_t magic;
294
295 error = xfs_da3_node_read(NULL, context->dp,
296 cursor->blkno, -1, &bp,
297 XFS_ATTR_FORK);
298 if (error)
299 return(error);
300 node = bp->b_addr;
301 magic = be16_to_cpu(node->hdr.info.magic);
/* stop descending once a leaf is reached; bp stays held for the walk below */
302 if (magic == XFS_ATTR_LEAF_MAGIC ||
303 magic == XFS_ATTR3_LEAF_MAGIC)
304 break;
305 if (magic != XFS_DA_NODE_MAGIC &&
306 magic != XFS_DA3_NODE_MAGIC) {
307 XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)",
308 XFS_ERRLEVEL_LOW,
309 context->dp->i_mount,
310 node);
311 xfs_trans_brelse(NULL, bp);
312 return XFS_ERROR(EFSCORRUPTED);
313 }
314
315 xfs_da3_node_hdr_from_disk(&nodehdr, node);
316 btree = xfs_da3_node_tree_p(node);
/* pick the first child whose hash upper-bound covers the cursor hash */
317 for (i = 0; i < nodehdr.count; btree++, i++) {
318 if (cursor->hashval
319 <= be32_to_cpu(btree->hashval)) {
320 cursor->blkno = be32_to_cpu(btree->before);
321 trace_xfs_attr_list_node_descend(context,
322 btree);
323 break;
324 }
325 }
/* cursor hash is beyond every entry in the tree: nothing left to list */
326 if (i == nodehdr.count) {
327 xfs_trans_brelse(NULL, bp);
328 return 0;
329 }
330 xfs_trans_brelse(NULL, bp);
331 }
332 }
333 ASSERT(bp != NULL);
334
335 /*
336 * Roll upward through the blocks, processing each leaf block in
337 * order. As long as there is space in the result buffer, keep
338 * adding the information.
339 */
340 for (;;) {
341 leaf = bp->b_addr;
342 error = xfs_attr3_leaf_list_int(bp, context);
343 if (error) {
344 xfs_trans_brelse(NULL, bp);
345 return error;
346 }
347 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf);
348 if (context->seen_enough || leafhdr.forw == 0)
349 break;
350 cursor->blkno = leafhdr.forw;
351 xfs_trans_brelse(NULL, bp);
352 error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1,
353 &bp);
354 if (error)
355 return error;
356 }
357 xfs_trans_brelse(NULL, bp);
358 return 0;
359}
360
361/*
362 * Copy out attribute list entries for attr_list(), for leaf attribute lists.
363 */
364int
365xfs_attr3_leaf_list_int(
366 struct xfs_buf *bp,
367 struct xfs_attr_list_context *context)
368{
369 struct attrlist_cursor_kern *cursor;
370 struct xfs_attr_leafblock *leaf;
371 struct xfs_attr3_icleaf_hdr ichdr;
372 struct xfs_attr_leaf_entry *entries;
373 struct xfs_attr_leaf_entry *entry;
374 int retval;
375 int i;
376
377 trace_xfs_attr_list_leaf(context);
378
379 leaf = bp->b_addr;
380 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
381 entries = xfs_attr3_leaf_entryp(leaf);
382
383 cursor = context->cursor;
384 cursor->initted = 1;
385
386 /*
387 * Re-find our place in the leaf block if this is a new syscall.
388 */
389 if (context->resynch) {
390 entry = &entries[0];
391 for (i = 0; i < ichdr.count; entry++, i++) {
392 if (be32_to_cpu(entry->hashval) == cursor->hashval) {
393 if (cursor->offset == context->dupcnt) {
394 context->dupcnt = 0;
395 break;
396 }
397 context->dupcnt++;
398 } else if (be32_to_cpu(entry->hashval) >
399 cursor->hashval) {
400 context->dupcnt = 0;
401 break;
402 }
403 }
404 if (i == ichdr.count) {
405 trace_xfs_attr_list_notfound(context);
406 return 0;
407 }
408 } else {
409 entry = &entries[0];
410 i = 0;
411 }
412 context->resynch = 0;
413
414 /*
415 * We have found our place, start copying out the new attributes.
416 */
417 retval = 0;
418 for (; i < ichdr.count; entry++, i++) {
419 if (be32_to_cpu(entry->hashval) != cursor->hashval) {
420 cursor->hashval = be32_to_cpu(entry->hashval);
421 cursor->offset = 0;
422 }
423
424 if (entry->flags & XFS_ATTR_INCOMPLETE)
425 continue; /* skip incomplete entries */
426
427 if (entry->flags & XFS_ATTR_LOCAL) {
428 xfs_attr_leaf_name_local_t *name_loc =
429 xfs_attr3_leaf_name_local(leaf, i);
430
431 retval = context->put_listent(context,
432 entry->flags,
433 name_loc->nameval,
434 (int)name_loc->namelen,
435 be16_to_cpu(name_loc->valuelen),
436 &name_loc->nameval[name_loc->namelen]);
437 if (retval)
438 return retval;
439 } else {
440 xfs_attr_leaf_name_remote_t *name_rmt =
441 xfs_attr3_leaf_name_remote(leaf, i);
442
443 int valuelen = be32_to_cpu(name_rmt->valuelen);
444
445 if (context->put_value) {
446 xfs_da_args_t args;
447
448 memset((char *)&args, 0, sizeof(args));
449 args.dp = context->dp;
450 args.whichfork = XFS_ATTR_FORK;
451 args.valuelen = valuelen;
452 args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
453 args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
454 args.rmtblkcnt = xfs_attr3_rmt_blocks(
455 args.dp->i_mount, valuelen);
456 retval = xfs_attr_rmtval_get(&args);
457 if (retval)
458 return retval;
459 retval = context->put_listent(context,
460 entry->flags,
461 name_rmt->name,
462 (int)name_rmt->namelen,
463 valuelen,
464 args.value);
465 kmem_free(args.value);
466 } else {
467 retval = context->put_listent(context,
468 entry->flags,
469 name_rmt->name,
470 (int)name_rmt->namelen,
471 valuelen,
472 NULL);
473 }
474 if (retval)
475 return retval;
476 }
477 if (context->seen_enough)
478 break;
479 cursor->offset++;
480 }
481 trace_xfs_attr_list_leaf_end(context);
482 return retval;
483}
484
485/*
486 * Copy out attribute entries for attr_list(), for leaf attribute lists.
487 */
488STATIC int
489xfs_attr_leaf_list(xfs_attr_list_context_t *context)
490{
491 int error;
492 struct xfs_buf *bp;
493
494 trace_xfs_attr_leaf_list(context);
495
496 context->cursor->blkno = 0;
497 error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
498 if (error)
499 return XFS_ERROR(error);
500
501 error = xfs_attr3_leaf_list_int(bp, context);
502 xfs_trans_brelse(NULL, bp);
503 return XFS_ERROR(error);
504}
505
506int
507xfs_attr_list_int(
508 xfs_attr_list_context_t *context)
509{
510 int error;
511 xfs_inode_t *dp = context->dp;
512
513 XFS_STATS_INC(xs_attr_list);
514
515 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
516 return EIO;
517
518 xfs_ilock(dp, XFS_ILOCK_SHARED);
519
520 /*
521 * Decide on what work routines to call based on the inode size.
522 */
523 if (!xfs_inode_hasattr(dp)) {
524 error = 0;
525 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
526 error = xfs_attr_shortform_list(context);
527 } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
528 error = xfs_attr_leaf_list(context);
529 } else {
530 error = xfs_attr_node_list(context);
531 }
532
533 xfs_iunlock(dp, XFS_ILOCK_SHARED);
534
535 return error;
536}
537
538#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \
539 (((struct attrlist_ent *) 0)->a_name - (char *) 0)
540#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \
541 ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \
542 & ~(sizeof(u_int32_t)-1))
543
544/*
545 * Format an attribute and copy it out to the user's buffer.
546 * Take care to check values and protect against them changing later,
547 * we may be reading them directly out of a user buffer.
548 */
549STATIC int
550xfs_attr_put_listent(
551 xfs_attr_list_context_t *context,
552 int flags,
553 unsigned char *name,
554 int namelen,
555 int valuelen,
556 unsigned char *value)
557{
558 struct attrlist *alist = (struct attrlist *)context->alist;
559 attrlist_ent_t *aep;
560 int arraytop;
561
562 ASSERT(!(context->flags & ATTR_KERNOVAL));
563 ASSERT(context->count >= 0);
564 ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
565 ASSERT(context->firstu >= sizeof(*alist));
566 ASSERT(context->firstu <= context->bufsize);
567
568 /*
569 * Only list entries in the right namespace.
570 */
571 if (((context->flags & ATTR_SECURE) == 0) !=
572 ((flags & XFS_ATTR_SECURE) == 0))
573 return 0;
574 if (((context->flags & ATTR_ROOT) == 0) !=
575 ((flags & XFS_ATTR_ROOT) == 0))
576 return 0;
577
578 arraytop = sizeof(*alist) +
579 context->count * sizeof(alist->al_offset[0]);
580 context->firstu -= ATTR_ENTSIZE(namelen);
581 if (context->firstu < arraytop) {
582 trace_xfs_attr_list_full(context);
583 alist->al_more = 1;
584 context->seen_enough = 1;
585 return 1;
586 }
587
588 aep = (attrlist_ent_t *)&context->alist[context->firstu];
589 aep->a_valuelen = valuelen;
590 memcpy(aep->a_name, name, namelen);
591 aep->a_name[namelen] = 0;
592 alist->al_offset[context->count++] = context->firstu;
593 alist->al_count = context->count;
594 trace_xfs_attr_list_add(context);
595 return 0;
596}
597
598/*
599 * Generate a list of extended attribute names and optionally
600 * also value lengths. Positive return value follows the XFS
601 * convention of being an error, zero or negative return code
602 * is the length of the buffer returned (negated), indicating
603 * success.
604 */
605int
606xfs_attr_list(
607 xfs_inode_t *dp,
608 char *buffer,
609 int bufsize,
610 int flags,
611 attrlist_cursor_kern_t *cursor)
612{
613 xfs_attr_list_context_t context;
614 struct attrlist *alist;
615 int error;
616
617 /*
618 * Validate the cursor.
619 */
620 if (cursor->pad1 || cursor->pad2)
621 return(XFS_ERROR(EINVAL));
622 if ((cursor->initted == 0) &&
623 (cursor->hashval || cursor->blkno || cursor->offset))
624 return XFS_ERROR(EINVAL);
625
626 /*
627 * Check for a properly aligned buffer.
628 */
629 if (((long)buffer) & (sizeof(int)-1))
630 return XFS_ERROR(EFAULT);
631 if (flags & ATTR_KERNOVAL)
632 bufsize = 0;
633
634 /*
635 * Initialize the output buffer.
636 */
637 memset(&context, 0, sizeof(context));
638 context.dp = dp;
639 context.cursor = cursor;
640 context.resynch = 1;
641 context.flags = flags;
642 context.alist = buffer;
643 context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
644 context.firstu = context.bufsize;
645 context.put_listent = xfs_attr_put_listent;
646
647 alist = (struct attrlist *)context.alist;
648 alist->al_count = 0;
649 alist->al_more = 0;
650 alist->al_offset[0] = context.bufsize;
651
652 error = xfs_attr_list_int(&context);
653 ASSERT(error >= 0);
654 return error;
655}
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index ef6b0c124528..712a502de619 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -22,6 +22,7 @@
22#include "xfs_bit.h" 22#include "xfs_bit.h"
23#include "xfs_log.h" 23#include "xfs_log.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_mount.h" 28#include "xfs_mount.h"
@@ -33,6 +34,7 @@
33#include "xfs_alloc.h" 34#include "xfs_alloc.h"
34#include "xfs_inode_item.h" 35#include "xfs_inode_item.h"
35#include "xfs_bmap.h" 36#include "xfs_bmap.h"
37#include "xfs_bmap_util.h"
36#include "xfs_attr.h" 38#include "xfs_attr.h"
37#include "xfs_attr_leaf.h" 39#include "xfs_attr_leaf.h"
38#include "xfs_attr_remote.h" 40#include "xfs_attr_remote.h"
@@ -237,7 +239,7 @@ xfs_attr_rmtval_copyout(
237 xfs_ino_t ino, 239 xfs_ino_t ino,
238 int *offset, 240 int *offset,
239 int *valuelen, 241 int *valuelen,
240 char **dst) 242 __uint8_t **dst)
241{ 243{
242 char *src = bp->b_addr; 244 char *src = bp->b_addr;
243 xfs_daddr_t bno = bp->b_bn; 245 xfs_daddr_t bno = bp->b_bn;
@@ -249,7 +251,7 @@ xfs_attr_rmtval_copyout(
249 int hdr_size = 0; 251 int hdr_size = 0;
250 int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); 252 int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
251 253
252 byte_cnt = min_t(int, *valuelen, byte_cnt); 254 byte_cnt = min(*valuelen, byte_cnt);
253 255
254 if (xfs_sb_version_hascrc(&mp->m_sb)) { 256 if (xfs_sb_version_hascrc(&mp->m_sb)) {
255 if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset, 257 if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset,
@@ -284,7 +286,7 @@ xfs_attr_rmtval_copyin(
284 xfs_ino_t ino, 286 xfs_ino_t ino,
285 int *offset, 287 int *offset,
286 int *valuelen, 288 int *valuelen,
287 char **src) 289 __uint8_t **src)
288{ 290{
289 char *dst = bp->b_addr; 291 char *dst = bp->b_addr;
290 xfs_daddr_t bno = bp->b_bn; 292 xfs_daddr_t bno = bp->b_bn;
@@ -337,7 +339,7 @@ xfs_attr_rmtval_get(
337 struct xfs_mount *mp = args->dp->i_mount; 339 struct xfs_mount *mp = args->dp->i_mount;
338 struct xfs_buf *bp; 340 struct xfs_buf *bp;
339 xfs_dablk_t lblkno = args->rmtblkno; 341 xfs_dablk_t lblkno = args->rmtblkno;
340 char *dst = args->value; 342 __uint8_t *dst = args->value;
341 int valuelen = args->valuelen; 343 int valuelen = args->valuelen;
342 int nmap; 344 int nmap;
343 int error; 345 int error;
@@ -401,7 +403,7 @@ xfs_attr_rmtval_set(
401 struct xfs_bmbt_irec map; 403 struct xfs_bmbt_irec map;
402 xfs_dablk_t lblkno; 404 xfs_dablk_t lblkno;
403 xfs_fileoff_t lfileoff = 0; 405 xfs_fileoff_t lfileoff = 0;
404 char *src = args->value; 406 __uint8_t *src = args->value;
405 int blkcnt; 407 int blkcnt;
406 int valuelen; 408 int valuelen;
407 int nmap; 409 int nmap;
@@ -543,11 +545,6 @@ xfs_attr_rmtval_remove(
543 545
544 /* 546 /*
545 * Roll through the "value", invalidating the attribute value's blocks. 547 * Roll through the "value", invalidating the attribute value's blocks.
546 * Note that args->rmtblkcnt is the minimum number of data blocks we'll
547 * see for a CRC enabled remote attribute. Each extent will have a
548 * header, and so we may have more blocks than we realise here. If we
549 * fail to map the blocks correctly, we'll have problems with the buffer
550 * lookups.
551 */ 548 */
552 lblkno = args->rmtblkno; 549 lblkno = args->rmtblkno;
553 blkcnt = args->rmtblkcnt; 550 blkcnt = args->rmtblkcnt;
@@ -628,4 +625,3 @@ xfs_attr_rmtval_remove(
628 } 625 }
629 return(0); 626 return(0);
630} 627}
631
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 05c698ccb238..92b830901d60 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -17,16 +17,17 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_format.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
29#include "xfs_dir2_format.h"
30#include "xfs_dir2.h"
30#include "xfs_bmap_btree.h" 31#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 32#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 33#include "xfs_ialloc_btree.h"
@@ -39,6 +40,7 @@
39#include "xfs_extfree_item.h" 40#include "xfs_extfree_item.h"
40#include "xfs_alloc.h" 41#include "xfs_alloc.h"
41#include "xfs_bmap.h" 42#include "xfs_bmap.h"
43#include "xfs_bmap_util.h"
42#include "xfs_rtalloc.h" 44#include "xfs_rtalloc.h"
43#include "xfs_error.h" 45#include "xfs_error.h"
44#include "xfs_attr_leaf.h" 46#include "xfs_attr_leaf.h"
@@ -46,7 +48,6 @@
46#include "xfs_trans_space.h" 48#include "xfs_trans_space.h"
47#include "xfs_buf_item.h" 49#include "xfs_buf_item.h"
48#include "xfs_filestream.h" 50#include "xfs_filestream.h"
49#include "xfs_vnodeops.h"
50#include "xfs_trace.h" 51#include "xfs_trace.h"
51#include "xfs_symlink.h" 52#include "xfs_symlink.h"
52 53
@@ -108,19 +109,6 @@ xfs_bmap_compute_maxlevels(
108 mp->m_bm_maxlevels[whichfork] = level; 109 mp->m_bm_maxlevels[whichfork] = level;
109} 110}
110 111
111/*
112 * Convert the given file system block to a disk block. We have to treat it
113 * differently based on whether the file is a real time file or not, because the
114 * bmap code does.
115 */
116xfs_daddr_t
117xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
118{
119 return (XFS_IS_REALTIME_INODE(ip) ? \
120 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
121 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
122}
123
124STATIC int /* error */ 112STATIC int /* error */
125xfs_bmbt_lookup_eq( 113xfs_bmbt_lookup_eq(
126 struct xfs_btree_cur *cur, 114 struct xfs_btree_cur *cur,
@@ -263,173 +251,6 @@ xfs_bmap_forkoff_reset(
263} 251}
264 252
265/* 253/*
266 * Extent tree block counting routines.
267 */
268
269/*
270 * Count leaf blocks given a range of extent records.
271 */
272STATIC void
273xfs_bmap_count_leaves(
274 xfs_ifork_t *ifp,
275 xfs_extnum_t idx,
276 int numrecs,
277 int *count)
278{
279 int b;
280
281 for (b = 0; b < numrecs; b++) {
282 xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
283 *count += xfs_bmbt_get_blockcount(frp);
284 }
285}
286
287/*
288 * Count leaf blocks given a range of extent records originally
289 * in btree format.
290 */
291STATIC void
292xfs_bmap_disk_count_leaves(
293 struct xfs_mount *mp,
294 struct xfs_btree_block *block,
295 int numrecs,
296 int *count)
297{
298 int b;
299 xfs_bmbt_rec_t *frp;
300
301 for (b = 1; b <= numrecs; b++) {
302 frp = XFS_BMBT_REC_ADDR(mp, block, b);
303 *count += xfs_bmbt_disk_get_blockcount(frp);
304 }
305}
306
307/*
308 * Recursively walks each level of a btree
309 * to count total fsblocks is use.
310 */
311STATIC int /* error */
312xfs_bmap_count_tree(
313 xfs_mount_t *mp, /* file system mount point */
314 xfs_trans_t *tp, /* transaction pointer */
315 xfs_ifork_t *ifp, /* inode fork pointer */
316 xfs_fsblock_t blockno, /* file system block number */
317 int levelin, /* level in btree */
318 int *count) /* Count of blocks */
319{
320 int error;
321 xfs_buf_t *bp, *nbp;
322 int level = levelin;
323 __be64 *pp;
324 xfs_fsblock_t bno = blockno;
325 xfs_fsblock_t nextbno;
326 struct xfs_btree_block *block, *nextblock;
327 int numrecs;
328
329 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
330 &xfs_bmbt_buf_ops);
331 if (error)
332 return error;
333 *count += 1;
334 block = XFS_BUF_TO_BLOCK(bp);
335
336 if (--level) {
337 /* Not at node above leaves, count this level of nodes */
338 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
339 while (nextbno != NULLFSBLOCK) {
340 error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
341 XFS_BMAP_BTREE_REF,
342 &xfs_bmbt_buf_ops);
343 if (error)
344 return error;
345 *count += 1;
346 nextblock = XFS_BUF_TO_BLOCK(nbp);
347 nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
348 xfs_trans_brelse(tp, nbp);
349 }
350
351 /* Dive to the next level */
352 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
353 bno = be64_to_cpu(*pp);
354 if (unlikely((error =
355 xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
356 xfs_trans_brelse(tp, bp);
357 XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
358 XFS_ERRLEVEL_LOW, mp);
359 return XFS_ERROR(EFSCORRUPTED);
360 }
361 xfs_trans_brelse(tp, bp);
362 } else {
363 /* count all level 1 nodes and their leaves */
364 for (;;) {
365 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
366 numrecs = be16_to_cpu(block->bb_numrecs);
367 xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
368 xfs_trans_brelse(tp, bp);
369 if (nextbno == NULLFSBLOCK)
370 break;
371 bno = nextbno;
372 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
373 XFS_BMAP_BTREE_REF,
374 &xfs_bmbt_buf_ops);
375 if (error)
376 return error;
377 *count += 1;
378 block = XFS_BUF_TO_BLOCK(bp);
379 }
380 }
381 return 0;
382}
383
384/*
385 * Count fsblocks of the given fork.
386 */
387int /* error */
388xfs_bmap_count_blocks(
389 xfs_trans_t *tp, /* transaction pointer */
390 xfs_inode_t *ip, /* incore inode */
391 int whichfork, /* data or attr fork */
392 int *count) /* out: count of blocks */
393{
394 struct xfs_btree_block *block; /* current btree block */
395 xfs_fsblock_t bno; /* block # of "block" */
396 xfs_ifork_t *ifp; /* fork structure */
397 int level; /* btree level, for checking */
398 xfs_mount_t *mp; /* file system mount structure */
399 __be64 *pp; /* pointer to block address */
400
401 bno = NULLFSBLOCK;
402 mp = ip->i_mount;
403 ifp = XFS_IFORK_PTR(ip, whichfork);
404 if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
405 xfs_bmap_count_leaves(ifp, 0,
406 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
407 count);
408 return 0;
409 }
410
411 /*
412 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
413 */
414 block = ifp->if_broot;
415 level = be16_to_cpu(block->bb_level);
416 ASSERT(level > 0);
417 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
418 bno = be64_to_cpu(*pp);
419 ASSERT(bno != NULLDFSBNO);
420 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
421 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
422
423 if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
424 XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
425 mp);
426 return XFS_ERROR(EFSCORRUPTED);
427 }
428
429 return 0;
430}
431
432/*
433 * Debug/sanity checking code 254 * Debug/sanity checking code
434 */ 255 */
435 256
@@ -724,8 +545,8 @@ xfs_bmap_trace_exlist(
724 545
725/* 546/*
726 * Validate that the bmbt_irecs being returned from bmapi are valid 547 * Validate that the bmbt_irecs being returned from bmapi are valid
727 * given the callers original parameters. Specifically check the 548 * given the caller's original parameters. Specifically check the
728 * ranges of the returned irecs to ensure that they only extent beyond 549 * ranges of the returned irecs to ensure that they only extend beyond
729 * the given parameters if the XFS_BMAPI_ENTIRE flag was set. 550 * the given parameters if the XFS_BMAPI_ENTIRE flag was set.
730 */ 551 */
731STATIC void 552STATIC void
@@ -823,7 +644,7 @@ xfs_bmap_add_free(
823 * Remove the entry "free" from the free item list. Prev points to the 644 * Remove the entry "free" from the free item list. Prev points to the
824 * previous entry, unless "free" is the head of the list. 645 * previous entry, unless "free" is the head of the list.
825 */ 646 */
826STATIC void 647void
827xfs_bmap_del_free( 648xfs_bmap_del_free(
828 xfs_bmap_free_t *flist, /* free item list header */ 649 xfs_bmap_free_t *flist, /* free item list header */
829 xfs_bmap_free_item_t *prev, /* previous item on list, if any */ 650 xfs_bmap_free_item_t *prev, /* previous item on list, if any */
@@ -837,92 +658,6 @@ xfs_bmap_del_free(
837 kmem_zone_free(xfs_bmap_free_item_zone, free); 658 kmem_zone_free(xfs_bmap_free_item_zone, free);
838} 659}
839 660
840
841/*
842 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
843 * caller. Frees all the extents that need freeing, which must be done
844 * last due to locking considerations. We never free any extents in
845 * the first transaction.
846 *
847 * Return 1 if the given transaction was committed and a new one
848 * started, and 0 otherwise in the committed parameter.
849 */
850int /* error */
851xfs_bmap_finish(
852 xfs_trans_t **tp, /* transaction pointer addr */
853 xfs_bmap_free_t *flist, /* i/o: list extents to free */
854 int *committed) /* xact committed or not */
855{
856 xfs_efd_log_item_t *efd; /* extent free data */
857 xfs_efi_log_item_t *efi; /* extent free intention */
858 int error; /* error return value */
859 xfs_bmap_free_item_t *free; /* free extent item */
860 unsigned int logres; /* new log reservation */
861 unsigned int logcount; /* new log count */
862 xfs_mount_t *mp; /* filesystem mount structure */
863 xfs_bmap_free_item_t *next; /* next item on free list */
864 xfs_trans_t *ntp; /* new transaction pointer */
865
866 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
867 if (flist->xbf_count == 0) {
868 *committed = 0;
869 return 0;
870 }
871 ntp = *tp;
872 efi = xfs_trans_get_efi(ntp, flist->xbf_count);
873 for (free = flist->xbf_first; free; free = free->xbfi_next)
874 xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
875 free->xbfi_blockcount);
876 logres = ntp->t_log_res;
877 logcount = ntp->t_log_count;
878 ntp = xfs_trans_dup(*tp);
879 error = xfs_trans_commit(*tp, 0);
880 *tp = ntp;
881 *committed = 1;
882 /*
883 * We have a new transaction, so we should return committed=1,
884 * even though we're returning an error.
885 */
886 if (error)
887 return error;
888
889 /*
890 * transaction commit worked ok so we can drop the extra ticket
891 * reference that we gained in xfs_trans_dup()
892 */
893 xfs_log_ticket_put(ntp->t_ticket);
894
895 if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
896 logcount)))
897 return error;
898 efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
899 for (free = flist->xbf_first; free != NULL; free = next) {
900 next = free->xbfi_next;
901 if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
902 free->xbfi_blockcount))) {
903 /*
904 * The bmap free list will be cleaned up at a
905 * higher level. The EFI will be canceled when
906 * this transaction is aborted.
907 * Need to force shutdown here to make sure it
908 * happens, since this transaction may not be
909 * dirty yet.
910 */
911 mp = ntp->t_mountp;
912 if (!XFS_FORCED_SHUTDOWN(mp))
913 xfs_force_shutdown(mp,
914 (error == EFSCORRUPTED) ?
915 SHUTDOWN_CORRUPT_INCORE :
916 SHUTDOWN_META_IO_ERROR);
917 return error;
918 }
919 xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
920 free->xbfi_blockcount);
921 xfs_bmap_del_free(flist, NULL, free);
922 }
923 return 0;
924}
925
926/* 661/*
927 * Free up any items left in the list. 662 * Free up any items left in the list.
928 */ 663 */
@@ -1413,8 +1148,8 @@ xfs_bmap_add_attrfork(
1413 blks = XFS_ADDAFORK_SPACE_RES(mp); 1148 blks = XFS_ADDAFORK_SPACE_RES(mp);
1414 if (rsvd) 1149 if (rsvd)
1415 tp->t_flags |= XFS_TRANS_RESERVE; 1150 tp->t_flags |= XFS_TRANS_RESERVE;
1416 if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0, 1151 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
1417 XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT))) 1152 if (error)
1418 goto error0; 1153 goto error0;
1419 xfs_ilock(ip, XFS_ILOCK_EXCL); 1154 xfs_ilock(ip, XFS_ILOCK_EXCL);
1420 error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? 1155 error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
@@ -1815,7 +1550,7 @@ xfs_bmap_first_unused(
1815} 1550}
1816 1551
1817/* 1552/*
1818 * Returns the file-relative block number of the last block + 1 before 1553 * Returns the file-relative block number of the last block - 1 before
1819 * last_block (input value) in the file. 1554 * last_block (input value) in the file.
1820 * This is not based on i_size, it is based on the extent records. 1555 * This is not based on i_size, it is based on the extent records.
1821 * Returns 0 for local files, as they do not have extent records. 1556 * Returns 0 for local files, as they do not have extent records.
@@ -1863,7 +1598,7 @@ xfs_bmap_last_before(
1863 return 0; 1598 return 0;
1864} 1599}
1865 1600
1866STATIC int 1601int
1867xfs_bmap_last_extent( 1602xfs_bmap_last_extent(
1868 struct xfs_trans *tp, 1603 struct xfs_trans *tp,
1869 struct xfs_inode *ip, 1604 struct xfs_inode *ip,
@@ -1927,29 +1662,6 @@ xfs_bmap_isaeof(
1927} 1662}
1928 1663
1929/* 1664/*
1930 * Check if the endoff is outside the last extent. If so the caller will grow
1931 * the allocation to a stripe unit boundary. All offsets are considered outside
1932 * the end of file for an empty fork, so 1 is returned in *eof in that case.
1933 */
1934int
1935xfs_bmap_eof(
1936 struct xfs_inode *ip,
1937 xfs_fileoff_t endoff,
1938 int whichfork,
1939 int *eof)
1940{
1941 struct xfs_bmbt_irec rec;
1942 int error;
1943
1944 error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
1945 if (error || *eof)
1946 return error;
1947
1948 *eof = endoff >= rec.br_startoff + rec.br_blockcount;
1949 return 0;
1950}
1951
1952/*
1953 * Returns the file-relative block number of the first block past eof in 1665 * Returns the file-relative block number of the first block past eof in
1954 * the file. This is not based on i_size, it is based on the extent records. 1666 * the file. This is not based on i_size, it is based on the extent records.
1955 * Returns 0 for local files, as they do not have extent records. 1667 * Returns 0 for local files, as they do not have extent records.
@@ -3488,7 +3200,7 @@ done:
3488/* 3200/*
3489 * Adjust the size of the new extent based on di_extsize and rt extsize. 3201 * Adjust the size of the new extent based on di_extsize and rt extsize.
3490 */ 3202 */
3491STATIC int 3203int
3492xfs_bmap_extsize_align( 3204xfs_bmap_extsize_align(
3493 xfs_mount_t *mp, 3205 xfs_mount_t *mp,
3494 xfs_bmbt_irec_t *gotp, /* next extent pointer */ 3206 xfs_bmbt_irec_t *gotp, /* next extent pointer */
@@ -3650,9 +3362,9 @@ xfs_bmap_extsize_align(
3650 3362
3651#define XFS_ALLOC_GAP_UNITS 4 3363#define XFS_ALLOC_GAP_UNITS 4
3652 3364
3653STATIC void 3365void
3654xfs_bmap_adjacent( 3366xfs_bmap_adjacent(
3655 xfs_bmalloca_t *ap) /* bmap alloc argument struct */ 3367 struct xfs_bmalloca *ap) /* bmap alloc argument struct */
3656{ 3368{
3657 xfs_fsblock_t adjust; /* adjustment to block numbers */ 3369 xfs_fsblock_t adjust; /* adjustment to block numbers */
3658 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ 3370 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
@@ -3799,109 +3511,6 @@ xfs_bmap_adjacent(
3799} 3511}
3800 3512
3801STATIC int 3513STATIC int
3802xfs_bmap_rtalloc(
3803 xfs_bmalloca_t *ap) /* bmap alloc argument struct */
3804{
3805 xfs_alloctype_t atype = 0; /* type for allocation routines */
3806 int error; /* error return value */
3807 xfs_mount_t *mp; /* mount point structure */
3808 xfs_extlen_t prod = 0; /* product factor for allocators */
3809 xfs_extlen_t ralen = 0; /* realtime allocation length */
3810 xfs_extlen_t align; /* minimum allocation alignment */
3811 xfs_rtblock_t rtb;
3812
3813 mp = ap->ip->i_mount;
3814 align = xfs_get_extsz_hint(ap->ip);
3815 prod = align / mp->m_sb.sb_rextsize;
3816 error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
3817 align, 1, ap->eof, 0,
3818 ap->conv, &ap->offset, &ap->length);
3819 if (error)
3820 return error;
3821 ASSERT(ap->length);
3822 ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
3823
3824 /*
3825 * If the offset & length are not perfectly aligned
3826 * then kill prod, it will just get us in trouble.
3827 */
3828 if (do_mod(ap->offset, align) || ap->length % align)
3829 prod = 1;
3830 /*
3831 * Set ralen to be the actual requested length in rtextents.
3832 */
3833 ralen = ap->length / mp->m_sb.sb_rextsize;
3834 /*
3835 * If the old value was close enough to MAXEXTLEN that
3836 * we rounded up to it, cut it back so it's valid again.
3837 * Note that if it's a really large request (bigger than
3838 * MAXEXTLEN), we don't hear about that number, and can't
3839 * adjust the starting point to match it.
3840 */
3841 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
3842 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
3843
3844 /*
3845 * Lock out other modifications to the RT bitmap inode.
3846 */
3847 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
3848 xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
3849
3850 /*
3851 * If it's an allocation to an empty file at offset 0,
3852 * pick an extent that will space things out in the rt area.
3853 */
3854 if (ap->eof && ap->offset == 0) {
3855 xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
3856
3857 error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
3858 if (error)
3859 return error;
3860 ap->blkno = rtx * mp->m_sb.sb_rextsize;
3861 } else {
3862 ap->blkno = 0;
3863 }
3864
3865 xfs_bmap_adjacent(ap);
3866
3867 /*
3868 * Realtime allocation, done through xfs_rtallocate_extent.
3869 */
3870 atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
3871 do_div(ap->blkno, mp->m_sb.sb_rextsize);
3872 rtb = ap->blkno;
3873 ap->length = ralen;
3874 if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
3875 &ralen, atype, ap->wasdel, prod, &rtb)))
3876 return error;
3877 if (rtb == NULLFSBLOCK && prod > 1 &&
3878 (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
3879 ap->length, &ralen, atype,
3880 ap->wasdel, 1, &rtb)))
3881 return error;
3882 ap->blkno = rtb;
3883 if (ap->blkno != NULLFSBLOCK) {
3884 ap->blkno *= mp->m_sb.sb_rextsize;
3885 ralen *= mp->m_sb.sb_rextsize;
3886 ap->length = ralen;
3887 ap->ip->i_d.di_nblocks += ralen;
3888 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
3889 if (ap->wasdel)
3890 ap->ip->i_delayed_blks -= ralen;
3891 /*
3892 * Adjust the disk quota also. This was reserved
3893 * earlier.
3894 */
3895 xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
3896 ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
3897 XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
3898 } else {
3899 ap->length = 0;
3900 }
3901 return 0;
3902}
3903
3904STATIC int
3905xfs_bmap_btalloc_nullfb( 3514xfs_bmap_btalloc_nullfb(
3906 struct xfs_bmalloca *ap, 3515 struct xfs_bmalloca *ap,
3907 struct xfs_alloc_arg *args, 3516 struct xfs_alloc_arg *args,
@@ -4018,7 +3627,7 @@ xfs_bmap_btalloc_nullfb(
4018 3627
4019STATIC int 3628STATIC int
4020xfs_bmap_btalloc( 3629xfs_bmap_btalloc(
4021 xfs_bmalloca_t *ap) /* bmap alloc argument struct */ 3630 struct xfs_bmalloca *ap) /* bmap alloc argument struct */
4022{ 3631{
4023 xfs_mount_t *mp; /* mount point structure */ 3632 xfs_mount_t *mp; /* mount point structure */
4024 xfs_alloctype_t atype = 0; /* type for allocation routines */ 3633 xfs_alloctype_t atype = 0; /* type for allocation routines */
@@ -4250,7 +3859,7 @@ xfs_bmap_btalloc(
4250 */ 3859 */
4251STATIC int 3860STATIC int
4252xfs_bmap_alloc( 3861xfs_bmap_alloc(
4253 xfs_bmalloca_t *ap) /* bmap alloc argument struct */ 3862 struct xfs_bmalloca *ap) /* bmap alloc argument struct */
4254{ 3863{
4255 if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) 3864 if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata)
4256 return xfs_bmap_rtalloc(ap); 3865 return xfs_bmap_rtalloc(ap);
@@ -4638,7 +4247,7 @@ xfs_bmapi_delay(
4638} 4247}
4639 4248
4640 4249
4641STATIC int 4250int
4642__xfs_bmapi_allocate( 4251__xfs_bmapi_allocate(
4643 struct xfs_bmalloca *bma) 4252 struct xfs_bmalloca *bma)
4644{ 4253{
@@ -4648,12 +4257,9 @@ __xfs_bmapi_allocate(
4648 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); 4257 struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
4649 int tmp_logflags = 0; 4258 int tmp_logflags = 0;
4650 int error; 4259 int error;
4651 int rt;
4652 4260
4653 ASSERT(bma->length > 0); 4261 ASSERT(bma->length > 0);
4654 4262
4655 rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(bma->ip);
4656
4657 /* 4263 /*
4658 * For the wasdelay case, we could also just allocate the stuff asked 4264 * For the wasdelay case, we could also just allocate the stuff asked
4659 * for in this bmap call but that wouldn't be as good. 4265 * for in this bmap call but that wouldn't be as good.
@@ -4756,45 +4362,6 @@ __xfs_bmapi_allocate(
4756 return 0; 4362 return 0;
4757} 4363}
4758 4364
4759static void
4760xfs_bmapi_allocate_worker(
4761 struct work_struct *work)
4762{
4763 struct xfs_bmalloca *args = container_of(work,
4764 struct xfs_bmalloca, work);
4765 unsigned long pflags;
4766
4767 /* we are in a transaction context here */
4768 current_set_flags_nested(&pflags, PF_FSTRANS);
4769
4770 args->result = __xfs_bmapi_allocate(args);
4771 complete(args->done);
4772
4773 current_restore_flags_nested(&pflags, PF_FSTRANS);
4774}
4775
4776/*
4777 * Some allocation requests often come in with little stack to work on. Push
4778 * them off to a worker thread so there is lots of stack to use. Otherwise just
4779 * call directly to avoid the context switch overhead here.
4780 */
4781int
4782xfs_bmapi_allocate(
4783 struct xfs_bmalloca *args)
4784{
4785 DECLARE_COMPLETION_ONSTACK(done);
4786
4787 if (!args->stack_switch)
4788 return __xfs_bmapi_allocate(args);
4789
4790
4791 args->done = &done;
4792 INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
4793 queue_work(xfs_alloc_wq, &args->work);
4794 wait_for_completion(&done);
4795 return args->result;
4796}
4797
4798STATIC int 4365STATIC int
4799xfs_bmapi_convert_unwritten( 4366xfs_bmapi_convert_unwritten(
4800 struct xfs_bmalloca *bma, 4367 struct xfs_bmalloca *bma,
@@ -5789,359 +5356,3 @@ error0:
5789 } 5356 }
5790 return error; 5357 return error;
5791} 5358}
5792
5793/*
5794 * returns 1 for success, 0 if we failed to map the extent.
5795 */
5796STATIC int
5797xfs_getbmapx_fix_eof_hole(
5798 xfs_inode_t *ip, /* xfs incore inode pointer */
5799 struct getbmapx *out, /* output structure */
5800 int prealloced, /* this is a file with
5801 * preallocated data space */
5802 __int64_t end, /* last block requested */
5803 xfs_fsblock_t startblock)
5804{
5805 __int64_t fixlen;
5806 xfs_mount_t *mp; /* file system mount point */
5807 xfs_ifork_t *ifp; /* inode fork pointer */
5808 xfs_extnum_t lastx; /* last extent pointer */
5809 xfs_fileoff_t fileblock;
5810
5811 if (startblock == HOLESTARTBLOCK) {
5812 mp = ip->i_mount;
5813 out->bmv_block = -1;
5814 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
5815 fixlen -= out->bmv_offset;
5816 if (prealloced && out->bmv_offset + out->bmv_length == end) {
5817 /* Came to hole at EOF. Trim it. */
5818 if (fixlen <= 0)
5819 return 0;
5820 out->bmv_length = fixlen;
5821 }
5822 } else {
5823 if (startblock == DELAYSTARTBLOCK)
5824 out->bmv_block = -2;
5825 else
5826 out->bmv_block = xfs_fsb_to_db(ip, startblock);
5827 fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
5828 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
5829 if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
5830 (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
5831 out->bmv_oflags |= BMV_OF_LAST;
5832 }
5833
5834 return 1;
5835}
5836
5837/*
5838 * Get inode's extents as described in bmv, and format for output.
5839 * Calls formatter to fill the user's buffer until all extents
5840 * are mapped, until the passed-in bmv->bmv_count slots have
5841 * been filled, or until the formatter short-circuits the loop,
5842 * if it is tracking filled-in extents on its own.
5843 */
5844int /* error code */
5845xfs_getbmap(
5846 xfs_inode_t *ip,
5847 struct getbmapx *bmv, /* user bmap structure */
5848 xfs_bmap_format_t formatter, /* format to user */
5849 void *arg) /* formatter arg */
5850{
5851 __int64_t bmvend; /* last block requested */
5852 int error = 0; /* return value */
5853 __int64_t fixlen; /* length for -1 case */
5854 int i; /* extent number */
5855 int lock; /* lock state */
5856 xfs_bmbt_irec_t *map; /* buffer for user's data */
5857 xfs_mount_t *mp; /* file system mount point */
5858 int nex; /* # of user extents can do */
5859 int nexleft; /* # of user extents left */
5860 int subnex; /* # of bmapi's can do */
5861 int nmap; /* number of map entries */
5862 struct getbmapx *out; /* output structure */
5863 int whichfork; /* data or attr fork */
5864 int prealloced; /* this is a file with
5865 * preallocated data space */
5866 int iflags; /* interface flags */
5867 int bmapi_flags; /* flags for xfs_bmapi */
5868 int cur_ext = 0;
5869
5870 mp = ip->i_mount;
5871 iflags = bmv->bmv_iflags;
5872 whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
5873
5874 if (whichfork == XFS_ATTR_FORK) {
5875 if (XFS_IFORK_Q(ip)) {
5876 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
5877 ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
5878 ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
5879 return XFS_ERROR(EINVAL);
5880 } else if (unlikely(
5881 ip->i_d.di_aformat != 0 &&
5882 ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
5883 XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
5884 ip->i_mount);
5885 return XFS_ERROR(EFSCORRUPTED);
5886 }
5887
5888 prealloced = 0;
5889 fixlen = 1LL << 32;
5890 } else {
5891 if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
5892 ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
5893 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
5894 return XFS_ERROR(EINVAL);
5895
5896 if (xfs_get_extsz_hint(ip) ||
5897 ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
5898 prealloced = 1;
5899 fixlen = mp->m_super->s_maxbytes;
5900 } else {
5901 prealloced = 0;
5902 fixlen = XFS_ISIZE(ip);
5903 }
5904 }
5905
5906 if (bmv->bmv_length == -1) {
5907 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
5908 bmv->bmv_length =
5909 max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
5910 } else if (bmv->bmv_length == 0) {
5911 bmv->bmv_entries = 0;
5912 return 0;
5913 } else if (bmv->bmv_length < 0) {
5914 return XFS_ERROR(EINVAL);
5915 }
5916
5917 nex = bmv->bmv_count - 1;
5918 if (nex <= 0)
5919 return XFS_ERROR(EINVAL);
5920 bmvend = bmv->bmv_offset + bmv->bmv_length;
5921
5922
5923 if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
5924 return XFS_ERROR(ENOMEM);
5925 out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
5926 if (!out) {
5927 out = kmem_zalloc_large(bmv->bmv_count *
5928 sizeof(struct getbmapx));
5929 if (!out)
5930 return XFS_ERROR(ENOMEM);
5931 }
5932
5933 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5934 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
5935 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
5936 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
5937 if (error)
5938 goto out_unlock_iolock;
5939 }
5940 /*
5941 * even after flushing the inode, there can still be delalloc
5942 * blocks on the inode beyond EOF due to speculative
5943 * preallocation. These are not removed until the release
5944 * function is called or the inode is inactivated. Hence we
5945 * cannot assert here that ip->i_delayed_blks == 0.
5946 */
5947 }
5948
5949 lock = xfs_ilock_map_shared(ip);
5950
5951 /*
5952 * Don't let nex be bigger than the number of extents
5953 * we can have assuming alternating holes and real extents.
5954 */
5955 if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
5956 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
5957
5958 bmapi_flags = xfs_bmapi_aflag(whichfork);
5959 if (!(iflags & BMV_IF_PREALLOC))
5960 bmapi_flags |= XFS_BMAPI_IGSTATE;
5961
5962 /*
5963 * Allocate enough space to handle "subnex" maps at a time.
5964 */
5965 error = ENOMEM;
5966 subnex = 16;
5967 map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
5968 if (!map)
5969 goto out_unlock_ilock;
5970
5971 bmv->bmv_entries = 0;
5972
5973 if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
5974 (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
5975 error = 0;
5976 goto out_free_map;
5977 }
5978
5979 nexleft = nex;
5980
5981 do {
5982 nmap = (nexleft > subnex) ? subnex : nexleft;
5983 error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
5984 XFS_BB_TO_FSB(mp, bmv->bmv_length),
5985 map, &nmap, bmapi_flags);
5986 if (error)
5987 goto out_free_map;
5988 ASSERT(nmap <= subnex);
5989
5990 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
5991 out[cur_ext].bmv_oflags = 0;
5992 if (map[i].br_state == XFS_EXT_UNWRITTEN)
5993 out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
5994 else if (map[i].br_startblock == DELAYSTARTBLOCK)
5995 out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
5996 out[cur_ext].bmv_offset =
5997 XFS_FSB_TO_BB(mp, map[i].br_startoff);
5998 out[cur_ext].bmv_length =
5999 XFS_FSB_TO_BB(mp, map[i].br_blockcount);
6000 out[cur_ext].bmv_unused1 = 0;
6001 out[cur_ext].bmv_unused2 = 0;
6002
6003 /*
6004 * delayed allocation extents that start beyond EOF can
6005 * occur due to speculative EOF allocation when the
6006 * delalloc extent is larger than the largest freespace
6007 * extent at conversion time. These extents cannot be
6008 * converted by data writeback, so can exist here even
6009 * if we are not supposed to be finding delalloc
6010 * extents.
6011 */
6012 if (map[i].br_startblock == DELAYSTARTBLOCK &&
6013 map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
6014 ASSERT((iflags & BMV_IF_DELALLOC) != 0);
6015
6016 if (map[i].br_startblock == HOLESTARTBLOCK &&
6017 whichfork == XFS_ATTR_FORK) {
6018 /* came to the end of attribute fork */
6019 out[cur_ext].bmv_oflags |= BMV_OF_LAST;
6020 goto out_free_map;
6021 }
6022
6023 if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
6024 prealloced, bmvend,
6025 map[i].br_startblock))
6026 goto out_free_map;
6027
6028 bmv->bmv_offset =
6029 out[cur_ext].bmv_offset +
6030 out[cur_ext].bmv_length;
6031 bmv->bmv_length =
6032 max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
6033
6034 /*
6035 * In case we don't want to return the hole,
6036 * don't increase cur_ext so that we can reuse
6037 * it in the next loop.
6038 */
6039 if ((iflags & BMV_IF_NO_HOLES) &&
6040 map[i].br_startblock == HOLESTARTBLOCK) {
6041 memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
6042 continue;
6043 }
6044
6045 nexleft--;
6046 bmv->bmv_entries++;
6047 cur_ext++;
6048 }
6049 } while (nmap && nexleft && bmv->bmv_length);
6050
6051 out_free_map:
6052 kmem_free(map);
6053 out_unlock_ilock:
6054 xfs_iunlock_map_shared(ip, lock);
6055 out_unlock_iolock:
6056 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
6057
6058 for (i = 0; i < cur_ext; i++) {
6059 int full = 0; /* user array is full */
6060
6061 /* format results & advance arg */
6062 error = formatter(&arg, &out[i], &full);
6063 if (error || full)
6064 break;
6065 }
6066
6067 if (is_vmalloc_addr(out))
6068 kmem_free_large(out);
6069 else
6070 kmem_free(out);
6071 return error;
6072}
6073
6074/*
6075 * dead simple method of punching delalyed allocation blocks from a range in
6076 * the inode. Walks a block at a time so will be slow, but is only executed in
6077 * rare error cases so the overhead is not critical. This will alays punch out
6078 * both the start and end blocks, even if the ranges only partially overlap
6079 * them, so it is up to the caller to ensure that partial blocks are not
6080 * passed in.
6081 */
6082int
6083xfs_bmap_punch_delalloc_range(
6084 struct xfs_inode *ip,
6085 xfs_fileoff_t start_fsb,
6086 xfs_fileoff_t length)
6087{
6088 xfs_fileoff_t remaining = length;
6089 int error = 0;
6090
6091 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
6092
6093 do {
6094 int done;
6095 xfs_bmbt_irec_t imap;
6096 int nimaps = 1;
6097 xfs_fsblock_t firstblock;
6098 xfs_bmap_free_t flist;
6099
6100 /*
6101 * Map the range first and check that it is a delalloc extent
6102 * before trying to unmap the range. Otherwise we will be
6103 * trying to remove a real extent (which requires a
6104 * transaction) or a hole, which is probably a bad idea...
6105 */
6106 error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
6107 XFS_BMAPI_ENTIRE);
6108
6109 if (error) {
6110 /* something screwed, just bail */
6111 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
6112 xfs_alert(ip->i_mount,
6113 "Failed delalloc mapping lookup ino %lld fsb %lld.",
6114 ip->i_ino, start_fsb);
6115 }
6116 break;
6117 }
6118 if (!nimaps) {
6119 /* nothing there */
6120 goto next_block;
6121 }
6122 if (imap.br_startblock != DELAYSTARTBLOCK) {
6123 /* been converted, ignore */
6124 goto next_block;
6125 }
6126 WARN_ON(imap.br_blockcount == 0);
6127
6128 /*
6129 * Note: while we initialise the firstblock/flist pair, they
6130 * should never be used because blocks should never be
6131 * allocated or freed for a delalloc extent and hence we need
6132 * don't cancel or finish them after the xfs_bunmapi() call.
6133 */
6134 xfs_bmap_init(&flist, &firstblock);
6135 error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
6136 &flist, &done);
6137 if (error)
6138 break;
6139
6140 ASSERT(!flist.xbf_count && !flist.xbf_first);
6141next_block:
6142 start_fsb++;
6143 remaining--;
6144 } while(remaining > 0);
6145
6146 return error;
6147}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 1cf1292d29b7..33b41f351225 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -108,41 +108,6 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
108} 108}
109 109
110/* 110/*
111 * Argument structure for xfs_bmap_alloc.
112 */
113typedef struct xfs_bmalloca {
114 xfs_fsblock_t *firstblock; /* i/o first block allocated */
115 struct xfs_bmap_free *flist; /* bmap freelist */
116 struct xfs_trans *tp; /* transaction pointer */
117 struct xfs_inode *ip; /* incore inode pointer */
118 struct xfs_bmbt_irec prev; /* extent before the new one */
119 struct xfs_bmbt_irec got; /* extent after, or delayed */
120
121 xfs_fileoff_t offset; /* offset in file filling in */
122 xfs_extlen_t length; /* i/o length asked/allocated */
123 xfs_fsblock_t blkno; /* starting block of new extent */
124
125 struct xfs_btree_cur *cur; /* btree cursor */
126 xfs_extnum_t idx; /* current extent index */
127 int nallocs;/* number of extents alloc'd */
128 int logflags;/* flags for transaction logging */
129
130 xfs_extlen_t total; /* total blocks needed for xaction */
131 xfs_extlen_t minlen; /* minimum allocation size (blocks) */
132 xfs_extlen_t minleft; /* amount must be left after alloc */
133 char eof; /* set if allocating past last extent */
134 char wasdel; /* replacing a delayed allocation */
135 char userdata;/* set if is user data */
136 char aeof; /* allocated space at eof */
137 char conv; /* overwriting unwritten extents */
138 char stack_switch;
139 int flags;
140 struct completion *done;
141 struct work_struct work;
142 int result;
143} xfs_bmalloca_t;
144
145/*
146 * Flags for xfs_bmap_add_extent*. 111 * Flags for xfs_bmap_add_extent*.
147 */ 112 */
148#define BMAP_LEFT_CONTIG (1 << 0) 113#define BMAP_LEFT_CONTIG (1 << 0)
@@ -162,7 +127,7 @@ typedef struct xfs_bmalloca {
162 { BMAP_RIGHT_FILLING, "RF" }, \ 127 { BMAP_RIGHT_FILLING, "RF" }, \
163 { BMAP_ATTRFORK, "ATTR" } 128 { BMAP_ATTRFORK, "ATTR" }
164 129
165#if defined(__KERNEL) && defined(DEBUG) 130#ifdef DEBUG
166void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, 131void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
167 int whichfork, unsigned long caller_ip); 132 int whichfork, unsigned long caller_ip);
168#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ 133#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
@@ -205,23 +170,4 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
205 xfs_extnum_t num); 170 xfs_extnum_t num);
206uint xfs_default_attroffset(struct xfs_inode *ip); 171uint xfs_default_attroffset(struct xfs_inode *ip);
207 172
208#ifdef __KERNEL__
209/* bmap to userspace formatter - copy to user & advance pointer */
210typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
211
212int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
213 int *committed);
214int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
215 xfs_bmap_format_t formatter, void *arg);
216int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
217 int whichfork, int *eof);
218int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
219 int whichfork, int *count);
220int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
221 xfs_fileoff_t start_fsb, xfs_fileoff_t length);
222
223xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
224
225#endif /* __KERNEL__ */
226
227#endif /* __XFS_BMAP_H__ */ 173#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 0c61a22be6fd..cf3bc76710c3 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -17,7 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_format.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_trans.h" 23#include "xfs_trans.h"
@@ -722,7 +722,7 @@ xfs_bmbt_key_diff(
722 cur->bc_rec.b.br_startoff; 722 cur->bc_rec.b.br_startoff;
723} 723}
724 724
725static int 725static bool
726xfs_bmbt_verify( 726xfs_bmbt_verify(
727 struct xfs_buf *bp) 727 struct xfs_buf *bp)
728{ 728{
@@ -775,7 +775,6 @@ xfs_bmbt_verify(
775 return false; 775 return false;
776 776
777 return true; 777 return true;
778
779} 778}
780 779
781static void 780static void
@@ -789,7 +788,6 @@ xfs_bmbt_read_verify(
789 bp->b_target->bt_mount, bp->b_addr); 788 bp->b_target->bt_mount, bp->b_addr);
790 xfs_buf_ioerror(bp, EFSCORRUPTED); 789 xfs_buf_ioerror(bp, EFSCORRUPTED);
791 } 790 }
792
793} 791}
794 792
795static void 793static void
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
new file mode 100644
index 000000000000..541d59f5e658
--- /dev/null
+++ b/fs/xfs/xfs_bmap_util.c
@@ -0,0 +1,2026 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * Copyright (c) 2012 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_format.h"
22#include "xfs_bit.h"
23#include "xfs_log.h"
24#include "xfs_inum.h"
25#include "xfs_trans.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h"
29#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dinode.h"
34#include "xfs_inode.h"
35#include "xfs_btree.h"
36#include "xfs_extfree_item.h"
37#include "xfs_alloc.h"
38#include "xfs_bmap.h"
39#include "xfs_bmap_util.h"
40#include "xfs_rtalloc.h"
41#include "xfs_error.h"
42#include "xfs_quota.h"
43#include "xfs_trans_space.h"
44#include "xfs_trace.h"
45#include "xfs_icache.h"
46
47/* Kernel only BMAP related definitions and functions */
48
49/*
50 * Convert the given file system block to a disk block. We have to treat it
51 * differently based on whether the file is a real time file or not, because the
52 * bmap code does.
53 */
54xfs_daddr_t
55xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
56{
57 return (XFS_IS_REALTIME_INODE(ip) ? \
58 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \
59 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
60}
61
62/*
63 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
64 * caller. Frees all the extents that need freeing, which must be done
65 * last due to locking considerations. We never free any extents in
66 * the first transaction.
67 *
68 * Return 1 if the given transaction was committed and a new one
69 * started, and 0 otherwise in the committed parameter.
70 */
71int /* error */
72xfs_bmap_finish(
73 xfs_trans_t **tp, /* transaction pointer addr */
74 xfs_bmap_free_t *flist, /* i/o: list extents to free */
75 int *committed) /* xact committed or not */
76{
77 xfs_efd_log_item_t *efd; /* extent free data */
78 xfs_efi_log_item_t *efi; /* extent free intention */
79 int error; /* error return value */
80 xfs_bmap_free_item_t *free; /* free extent item */
81 struct xfs_trans_res tres; /* new log reservation */
82 xfs_mount_t *mp; /* filesystem mount structure */
83 xfs_bmap_free_item_t *next; /* next item on free list */
84 xfs_trans_t *ntp; /* new transaction pointer */
85
86 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
87 if (flist->xbf_count == 0) {
88 *committed = 0;
89 return 0;
90 }
91 ntp = *tp;
92 efi = xfs_trans_get_efi(ntp, flist->xbf_count);
93 for (free = flist->xbf_first; free; free = free->xbfi_next)
94 xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock,
95 free->xbfi_blockcount);
96
97 tres.tr_logres = ntp->t_log_res;
98 tres.tr_logcount = ntp->t_log_count;
99 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
100 ntp = xfs_trans_dup(*tp);
101 error = xfs_trans_commit(*tp, 0);
102 *tp = ntp;
103 *committed = 1;
104 /*
105 * We have a new transaction, so we should return committed=1,
106 * even though we're returning an error.
107 */
108 if (error)
109 return error;
110
111 /*
112 * transaction commit worked ok so we can drop the extra ticket
113 * reference that we gained in xfs_trans_dup()
114 */
115 xfs_log_ticket_put(ntp->t_ticket);
116
117 error = xfs_trans_reserve(ntp, &tres, 0, 0);
118 if (error)
119 return error;
120 efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count);
121 for (free = flist->xbf_first; free != NULL; free = next) {
122 next = free->xbfi_next;
123 if ((error = xfs_free_extent(ntp, free->xbfi_startblock,
124 free->xbfi_blockcount))) {
125 /*
126 * The bmap free list will be cleaned up at a
127 * higher level. The EFI will be canceled when
128 * this transaction is aborted.
129 * Need to force shutdown here to make sure it
130 * happens, since this transaction may not be
131 * dirty yet.
132 */
133 mp = ntp->t_mountp;
134 if (!XFS_FORCED_SHUTDOWN(mp))
135 xfs_force_shutdown(mp,
136 (error == EFSCORRUPTED) ?
137 SHUTDOWN_CORRUPT_INCORE :
138 SHUTDOWN_META_IO_ERROR);
139 return error;
140 }
141 xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock,
142 free->xbfi_blockcount);
143 xfs_bmap_del_free(flist, NULL, free);
144 }
145 return 0;
146}
147
148int
149xfs_bmap_rtalloc(
150 struct xfs_bmalloca *ap) /* bmap alloc argument struct */
151{
152 xfs_alloctype_t atype = 0; /* type for allocation routines */
153 int error; /* error return value */
154 xfs_mount_t *mp; /* mount point structure */
155 xfs_extlen_t prod = 0; /* product factor for allocators */
156 xfs_extlen_t ralen = 0; /* realtime allocation length */
157 xfs_extlen_t align; /* minimum allocation alignment */
158 xfs_rtblock_t rtb;
159
160 mp = ap->ip->i_mount;
161 align = xfs_get_extsz_hint(ap->ip);
162 prod = align / mp->m_sb.sb_rextsize;
163 error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
164 align, 1, ap->eof, 0,
165 ap->conv, &ap->offset, &ap->length);
166 if (error)
167 return error;
168 ASSERT(ap->length);
169 ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
170
171 /*
172 * If the offset & length are not perfectly aligned
173 * then kill prod, it will just get us in trouble.
174 */
175 if (do_mod(ap->offset, align) || ap->length % align)
176 prod = 1;
177 /*
178 * Set ralen to be the actual requested length in rtextents.
179 */
180 ralen = ap->length / mp->m_sb.sb_rextsize;
181 /*
182 * If the old value was close enough to MAXEXTLEN that
183 * we rounded up to it, cut it back so it's valid again.
184 * Note that if it's a really large request (bigger than
185 * MAXEXTLEN), we don't hear about that number, and can't
186 * adjust the starting point to match it.
187 */
188 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
189 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
190
191 /*
192 * Lock out other modifications to the RT bitmap inode.
193 */
194 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
195 xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
196
197 /*
198 * If it's an allocation to an empty file at offset 0,
199 * pick an extent that will space things out in the rt area.
200 */
201 if (ap->eof && ap->offset == 0) {
202 xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
203
204 error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
205 if (error)
206 return error;
207 ap->blkno = rtx * mp->m_sb.sb_rextsize;
208 } else {
209 ap->blkno = 0;
210 }
211
212 xfs_bmap_adjacent(ap);
213
214 /*
215 * Realtime allocation, done through xfs_rtallocate_extent.
216 */
217 atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
218 do_div(ap->blkno, mp->m_sb.sb_rextsize);
219 rtb = ap->blkno;
220 ap->length = ralen;
221 if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
222 &ralen, atype, ap->wasdel, prod, &rtb)))
223 return error;
224 if (rtb == NULLFSBLOCK && prod > 1 &&
225 (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
226 ap->length, &ralen, atype,
227 ap->wasdel, 1, &rtb)))
228 return error;
229 ap->blkno = rtb;
230 if (ap->blkno != NULLFSBLOCK) {
231 ap->blkno *= mp->m_sb.sb_rextsize;
232 ralen *= mp->m_sb.sb_rextsize;
233 ap->length = ralen;
234 ap->ip->i_d.di_nblocks += ralen;
235 xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
236 if (ap->wasdel)
237 ap->ip->i_delayed_blks -= ralen;
238 /*
239 * Adjust the disk quota also. This was reserved
240 * earlier.
241 */
242 xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
243 ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
244 XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
245 } else {
246 ap->length = 0;
247 }
248 return 0;
249}
250
251/*
252 * Stack switching interfaces for allocation
253 */
254static void
255xfs_bmapi_allocate_worker(
256 struct work_struct *work)
257{
258 struct xfs_bmalloca *args = container_of(work,
259 struct xfs_bmalloca, work);
260 unsigned long pflags;
261
262 /* we are in a transaction context here */
263 current_set_flags_nested(&pflags, PF_FSTRANS);
264
265 args->result = __xfs_bmapi_allocate(args);
266 complete(args->done);
267
268 current_restore_flags_nested(&pflags, PF_FSTRANS);
269}
270
271/*
272 * Some allocation requests often come in with little stack to work on. Push
273 * them off to a worker thread so there is lots of stack to use. Otherwise just
274 * call directly to avoid the context switch overhead here.
275 */
276int
277xfs_bmapi_allocate(
278 struct xfs_bmalloca *args)
279{
280 DECLARE_COMPLETION_ONSTACK(done);
281
282 if (!args->stack_switch)
283 return __xfs_bmapi_allocate(args);
284
285
286 args->done = &done;
287 INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
288 queue_work(xfs_alloc_wq, &args->work);
289 wait_for_completion(&done);
290 return args->result;
291}
292
293/*
294 * Check if the endoff is outside the last extent. If so the caller will grow
295 * the allocation to a stripe unit boundary. All offsets are considered outside
296 * the end of file for an empty fork, so 1 is returned in *eof in that case.
297 */
298int
299xfs_bmap_eof(
300 struct xfs_inode *ip,
301 xfs_fileoff_t endoff,
302 int whichfork,
303 int *eof)
304{
305 struct xfs_bmbt_irec rec;
306 int error;
307
308 error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
309 if (error || *eof)
310 return error;
311
312 *eof = endoff >= rec.br_startoff + rec.br_blockcount;
313 return 0;
314}
315
316/*
317 * Extent tree block counting routines.
318 */
319
320/*
321 * Count leaf blocks given a range of extent records.
322 */
323STATIC void
324xfs_bmap_count_leaves(
325 xfs_ifork_t *ifp,
326 xfs_extnum_t idx,
327 int numrecs,
328 int *count)
329{
330 int b;
331
332 for (b = 0; b < numrecs; b++) {
333 xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
334 *count += xfs_bmbt_get_blockcount(frp);
335 }
336}
337
338/*
339 * Count leaf blocks given a range of extent records originally
340 * in btree format.
341 */
342STATIC void
343xfs_bmap_disk_count_leaves(
344 struct xfs_mount *mp,
345 struct xfs_btree_block *block,
346 int numrecs,
347 int *count)
348{
349 int b;
350 xfs_bmbt_rec_t *frp;
351
352 for (b = 1; b <= numrecs; b++) {
353 frp = XFS_BMBT_REC_ADDR(mp, block, b);
354 *count += xfs_bmbt_disk_get_blockcount(frp);
355 }
356}
357
358/*
359 * Recursively walks each level of a btree
360 * to count total fsblocks in use.
361 */
362STATIC int /* error */
363xfs_bmap_count_tree(
364 xfs_mount_t *mp, /* file system mount point */
365 xfs_trans_t *tp, /* transaction pointer */
366 xfs_ifork_t *ifp, /* inode fork pointer */
367 xfs_fsblock_t blockno, /* file system block number */
368 int levelin, /* level in btree */
369 int *count) /* Count of blocks */
370{
371 int error;
372 xfs_buf_t *bp, *nbp;
373 int level = levelin;
374 __be64 *pp;
375 xfs_fsblock_t bno = blockno;
376 xfs_fsblock_t nextbno;
377 struct xfs_btree_block *block, *nextblock;
378 int numrecs;
379
380 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
381 &xfs_bmbt_buf_ops);
382 if (error)
383 return error;
384 *count += 1;
385 block = XFS_BUF_TO_BLOCK(bp);
386
387 if (--level) {
388 /* Not at node above leaves, count this level of nodes */
389 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
390 while (nextbno != NULLFSBLOCK) {
391 error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
392 XFS_BMAP_BTREE_REF,
393 &xfs_bmbt_buf_ops);
394 if (error)
395 return error;
396 *count += 1;
397 nextblock = XFS_BUF_TO_BLOCK(nbp);
398 nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
399 xfs_trans_brelse(tp, nbp);
400 }
401
402 /* Dive to the next level */
403 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
404 bno = be64_to_cpu(*pp);
405 if (unlikely((error =
406 xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
407 xfs_trans_brelse(tp, bp);
408 XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
409 XFS_ERRLEVEL_LOW, mp);
410 return XFS_ERROR(EFSCORRUPTED);
411 }
412 xfs_trans_brelse(tp, bp);
413 } else {
414 /* count all level 1 nodes and their leaves */
415 for (;;) {
416 nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
417 numrecs = be16_to_cpu(block->bb_numrecs);
418 xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
419 xfs_trans_brelse(tp, bp);
420 if (nextbno == NULLFSBLOCK)
421 break;
422 bno = nextbno;
423 error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
424 XFS_BMAP_BTREE_REF,
425 &xfs_bmbt_buf_ops);
426 if (error)
427 return error;
428 *count += 1;
429 block = XFS_BUF_TO_BLOCK(bp);
430 }
431 }
432 return 0;
433}
434
435/*
436 * Count fsblocks of the given fork.
437 */
438int /* error */
439xfs_bmap_count_blocks(
440 xfs_trans_t *tp, /* transaction pointer */
441 xfs_inode_t *ip, /* incore inode */
442 int whichfork, /* data or attr fork */
443 int *count) /* out: count of blocks */
444{
445 struct xfs_btree_block *block; /* current btree block */
446 xfs_fsblock_t bno; /* block # of "block" */
447 xfs_ifork_t *ifp; /* fork structure */
448 int level; /* btree level, for checking */
449 xfs_mount_t *mp; /* file system mount structure */
450 __be64 *pp; /* pointer to block address */
451
452 bno = NULLFSBLOCK;
453 mp = ip->i_mount;
454 ifp = XFS_IFORK_PTR(ip, whichfork);
455 if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
456 xfs_bmap_count_leaves(ifp, 0,
457 ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
458 count);
459 return 0;
460 }
461
462 /*
463 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
464 */
465 block = ifp->if_broot;
466 level = be16_to_cpu(block->bb_level);
467 ASSERT(level > 0);
468 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
469 bno = be64_to_cpu(*pp);
470 ASSERT(bno != NULLDFSBNO);
471 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
472 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
473
474 if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
475 XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
476 mp);
477 return XFS_ERROR(EFSCORRUPTED);
478 }
479
480 return 0;
481}
482
483/*
484 * returns 1 for success, 0 if we failed to map the extent.
485 */
486STATIC int
487xfs_getbmapx_fix_eof_hole(
488 xfs_inode_t *ip, /* xfs incore inode pointer */
489 struct getbmapx *out, /* output structure */
490 int prealloced, /* this is a file with
491 * preallocated data space */
492 __int64_t end, /* last block requested */
493 xfs_fsblock_t startblock)
494{
495 __int64_t fixlen;
496 xfs_mount_t *mp; /* file system mount point */
497 xfs_ifork_t *ifp; /* inode fork pointer */
498 xfs_extnum_t lastx; /* last extent pointer */
499 xfs_fileoff_t fileblock;
500
501 if (startblock == HOLESTARTBLOCK) {
502 mp = ip->i_mount;
503 out->bmv_block = -1;
504 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
505 fixlen -= out->bmv_offset;
506 if (prealloced && out->bmv_offset + out->bmv_length == end) {
507 /* Came to hole at EOF. Trim it. */
508 if (fixlen <= 0)
509 return 0;
510 out->bmv_length = fixlen;
511 }
512 } else {
513 if (startblock == DELAYSTARTBLOCK)
514 out->bmv_block = -2;
515 else
516 out->bmv_block = xfs_fsb_to_db(ip, startblock);
517 fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
518 ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
519 if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
520 (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
521 out->bmv_oflags |= BMV_OF_LAST;
522 }
523
524 return 1;
525}
526
527/*
528 * Get inode's extents as described in bmv, and format for output.
529 * Calls formatter to fill the user's buffer until all extents
530 * are mapped, until the passed-in bmv->bmv_count slots have
531 * been filled, or until the formatter short-circuits the loop,
532 * if it is tracking filled-in extents on its own.
533 */
534int /* error code */
535xfs_getbmap(
536 xfs_inode_t *ip,
537 struct getbmapx *bmv, /* user bmap structure */
538 xfs_bmap_format_t formatter, /* format to user */
539 void *arg) /* formatter arg */
540{
541 __int64_t bmvend; /* last block requested */
542 int error = 0; /* return value */
543 __int64_t fixlen; /* length for -1 case */
544 int i; /* extent number */
545 int lock; /* lock state */
546 xfs_bmbt_irec_t *map; /* buffer for user's data */
547 xfs_mount_t *mp; /* file system mount point */
548 int nex; /* # of user extents can do */
549 int nexleft; /* # of user extents left */
550 int subnex; /* # of bmapi's can do */
551 int nmap; /* number of map entries */
552 struct getbmapx *out; /* output structure */
553 int whichfork; /* data or attr fork */
554 int prealloced; /* this is a file with
555 * preallocated data space */
556 int iflags; /* interface flags */
557 int bmapi_flags; /* flags for xfs_bmapi */
558 int cur_ext = 0;
559
560 mp = ip->i_mount;
561 iflags = bmv->bmv_iflags;
562 whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
563
564 if (whichfork == XFS_ATTR_FORK) {
565 if (XFS_IFORK_Q(ip)) {
566 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
567 ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
568 ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
569 return XFS_ERROR(EINVAL);
570 } else if (unlikely(
571 ip->i_d.di_aformat != 0 &&
572 ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
573 XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
574 ip->i_mount);
575 return XFS_ERROR(EFSCORRUPTED);
576 }
577
578 prealloced = 0;
579 fixlen = 1LL << 32;
580 } else {
581 if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
582 ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
583 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
584 return XFS_ERROR(EINVAL);
585
586 if (xfs_get_extsz_hint(ip) ||
587 ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
588 prealloced = 1;
589 fixlen = mp->m_super->s_maxbytes;
590 } else {
591 prealloced = 0;
592 fixlen = XFS_ISIZE(ip);
593 }
594 }
595
596 if (bmv->bmv_length == -1) {
597 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
598 bmv->bmv_length =
599 max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
600 } else if (bmv->bmv_length == 0) {
601 bmv->bmv_entries = 0;
602 return 0;
603 } else if (bmv->bmv_length < 0) {
604 return XFS_ERROR(EINVAL);
605 }
606
607 nex = bmv->bmv_count - 1;
608 if (nex <= 0)
609 return XFS_ERROR(EINVAL);
610 bmvend = bmv->bmv_offset + bmv->bmv_length;
611
612
613 if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
614 return XFS_ERROR(ENOMEM);
615 out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
616 if (!out) {
617 out = kmem_zalloc_large(bmv->bmv_count *
618 sizeof(struct getbmapx));
619 if (!out)
620 return XFS_ERROR(ENOMEM);
621 }
622
623 xfs_ilock(ip, XFS_IOLOCK_SHARED);
624 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
625 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
626 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
627 if (error)
628 goto out_unlock_iolock;
629 }
630 /*
631 * even after flushing the inode, there can still be delalloc
632 * blocks on the inode beyond EOF due to speculative
633 * preallocation. These are not removed until the release
634 * function is called or the inode is inactivated. Hence we
635 * cannot assert here that ip->i_delayed_blks == 0.
636 */
637 }
638
639 lock = xfs_ilock_map_shared(ip);
640
641 /*
642 * Don't let nex be bigger than the number of extents
643 * we can have assuming alternating holes and real extents.
644 */
645 if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
646 nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
647
648 bmapi_flags = xfs_bmapi_aflag(whichfork);
649 if (!(iflags & BMV_IF_PREALLOC))
650 bmapi_flags |= XFS_BMAPI_IGSTATE;
651
652 /*
653 * Allocate enough space to handle "subnex" maps at a time.
654 */
655 error = ENOMEM;
656 subnex = 16;
657 map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
658 if (!map)
659 goto out_unlock_ilock;
660
661 bmv->bmv_entries = 0;
662
663 if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
664 (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
665 error = 0;
666 goto out_free_map;
667 }
668
669 nexleft = nex;
670
671 do {
672 nmap = (nexleft > subnex) ? subnex : nexleft;
673 error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
674 XFS_BB_TO_FSB(mp, bmv->bmv_length),
675 map, &nmap, bmapi_flags);
676 if (error)
677 goto out_free_map;
678 ASSERT(nmap <= subnex);
679
680 for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
681 out[cur_ext].bmv_oflags = 0;
682 if (map[i].br_state == XFS_EXT_UNWRITTEN)
683 out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
684 else if (map[i].br_startblock == DELAYSTARTBLOCK)
685 out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
686 out[cur_ext].bmv_offset =
687 XFS_FSB_TO_BB(mp, map[i].br_startoff);
688 out[cur_ext].bmv_length =
689 XFS_FSB_TO_BB(mp, map[i].br_blockcount);
690 out[cur_ext].bmv_unused1 = 0;
691 out[cur_ext].bmv_unused2 = 0;
692
693 /*
694 * delayed allocation extents that start beyond EOF can
695 * occur due to speculative EOF allocation when the
696 * delalloc extent is larger than the largest freespace
697 * extent at conversion time. These extents cannot be
698 * converted by data writeback, so can exist here even
699 * if we are not supposed to be finding delalloc
700 * extents.
701 */
702 if (map[i].br_startblock == DELAYSTARTBLOCK &&
703 map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
704 ASSERT((iflags & BMV_IF_DELALLOC) != 0);
705
706 if (map[i].br_startblock == HOLESTARTBLOCK &&
707 whichfork == XFS_ATTR_FORK) {
708 /* came to the end of attribute fork */
709 out[cur_ext].bmv_oflags |= BMV_OF_LAST;
710 goto out_free_map;
711 }
712
713 if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
714 prealloced, bmvend,
715 map[i].br_startblock))
716 goto out_free_map;
717
718 bmv->bmv_offset =
719 out[cur_ext].bmv_offset +
720 out[cur_ext].bmv_length;
721 bmv->bmv_length =
722 max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
723
724 /*
725 * In case we don't want to return the hole,
726 * don't increase cur_ext so that we can reuse
727 * it in the next loop.
728 */
729 if ((iflags & BMV_IF_NO_HOLES) &&
730 map[i].br_startblock == HOLESTARTBLOCK) {
731 memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
732 continue;
733 }
734
735 nexleft--;
736 bmv->bmv_entries++;
737 cur_ext++;
738 }
739 } while (nmap && nexleft && bmv->bmv_length);
740
741 out_free_map:
742 kmem_free(map);
743 out_unlock_ilock:
744 xfs_iunlock_map_shared(ip, lock);
745 out_unlock_iolock:
746 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
747
748 for (i = 0; i < cur_ext; i++) {
749 int full = 0; /* user array is full */
750
751 /* format results & advance arg */
752 error = formatter(&arg, &out[i], &full);
753 if (error || full)
754 break;
755 }
756
757 if (is_vmalloc_addr(out))
758 kmem_free_large(out);
759 else
760 kmem_free(out);
761 return error;
762}
763
764/*
 765 * dead simple method of punching delayed allocation blocks from a range in
766 * the inode. Walks a block at a time so will be slow, but is only executed in
767 * rare error cases so the overhead is not critical. This will always punch out
768 * both the start and end blocks, even if the ranges only partially overlap
769 * them, so it is up to the caller to ensure that partial blocks are not
770 * passed in.
771 */
772int
773xfs_bmap_punch_delalloc_range(
774 struct xfs_inode *ip,
775 xfs_fileoff_t start_fsb,
776 xfs_fileoff_t length)
777{
778 xfs_fileoff_t remaining = length;
779 int error = 0;
780
781 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
782
783 do {
784 int done;
785 xfs_bmbt_irec_t imap;
786 int nimaps = 1;
787 xfs_fsblock_t firstblock;
788 xfs_bmap_free_t flist;
789
790 /*
791 * Map the range first and check that it is a delalloc extent
792 * before trying to unmap the range. Otherwise we will be
793 * trying to remove a real extent (which requires a
794 * transaction) or a hole, which is probably a bad idea...
795 */
796 error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
797 XFS_BMAPI_ENTIRE);
798
799 if (error) {
800 /* something screwed, just bail */
801 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
802 xfs_alert(ip->i_mount,
803 "Failed delalloc mapping lookup ino %lld fsb %lld.",
804 ip->i_ino, start_fsb);
805 }
806 break;
807 }
808 if (!nimaps) {
809 /* nothing there */
810 goto next_block;
811 }
812 if (imap.br_startblock != DELAYSTARTBLOCK) {
813 /* been converted, ignore */
814 goto next_block;
815 }
816 WARN_ON(imap.br_blockcount == 0);
817
818 /*
819 * Note: while we initialise the firstblock/flist pair, they
820 * should never be used because blocks should never be
 821 * allocated or freed for a delalloc extent and hence we don't
 822 * need to cancel or finish them after the xfs_bunmapi() call.
823 */
824 xfs_bmap_init(&flist, &firstblock);
825 error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
826 &flist, &done);
827 if (error)
828 break;
829
830 ASSERT(!flist.xbf_count && !flist.xbf_first);
831next_block:
832 start_fsb++;
833 remaining--;
834 } while(remaining > 0);
835
836 return error;
837}
838
839/*
840 * Test whether it is appropriate to check an inode for and free post EOF
841 * blocks. The 'force' parameter determines whether we should also consider
842 * regular files that are marked preallocated or append-only.
843 */
844bool
845xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
846{
847 /* prealloc/delalloc exists only on regular files */
848 if (!S_ISREG(ip->i_d.di_mode))
849 return false;
850
851 /*
852 * Zero sized files with no cached pages and delalloc blocks will not
853 * have speculative prealloc/delalloc blocks to remove.
854 */
855 if (VFS_I(ip)->i_size == 0 &&
856 VN_CACHED(VFS_I(ip)) == 0 &&
857 ip->i_delayed_blks == 0)
858 return false;
859
860 /* If we haven't read in the extent list, then don't do it now. */
861 if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
862 return false;
863
864 /*
865 * Do not free real preallocated or append-only files unless the file
866 * has delalloc blocks and we are forced to remove them.
867 */
868 if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
869 if (!force || ip->i_delayed_blks == 0)
870 return false;
871
872 return true;
873}
874
/*
 * This is called by xfs_inactive to free any blocks beyond eof
 * when the link count isn't zero and by xfs_dm_punch_hole() when
 * punching a hole to EOF.
 *
 * Returns 0 on success and a positive errno on failure (this file
 * uses XFS's internal positive-errno convention, cf. XFS_ERROR()
 * elsewhere in the file).  Returns EAGAIN without blocking if
 * @need_iolock is set and the iolock cannot be acquired immediately.
 */
int
xfs_free_eofblocks(
	xfs_mount_t	*mp,
	xfs_inode_t	*ip,
	bool		need_iolock)	/* take/drop XFS_IOLOCK_EXCL here? */
{
	xfs_trans_t	*tp;
	int		error;
	xfs_fileoff_t	end_fsb;
	xfs_fileoff_t	last_fsb;
	xfs_filblks_t	map_len;
	int		nimaps;
	xfs_bmbt_irec_t	imap;

	/*
	 * Figure out if there are any blocks beyond the end
	 * of the file.  If not, then there is nothing to do.
	 */
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
	last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
	if (last_fsb <= end_fsb)
		return 0;
	map_len = last_fsb - end_fsb;

	/* Probe the post-EOF range under the shared ilock only. */
	nimaps = 1;
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	/*
	 * Only do work if the probe found something beyond EOF: either a
	 * real mapping, or outstanding delalloc blocks on the inode.
	 */
	if (!error && (nimaps != 0) &&
	    (imap.br_startblock != HOLESTARTBLOCK ||
	     ip->i_delayed_blks)) {
		/*
		 * Attach the dquots to the inode up front.
		 */
		error = xfs_qm_dqattach(ip, 0);
		if (error)
			return error;

		/*
		 * There are blocks after the end of file.
		 * Free them up now by truncating the file to
		 * its current size.
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);

		/*
		 * Trylock only: callers that cannot block on the iolock
		 * (e.g. background scans) get EAGAIN and retry later.
		 */
		if (need_iolock) {
			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
				xfs_trans_cancel(tp, 0);
				return EAGAIN;
			}
		}

		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
		if (error) {
			/* reservation only fails here on a shutdown fs */
			ASSERT(XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			if (need_iolock)
				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
			return error;
		}

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, 0);

		/*
		 * Do not update the on-disk file size. If we update the
		 * on-disk file size and then the system crashes before the
		 * contents of the file are flushed to disk then the files
		 * may be full of holes (ie NULL files bug).
		 */
		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
					XFS_ISIZE(ip));
		if (error) {
			/*
			 * If we get an error at this point we simply don't
			 * bother truncating the file.
			 */
			xfs_trans_cancel(tp,
					 (XFS_TRANS_RELEASE_LOG_RES |
					  XFS_TRANS_ABORT));
		} else {
			error = xfs_trans_commit(tp,
						XFS_TRANS_RELEASE_LOG_RES);
			/* no more post-EOF blocks: drop the reclaim tag */
			if (!error)
				xfs_inode_clear_eofblocks_tag(ip);
		}

		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (need_iolock)
			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	}
	return error;
}
974
975/*
976 * xfs_alloc_file_space()
977 * This routine allocates disk space for the given file.
978 *
979 * If alloc_type == 0, this request is for an ALLOCSP type
980 * request which will change the file size. In this case, no
981 * DMAPI event will be generated by the call. A TRUNCATE event
982 * will be generated later by xfs_setattr.
983 *
984 * If alloc_type != 0, this request is for a RESVSP type
985 * request, and a DMAPI DM_EVENT_WRITE will be generated if the
986 * lower block boundary byte address is less than the file's
987 * length.
988 *
989 * RETURNS:
990 * 0 on success
991 * errno on error
992 *
993 */
994STATIC int
995xfs_alloc_file_space(
996 xfs_inode_t *ip,
997 xfs_off_t offset,
998 xfs_off_t len,
999 int alloc_type,
1000 int attr_flags)
1001{
1002 xfs_mount_t *mp = ip->i_mount;
1003 xfs_off_t count;
1004 xfs_filblks_t allocated_fsb;
1005 xfs_filblks_t allocatesize_fsb;
1006 xfs_extlen_t extsz, temp;
1007 xfs_fileoff_t startoffset_fsb;
1008 xfs_fsblock_t firstfsb;
1009 int nimaps;
1010 int quota_flag;
1011 int rt;
1012 xfs_trans_t *tp;
1013 xfs_bmbt_irec_t imaps[1], *imapp;
1014 xfs_bmap_free_t free_list;
1015 uint qblocks, resblks, resrtextents;
1016 int committed;
1017 int error;
1018
1019 trace_xfs_alloc_file_space(ip);
1020
1021 if (XFS_FORCED_SHUTDOWN(mp))
1022 return XFS_ERROR(EIO);
1023
1024 error = xfs_qm_dqattach(ip, 0);
1025 if (error)
1026 return error;
1027
1028 if (len <= 0)
1029 return XFS_ERROR(EINVAL);
1030
1031 rt = XFS_IS_REALTIME_INODE(ip);
1032 extsz = xfs_get_extsz_hint(ip);
1033
1034 count = len;
1035 imapp = &imaps[0];
1036 nimaps = 1;
1037 startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
1038 allocatesize_fsb = XFS_B_TO_FSB(mp, count);
1039
1040 /*
1041 * Allocate file space until done or until there is an error
1042 */
1043 while (allocatesize_fsb && !error) {
1044 xfs_fileoff_t s, e;
1045
1046 /*
1047 * Determine space reservations for data/realtime.
1048 */
1049 if (unlikely(extsz)) {
1050 s = startoffset_fsb;
1051 do_div(s, extsz);
1052 s *= extsz;
1053 e = startoffset_fsb + allocatesize_fsb;
1054 if ((temp = do_mod(startoffset_fsb, extsz)))
1055 e += temp;
1056 if ((temp = do_mod(e, extsz)))
1057 e += extsz - temp;
1058 } else {
1059 s = 0;
1060 e = allocatesize_fsb;
1061 }
1062
1063 /*
1064 * The transaction reservation is limited to a 32-bit block
1065 * count, hence we need to limit the number of blocks we are
1066 * trying to reserve to avoid an overflow. We can't allocate
1067 * more than @nimaps extents, and an extent is limited on disk
1068 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
1069 */
1070 resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
1071 if (unlikely(rt)) {
1072 resrtextents = qblocks = resblks;
1073 resrtextents /= mp->m_sb.sb_rextsize;
1074 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1075 quota_flag = XFS_QMOPT_RES_RTBLKS;
1076 } else {
1077 resrtextents = 0;
1078 resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
1079 quota_flag = XFS_QMOPT_RES_REGBLKS;
1080 }
1081
1082 /*
1083 * Allocate and setup the transaction.
1084 */
1085 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1086 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
1087 resblks, resrtextents);
1088 /*
1089 * Check for running out of space
1090 */
1091 if (error) {
1092 /*
1093 * Free the transaction structure.
1094 */
1095 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1096 xfs_trans_cancel(tp, 0);
1097 break;
1098 }
1099 xfs_ilock(ip, XFS_ILOCK_EXCL);
1100 error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
1101 0, quota_flag);
1102 if (error)
1103 goto error1;
1104
1105 xfs_trans_ijoin(tp, ip, 0);
1106
1107 xfs_bmap_init(&free_list, &firstfsb);
1108 error = xfs_bmapi_write(tp, ip, startoffset_fsb,
1109 allocatesize_fsb, alloc_type, &firstfsb,
1110 0, imapp, &nimaps, &free_list);
1111 if (error) {
1112 goto error0;
1113 }
1114
1115 /*
1116 * Complete the transaction
1117 */
1118 error = xfs_bmap_finish(&tp, &free_list, &committed);
1119 if (error) {
1120 goto error0;
1121 }
1122
1123 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1124 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1125 if (error) {
1126 break;
1127 }
1128
1129 allocated_fsb = imapp->br_blockcount;
1130
1131 if (nimaps == 0) {
1132 error = XFS_ERROR(ENOSPC);
1133 break;
1134 }
1135
1136 startoffset_fsb += allocated_fsb;
1137 allocatesize_fsb -= allocated_fsb;
1138 }
1139
1140 return error;
1141
1142error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
1143 xfs_bmap_cancel(&free_list);
1144 xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
1145
1146error1: /* Just cancel transaction */
1147 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1148 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1149 return error;
1150}
1151
/*
 * Zero file bytes between startoff and endoff inclusive.
 * The iolock is held exclusive and no blocks are buffered.
 *
 * This function is used by xfs_free_file_space() to zero
 * partial blocks when the range to free is not block aligned.
 * When unreserving space with boundaries that are not block
 * aligned we round up the start and round down the end
 * boundaries and then use this function to zero the parts of
 * the blocks that got dropped during the rounding.
 *
 * Implementation: a single uncached one-filesystem-block buffer is
 * reused for a synchronous read/modify/write of each block touched
 * by the range.  Holes and unwritten extents are skipped, as reads
 * of them already return zeroes.
 */
STATIC int
xfs_zero_remaining_bytes(
	xfs_inode_t		*ip,
	xfs_off_t		startoff,
	xfs_off_t		endoff)
{
	xfs_bmbt_irec_t		imap;
	xfs_fileoff_t		offset_fsb;
	xfs_off_t		lastoffset;
	xfs_off_t		offset;
	xfs_buf_t		*bp;
	xfs_mount_t		*mp = ip->i_mount;
	int			nimap;
	int			error = 0;

	/*
	 * Avoid doing I/O beyond eof - it's not necessary
	 * since nothing can read beyond eof.  The space will
	 * be zeroed when the file is extended anyway.
	 */
	if (startoff >= XFS_ISIZE(ip))
		return 0;

	if (endoff > XFS_ISIZE(ip))
		endoff = XFS_ISIZE(ip);

	/* one-block scratch buffer against the rt or data device */
	bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
					mp->m_rtdev_targp : mp->m_ddev_targp,
				  BTOBB(mp->m_sb.sb_blocksize), 0);
	if (!bp)
		return XFS_ERROR(ENOMEM);

	/* private buffer, no other users: the buf lock is not needed */
	xfs_buf_unlock(bp);

	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
		offset_fsb = XFS_B_TO_FSBT(mp, offset);
		nimap = 1;
		error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
		if (error || nimap < 1)
			break;
		ASSERT(imap.br_blockcount >= 1);
		ASSERT(imap.br_startoff == offset_fsb);
		/* clamp this pass to the end of the current block/range */
		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
		if (lastoffset > endoff)
			lastoffset = endoff;
		/* holes read back as zeroes already */
		if (imap.br_startblock == HOLESTARTBLOCK)
			continue;
		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
		/* so do unwritten extents */
		if (imap.br_state == XFS_EXT_UNWRITTEN)
			continue;
		/* read the block in ... */
		XFS_BUF_UNDONE(bp);
		XFS_BUF_UNWRITE(bp);
		XFS_BUF_READ(bp);
		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
		xfsbdstrat(mp, bp);
		error = xfs_buf_iowait(bp);
		if (error) {
			xfs_buf_ioerror_alert(bp,
					"xfs_zero_remaining_bytes(read)");
			break;
		}
		/* ... zero the in-range portion ... */
		memset(bp->b_addr +
			(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
		      0, lastoffset - offset + 1);
		/* ... and write it back out synchronously */
		XFS_BUF_UNDONE(bp);
		XFS_BUF_UNREAD(bp);
		XFS_BUF_WRITE(bp);
		xfsbdstrat(mp, bp);
		error = xfs_buf_iowait(bp);
		if (error) {
			xfs_buf_ioerror_alert(bp,
					"xfs_zero_remaining_bytes(write)");
			break;
		}
	}
	xfs_buf_free(bp);
	return error;
}
1241
/*
 * xfs_free_file_space()
 *	This routine frees disk space for the given file.
 *
 *	This routine is only called by xfs_change_file_space
 *	for an UNRESVSP type call.
 *
 * Unless XFS_ATTR_NOLOCK is set in @attr_flags, the iolock is taken
 * exclusively here and dropped before returning.
 *
 * RETURNS:
 *	0 on success
 *	errno on error
 *
 */
STATIC int
xfs_free_file_space(
	xfs_inode_t		*ip,
	xfs_off_t		offset,
	xfs_off_t		len,
	int			attr_flags)
{
	int			committed;
	int			done;
	xfs_fileoff_t		endoffset_fsb;
	int			error;
	xfs_fsblock_t		firstfsb;
	xfs_bmap_free_t		free_list;
	xfs_bmbt_irec_t		imap;
	xfs_off_t		ioffset;
	xfs_extlen_t		mod=0;
	xfs_mount_t		*mp;
	int			nimap;
	uint			resblks;
	xfs_off_t		rounding;
	int			rt;
	xfs_fileoff_t		startoffset_fsb;
	xfs_trans_t		*tp;
	int			need_iolock = 1;

	mp = ip->i_mount;

	trace_xfs_free_file_space(ip);

	error = xfs_qm_dqattach(ip, 0);
	if (error)
		return error;

	error = 0;
	if (len <= 0)	/* if nothing being freed */
		return error;
	rt = XFS_IS_REALTIME_INODE(ip);
	/* round start up and end down so only whole blocks are unmapped */
	startoffset_fsb	= XFS_B_TO_FSB(mp, offset);
	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);

	if (attr_flags & XFS_ATTR_NOLOCK)
		need_iolock = 0;
	if (need_iolock) {
		xfs_ilock(ip, XFS_IOLOCK_EXCL);
		/* wait for the completion of any pending DIOs */
		inode_dio_wait(VFS_I(ip));
	}

	/*
	 * Flush and invalidate the page cache from the containing page
	 * boundary to EOF.  Note the leading minus: kernel negative errnos
	 * are converted to this file's positive-errno convention.
	 */
	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
	ioffset = offset & ~(rounding - 1);
	error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
					      ioffset, -1);
	if (error)
		goto out_unlock_iolock;
	truncate_pagecache_range(VFS_I(ip), ioffset, -1);

	/*
	 * Need to zero the stuff we're not freeing, on disk.
	 * If it's a realtime file & can't use unwritten extents then we
	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
	 * will take care of it for us.
	 */
	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
		/* push the start in to the next rt extent boundary */
		nimap = 1;
		error = xfs_bmapi_read(ip, startoffset_fsb, 1,
					&imap, &nimap, 0);
		if (error)
			goto out_unlock_iolock;
		ASSERT(nimap == 0 || nimap == 1);
		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			xfs_daddr_t	block;

			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			block = imap.br_startblock;
			mod = do_div(block, mp->m_sb.sb_rextsize);
			if (mod)
				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
		}
		/* pull the end back to the previous rt extent boundary */
		nimap = 1;
		error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
					&imap, &nimap, 0);
		if (error)
			goto out_unlock_iolock;
		ASSERT(nimap == 0 || nimap == 1);
		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
			/*
			 * NOTE(review): mod here still holds the remainder
			 * computed (or the initial 0) from the start-side
			 * check above; verify this carry-over is intended
			 * before touching this logic.
			 */
			mod++;
			if (mod && (mod != mp->m_sb.sb_rextsize))
				endoffset_fsb -= mod;
		}
	}
	if ((done = (endoffset_fsb <= startoffset_fsb)))
		/*
		 * One contiguous piece to clear
		 */
		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
	else {
		/*
		 * Some full blocks, possibly two pieces to clear
		 */
		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
			error = xfs_zero_remaining_bytes(ip, offset,
				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
		if (!error &&
		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
			error = xfs_zero_remaining_bytes(ip,
				XFS_FSB_TO_B(mp, endoffset_fsb),
				offset + len - 1);
	}

	/*
	 * free file space until done or until there is an error
	 */
	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
	while (!error && !done) {

		/*
		 * allocate and setup the transaction. Allow this
		 * transaction to dip into the reserve blocks to ensure
		 * the freeing of the space succeeds at ENOSPC.
		 */
		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
		tp->t_flags |= XFS_TRANS_RESERVE;
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);

		/*
		 * check for running out of space
		 */
		if (error) {
			/*
			 * Free the transaction structure.
			 */
			ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
			xfs_trans_cancel(tp, 0);
			break;
		}
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = xfs_trans_reserve_quota(tp, mp,
				ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
				resblks, 0, XFS_QMOPT_RES_REGBLKS);
		if (error)
			goto error1;

		xfs_trans_ijoin(tp, ip, 0);

		/*
		 * issue the bunmapi() call to free the blocks
		 */
		xfs_bmap_init(&free_list, &firstfsb);
		error = xfs_bunmapi(tp, ip, startoffset_fsb,
				  endoffset_fsb - startoffset_fsb,
				  0, 2, &firstfsb, &free_list, &done);
		if (error) {
			goto error0;
		}

		/*
		 * complete the transaction
		 */
		error = xfs_bmap_finish(&tp, &free_list, &committed);
		if (error) {
			goto error0;
		}

		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
	}

 out_unlock_iolock:
	if (need_iolock)
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	return error;

 error0:
	xfs_bmap_cancel(&free_list);
 error1:
	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
	xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
		    XFS_ILOCK_EXCL);
	return error;
}
1435
1436
/*
 * Zero the byte range @offset/@len of @ip.
 *
 * Whole "granularity" units (the larger of the fs block size and the
 * page size) inside the range are converted to unwritten extents via
 * xfs_alloc_file_space(), so they read back as zeroes without any data
 * I/O; the unaligned edges are zeroed with xfs_iozero().  Takes the
 * iolock exclusively unless XFS_ATTR_NOLOCK is set in @attr_flags.
 * Returns 0 or a positive errno.
 */
STATIC int
xfs_zero_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len,
	int			attr_flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	uint			granularity;
	xfs_off_t		start_boundary;
	xfs_off_t		end_boundary;
	int			error;

	granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);

	/*
	 * Round the range of extents we are going to convert inwards.  If the
	 * offset is aligned, then it doesn't get changed so we zero from the
	 * start of the block offset points to.
	 */
	start_boundary = round_up(offset, granularity);
	end_boundary = round_down(offset + len, granularity);

	ASSERT(start_boundary >= offset);
	ASSERT(end_boundary <= offset + len);

	if (!(attr_flags & XFS_ATTR_NOLOCK))
		xfs_ilock(ip, XFS_IOLOCK_EXCL);

	if (start_boundary < end_boundary - 1) {
		/* punch out the page cache over the conversion range */
		truncate_pagecache_range(VFS_I(ip), start_boundary,
					 end_boundary - 1);
		/* convert the blocks */
		error = xfs_alloc_file_space(ip, start_boundary,
					end_boundary - start_boundary - 1,
					XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
					attr_flags);
		if (error)
			goto out_unlock;

		/* We've handled the interior of the range, now for the edges */
		if (start_boundary != offset)
			error = xfs_iozero(ip, offset, start_boundary - offset);
		if (error)
			goto out_unlock;

		if (end_boundary != offset + len)
			error = xfs_iozero(ip, end_boundary,
					   offset + len - end_boundary);

	} else {
		/*
		 * It's either a sub-granularity range or the range spanned lies
		 * partially across two adjacent blocks.
		 */
		error = xfs_iozero(ip, offset, len);
	}

out_unlock:
	if (!(attr_flags & XFS_ATTR_NOLOCK))
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	return error;

}
1502
/*
 * xfs_change_file_space()
 *	This routine allocates or frees disk space for the given file.
 *	The user specified parameters are checked for alignment and size
 *	limitations.
 *
 * Dispatches on @cmd (the XFS_IOC_* space ioctls); on success it also
 * commits a transaction that updates the inode timestamps, clears the
 * setuid/setgid bits, and sets or clears XFS_DIFLAG_PREALLOC.
 *
 * RETURNS:
 *	0 on success
 *	errno on error
 *
 */
int
xfs_change_file_space(
	xfs_inode_t	*ip,
	int		cmd,
	xfs_flock64_t	*bf,
	xfs_off_t	offset,
	int		attr_flags)
{
	xfs_mount_t	*mp = ip->i_mount;
	int		clrprealloc;
	int		error;
	xfs_fsize_t	fsize;
	int		setprealloc;
	xfs_off_t	startoffset;
	xfs_trans_t	*tp;
	struct iattr	iattr;

	if (!S_ISREG(ip->i_d.di_mode))
		return XFS_ERROR(EINVAL);

	/* resolve l_start to an absolute file offset per l_whence */
	switch (bf->l_whence) {
	case 0: /*SEEK_SET*/
		break;
	case 1: /*SEEK_CUR*/
		bf->l_start += offset;
		break;
	case 2: /*SEEK_END*/
		bf->l_start += XFS_ISIZE(ip);
		break;
	default:
		return XFS_ERROR(EINVAL);
	}

	/*
	 * length of <= 0 for resv/unresv/zero is invalid.  length for
	 * alloc/free is ignored completely and we have no idea what userspace
	 * might have set it to, so set it to zero to allow range
	 * checks to pass.
	 */
	switch (cmd) {
	case XFS_IOC_ZERO_RANGE:
	case XFS_IOC_RESVSP:
	case XFS_IOC_RESVSP64:
	case XFS_IOC_UNRESVSP:
	case XFS_IOC_UNRESVSP64:
		if (bf->l_len <= 0)
			return XFS_ERROR(EINVAL);
		break;
	default:
		bf->l_len = 0;
		break;
	}

	/* reject ranges outside [0, s_maxbytes) */
	if (bf->l_start < 0 ||
	    bf->l_start > mp->m_super->s_maxbytes ||
	    bf->l_start + bf->l_len < 0 ||
	    bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
		return XFS_ERROR(EINVAL);

	bf->l_whence = 0;

	startoffset = bf->l_start;
	fsize = XFS_ISIZE(ip);

	setprealloc = clrprealloc = 0;
	switch (cmd) {
	case XFS_IOC_ZERO_RANGE:
		error = xfs_zero_file_space(ip, startoffset, bf->l_len,
						attr_flags);
		if (error)
			return error;
		setprealloc = 1;
		break;

	case XFS_IOC_RESVSP:
	case XFS_IOC_RESVSP64:
		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
						XFS_BMAPI_PREALLOC, attr_flags);
		if (error)
			return error;
		setprealloc = 1;
		break;

	case XFS_IOC_UNRESVSP:
	case XFS_IOC_UNRESVSP64:
		if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
								attr_flags)))
			return error;
		break;

	case XFS_IOC_ALLOCSP:
	case XFS_IOC_ALLOCSP64:
	case XFS_IOC_FREESP:
	case XFS_IOC_FREESP64:
		/*
		 * These operations actually do IO when extending the file, but
		 * the allocation is done seperately to the zeroing that is
		 * done. This set of operations need to be serialised against
		 * other IO operations, such as truncate and buffered IO. We
		 * need to take the IOLOCK here to serialise the allocation and
		 * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
		 * truncate, direct IO) from racing against the transient
		 * allocated but not written state we can have here.
		 */
		xfs_ilock(ip, XFS_IOLOCK_EXCL);
		if (startoffset > fsize) {
			error = xfs_alloc_file_space(ip, fsize,
					startoffset - fsize, 0,
					attr_flags | XFS_ATTR_NOLOCK);
			if (error) {
				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
				break;
			}
		}

		iattr.ia_valid = ATTR_SIZE;
		iattr.ia_size = startoffset;

		error = xfs_setattr_size(ip, &iattr,
					 attr_flags | XFS_ATTR_NOLOCK);
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);

		if (error)
			return error;

		clrprealloc = 1;
		break;

	default:
		ASSERT(0);
		return XFS_ERROR(EINVAL);
	}

	/*
	 * update the inode timestamp, mode, and prealloc flag bits
	 */
	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	if ((attr_flags & XFS_ATTR_DMI) == 0) {
		/* strip setuid, like a write from an unprivileged user */
		ip->i_d.di_mode &= ~S_ISUID;

		/*
		 * Note that we don't have to worry about mandatory
		 * file locking being disabled here because we only
		 * clear the S_ISGID bit if the Group execute bit is
		 * on, but if it was on then mandatory locking wouldn't
		 * have been enabled.
		 */
		if (ip->i_d.di_mode & S_IXGRP)
			ip->i_d.di_mode &= ~S_ISGID;

		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}
	if (setprealloc)
		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
	else if (clrprealloc)
		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	if (attr_flags & XFS_ATTR_SYNC)
		xfs_trans_set_sync(tp);
	return xfs_trans_commit(tp, 0);
}
1685
1686/*
1687 * We need to check that the format of the data fork in the temporary inode is
1688 * valid for the target inode before doing the swap. This is not a problem with
1689 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
1690 * data fork depending on the space the attribute fork is taking so we can get
1691 * invalid formats on the target inode.
1692 *
1693 * E.g. target has space for 7 extents in extent format, temp inode only has
1694 * space for 6. If we defragment down to 7 extents, then the tmp format is a
1695 * btree, but when swapped it needs to be in extent format. Hence we can't just
1696 * blindly swap data forks on attr2 filesystems.
1697 *
1698 * Note that we check the swap in both directions so that we don't end up with
1699 * a corrupt temporary inode, either.
1700 *
1701 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
1702 * inode will prevent this situation from occurring, so all we do here is
1703 * reject and log the attempt. basically we are putting the responsibility on
1704 * userspace to get this right.
1705 */
1706static int
1707xfs_swap_extents_check_format(
1708 xfs_inode_t *ip, /* target inode */
1709 xfs_inode_t *tip) /* tmp inode */
1710{
1711
1712 /* Should never get a local format */
1713 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
1714 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
1715 return EINVAL;
1716
1717 /*
1718 * if the target inode has less extents that then temporary inode then
1719 * why did userspace call us?
1720 */
1721 if (ip->i_d.di_nextents < tip->i_d.di_nextents)
1722 return EINVAL;
1723
1724 /*
1725 * if the target inode is in extent form and the temp inode is in btree
1726 * form then we will end up with the target inode in the wrong format
1727 * as we already know there are less extents in the temp inode.
1728 */
1729 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1730 tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
1731 return EINVAL;
1732
1733 /* Check temp in extent form to max in target */
1734 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1735 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
1736 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1737 return EINVAL;
1738
1739 /* Check target in extent form to max in temp */
1740 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1741 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
1742 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1743 return EINVAL;
1744
1745 /*
1746 * If we are in a btree format, check that the temp root block will fit
1747 * in the target and that it has enough extents to be in btree format
1748 * in the target.
1749 *
1750 * Note that we have to be careful to allow btree->extent conversions
1751 * (a common defrag case) which will occur when the temp inode is in
1752 * extent format...
1753 */
1754 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1755 if (XFS_IFORK_BOFF(ip) &&
1756 XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
1757 return EINVAL;
1758 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
1759 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1760 return EINVAL;
1761 }
1762
1763 /* Reciprocal target->temp btree format checks */
1764 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1765 if (XFS_IFORK_BOFF(tip) &&
1766 XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
1767 return EINVAL;
1768 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
1769 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1770 return EINVAL;
1771 }
1772
1773 return 0;
1774}
1775
/*
 * Swap the data forks of @ip (target) and @tip (temporary inode), as used
 * by xfs_fsr for online defragmentation.  Both files must already be
 * flushed; the swap is rejected for CRC filesystems, mismatched file
 * types/realtime flags, partial ranges, memory-mapped files, and stat
 * times that changed since @sxp was filled in (EBUSY).  Returns 0 or a
 * positive errno.
 */
int
xfs_swap_extents(
	xfs_inode_t	*ip,	/* target inode */
	xfs_inode_t	*tip,	/* tmp inode */
	xfs_swapext_t	*sxp)
{
	xfs_mount_t	*mp = ip->i_mount;
	xfs_trans_t	*tp;
	xfs_bstat_t	*sbp = &sxp->sx_stat;
	xfs_ifork_t	*tempifp, *ifp, *tifp;
	int		src_log_flags, target_log_flags;
	int		error = 0;
	int		aforkblks = 0;
	int		taforkblks = 0;
	__uint64_t	tmp;

	/*
	 * We have no way of updating owner information in the BMBT blocks for
	 * each inode on CRC enabled filesystems, so to avoid corrupting the
	 * this metadata we simply don't allow extent swaps to occur.
	 */
	if (xfs_sb_version_hascrc(&mp->m_sb))
		return XFS_ERROR(EINVAL);

	/* scratch fork used for the three-way struct swap further down */
	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
	if (!tempifp) {
		error = XFS_ERROR(ENOMEM);
		goto out;
	}

	/*
	 * we have to do two separate lock calls here to keep lockdep
	 * happy. If we try to get all the locks in one call, lock will
	 * report false positives when we drop the ILOCK and regain them
	 * below.
	 */
	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);

	/* Verify that both files have the same format */
	if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
		error = XFS_ERROR(EINVAL);
		goto out_unlock;
	}

	/* Verify both files are either real-time or non-realtime */
	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
		error = XFS_ERROR(EINVAL);
		goto out_unlock;
	}

	/* flush tmp's dirty pages; minus converts kernel errno to positive */
	error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
	if (error)
		goto out_unlock;
	truncate_pagecache_range(VFS_I(tip), 0, -1);

	/* Verify O_DIRECT for ftmp */
	if (VN_CACHED(VFS_I(tip)) != 0) {
		error = XFS_ERROR(EINVAL);
		goto out_unlock;
	}

	/* Verify all data are being swapped */
	if (sxp->sx_offset != 0 ||
	    sxp->sx_length != ip->i_d.di_size ||
	    sxp->sx_length != tip->i_d.di_size) {
		error = XFS_ERROR(EFAULT);
		goto out_unlock;
	}

	trace_xfs_swap_extent_before(ip, 0);
	trace_xfs_swap_extent_before(tip, 1);

	/* check inode formats now that data is flushed */
	error = xfs_swap_extents_check_format(ip, tip);
	if (error) {
		xfs_notice(mp,
		    "%s: inode 0x%llx format is incompatible for exchanging.",
				__func__, ip->i_ino);
		goto out_unlock;
	}

	/*
	 * Compare the current change & modify times with that
	 * passed in.  If they differ, we abort this swap.
	 * This is the mechanism used to ensure the calling
	 * process that the file was not changed out from
	 * under it.
	 */
	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
		error = XFS_ERROR(EBUSY);
		goto out_unlock;
	}

	/* We need to fail if the file is memory mapped.  Once we have tossed
	 * all existing pages, the page fault will have no option
	 * but to go to the filesystem for pages. By making the page fault call
	 * vop_read (or write in the case of autogrow) they block on the iolock
	 * until we have switched the extents.
	 */
	if (VN_MAPPED(VFS_I(ip))) {
		error = XFS_ERROR(EBUSY);
		goto out_unlock;
	}

	/* drop the ilocks (only) so the transaction can be reserved */
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	xfs_iunlock(tip, XFS_ILOCK_EXCL);

	/*
	 * There is a race condition here since we gave up the
	 * ilock.  However, the data fork will not change since
	 * we have the iolock (locked for truncation too) so we
	 * are safe.  We don't really care if non-io related
	 * fields change.
	 */
	truncate_pagecache_range(VFS_I(ip), 0, -1);

	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
	if (error) {
		xfs_iunlock(ip,  XFS_IOLOCK_EXCL);
		xfs_iunlock(tip, XFS_IOLOCK_EXCL);
		xfs_trans_cancel(tp, 0);
		goto out;
	}
	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);

	/*
	 * Count the number of extended attribute blocks
	 */
	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
		if (error)
			goto out_trans_cancel;
	}
	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
			&taforkblks);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * Swap the data forks of the inodes
	 */
	ifp = &ip->i_df;
	tifp = &tip->i_df;
	*tempifp = *ifp;	/* struct copy */
	*ifp = *tifp;		/* struct copy */
	*tifp = *tempifp;	/* struct copy */

	/*
	 * Fix the on-disk inode values: nblocks moves with the data fork,
	 * but each inode keeps its own attr-fork block count.
	 */
	tmp = (__uint64_t)ip->i_d.di_nblocks;
	ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
	tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;

	tmp = (__uint64_t) ip->i_d.di_nextents;
	ip->i_d.di_nextents = tip->i_d.di_nextents;
	tip->i_d.di_nextents = tmp;

	tmp = (__uint64_t) ip->i_d.di_format;
	ip->i_d.di_format = tip->i_d.di_format;
	tip->i_d.di_format = tmp;

	/*
	 * The extents in the source inode could still contain speculative
	 * preallocation beyond EOF (e.g. the file is open but not modified
	 * while defrag is in progress). In that case, we need to copy over the
	 * number of delalloc blocks the data fork in the source inode is
	 * tracking beyond EOF so that when the fork is truncated away when the
	 * temporary inode is unlinked we don't underrun the i_delayed_blks
	 * counter on that inode.
	 */
	ASSERT(tip->i_delayed_blks == 0);
	tip->i_delayed_blks = ip->i_delayed_blks;
	ip->i_delayed_blks = 0;

	src_log_flags = XFS_ILOG_CORE;
	switch (ip->i_d.di_format) {
	case XFS_DINODE_FMT_EXTENTS:
		/* If the extents fit in the inode, fix the
		 * pointer.  Otherwise it's already NULL or
		 * pointing to the extent.
		 */
		if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
			ifp->if_u1.if_extents =
				ifp->if_u2.if_inline_ext;
		}
		src_log_flags |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		src_log_flags |= XFS_ILOG_DBROOT;
		break;
	}

	target_log_flags = XFS_ILOG_CORE;
	switch (tip->i_d.di_format) {
	case XFS_DINODE_FMT_EXTENTS:
		/* If the extents fit in the inode, fix the
		 * pointer.  Otherwise it's already NULL or
		 * pointing to the extent.
		 */
		if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
			tifp->if_u1.if_extents =
				tifp->if_u2.if_inline_ext;
		}
		target_log_flags |= XFS_ILOG_DEXT;
		break;
	case XFS_DINODE_FMT_BTREE:
		target_log_flags |= XFS_ILOG_DBROOT;
		break;
	}


	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);

	xfs_trans_log_inode(tp, ip,  src_log_flags);
	xfs_trans_log_inode(tp, tip, target_log_flags);

	/*
	 * If this is a synchronous mount, make sure that the
	 * transaction goes to disk before returning to the user.
	 */
	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_trans_set_sync(tp);

	error = xfs_trans_commit(tp, 0);

	trace_xfs_swap_extent_after(ip, 0);
	trace_xfs_swap_extent_after(tip, 1);
out:
	kmem_free(tempifp);
	return error;

out_unlock:
	xfs_iunlock(ip,  XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	goto out;

out_trans_cancel:
	xfs_trans_cancel(tp, 0);
	goto out_unlock;
}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
new file mode 100644
index 000000000000..061260946f7a
--- /dev/null
+++ b/fs/xfs/xfs_bmap_util.h
@@ -0,0 +1,110 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_BMAP_UTIL_H__
19#define __XFS_BMAP_UTIL_H__
20
21/* Kernel only BMAP related definitions and functions */
22
23struct xfs_bmbt_irec;
24struct xfs_bmap_free_item;
25struct xfs_ifork;
26struct xfs_inode;
27struct xfs_mount;
28struct xfs_trans;
29
30/*
31 * Argument structure for xfs_bmap_alloc.
32 */
33struct xfs_bmalloca {
34 xfs_fsblock_t *firstblock; /* i/o first block allocated */
35 struct xfs_bmap_free *flist; /* bmap freelist */
36 struct xfs_trans *tp; /* transaction pointer */
37 struct xfs_inode *ip; /* incore inode pointer */
38 struct xfs_bmbt_irec prev; /* extent before the new one */
39 struct xfs_bmbt_irec got; /* extent after, or delayed */
40
41 xfs_fileoff_t offset; /* offset in file filling in */
42 xfs_extlen_t length; /* i/o length asked/allocated */
43 xfs_fsblock_t blkno; /* starting block of new extent */
44
45 struct xfs_btree_cur *cur; /* btree cursor */
46 xfs_extnum_t idx; /* current extent index */
47 int nallocs;/* number of extents alloc'd */
48 int logflags;/* flags for transaction logging */
49
50 xfs_extlen_t total; /* total blocks needed for xaction */
51 xfs_extlen_t minlen; /* minimum allocation size (blocks) */
52 xfs_extlen_t minleft; /* amount must be left after alloc */
53 char eof; /* set if allocating past last extent */
54 char wasdel; /* replacing a delayed allocation */
55 char userdata;/* set if is user data */
56 char aeof; /* allocated space at eof */
57 char conv; /* overwriting unwritten extents */
58 char stack_switch;
59 int flags;
60 struct completion *done;
61 struct work_struct work;
62 int result;
63};
64
65int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
66 int *committed);
67int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
68int xfs_bmapi_allocate(struct xfs_bmalloca *args);
69int __xfs_bmapi_allocate(struct xfs_bmalloca *args);
70int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
71 int whichfork, int *eof);
72int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
73 int whichfork, int *count);
74int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
75 xfs_fileoff_t start_fsb, xfs_fileoff_t length);
76
77/* bmap to userspace formatter - copy to user & advance pointer */
78typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
79int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
80 xfs_bmap_format_t formatter, void *arg);
81
82/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
83void xfs_bmap_del_free(struct xfs_bmap_free *flist,
84 struct xfs_bmap_free_item *prev,
85 struct xfs_bmap_free_item *free);
86int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
87 struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
88 int rt, int eof, int delay, int convert,
89 xfs_fileoff_t *offp, xfs_extlen_t *lenp);
90void xfs_bmap_adjacent(struct xfs_bmalloca *ap);
91int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
92 int whichfork, struct xfs_bmbt_irec *rec,
93 int *is_empty);
94
95/* preallocation and hole punch interface */
96int xfs_change_file_space(struct xfs_inode *ip, int cmd,
97 xfs_flock64_t *bf, xfs_off_t offset,
98 int attr_flags);
99
100/* EOF block manipulation functions */
101bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
102int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
103 bool need_iolock);
104
105int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
106 struct xfs_swapext *sx);
107
108xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb);
109
110#endif /* __XFS_BMAP_UTIL_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 0903960410a2..7a2b4da3c0db 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -510,7 +510,7 @@ xfs_btree_ptr_addr(
510} 510}
511 511
512/* 512/*
513 * Get a the root block which is stored in the inode. 513 * Get the root block which is stored in the inode.
514 * 514 *
515 * For now this btree implementation assumes the btree root is always 515 * For now this btree implementation assumes the btree root is always
516 * stored in the if_broot field of an inode fork. 516 * stored in the if_broot field of an inode fork.
@@ -978,6 +978,7 @@ xfs_btree_init_block_int(
978 buf->bb_u.l.bb_owner = cpu_to_be64(owner); 978 buf->bb_u.l.bb_owner = cpu_to_be64(owner);
979 uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid); 979 uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
980 buf->bb_u.l.bb_pad = 0; 980 buf->bb_u.l.bb_pad = 0;
981 buf->bb_u.l.bb_lsn = 0;
981 } 982 }
982 } else { 983 } else {
983 /* owner is a 32 bit value on short blocks */ 984 /* owner is a 32 bit value on short blocks */
@@ -989,6 +990,7 @@ xfs_btree_init_block_int(
989 buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); 990 buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
990 buf->bb_u.s.bb_owner = cpu_to_be32(__owner); 991 buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
991 uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid); 992 uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
993 buf->bb_u.s.bb_lsn = 0;
992 } 994 }
993 } 995 }
994} 996}
@@ -1684,7 +1686,7 @@ xfs_lookup_get_search_key(
1684 1686
1685/* 1687/*
1686 * Lookup the record. The cursor is made to point to it, based on dir. 1688 * Lookup the record. The cursor is made to point to it, based on dir.
1687 * Return 0 if can't find any such record, 1 for success. 1689 * stat is set to 0 if can't find any such record, 1 for success.
1688 */ 1690 */
1689int /* error */ 1691int /* error */
1690xfs_btree_lookup( 1692xfs_btree_lookup(
@@ -2756,7 +2758,6 @@ xfs_btree_make_block_unfull(
2756 2758
2757 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { 2759 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2758 /* A root block that can be made bigger. */ 2760 /* A root block that can be made bigger. */
2759
2760 xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); 2761 xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2761 } else { 2762 } else {
2762 /* A root block that needs replacing */ 2763 /* A root block that needs replacing */
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 55e3c7cc3c3d..c8473c7ef45e 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -88,13 +88,11 @@ struct xfs_btree_block {
88#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) 88#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40)
89#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) 89#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48)
90 90
91
92#define XFS_BTREE_SBLOCK_CRC_OFF \ 91#define XFS_BTREE_SBLOCK_CRC_OFF \
93 offsetof(struct xfs_btree_block, bb_u.s.bb_crc) 92 offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
94#define XFS_BTREE_LBLOCK_CRC_OFF \ 93#define XFS_BTREE_LBLOCK_CRC_OFF \
95 offsetof(struct xfs_btree_block, bb_u.l.bb_crc) 94 offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
96 95
97
98/* 96/*
99 * Generic key, ptr and record wrapper structures. 97 * Generic key, ptr and record wrapper structures.
100 * 98 *
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 1b2472a46e46..c06823fe10d3 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -35,6 +35,7 @@
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36 36
37#include "xfs_sb.h" 37#include "xfs_sb.h"
38#include "xfs_trans_resv.h"
38#include "xfs_log.h" 39#include "xfs_log.h"
39#include "xfs_ag.h" 40#include "xfs_ag.h"
40#include "xfs_mount.h" 41#include "xfs_mount.h"
@@ -303,7 +304,7 @@ _xfs_buf_free_pages(
303 * Releases the specified buffer. 304 * Releases the specified buffer.
304 * 305 *
305 * The modification state of any associated pages is left unchanged. 306 * The modification state of any associated pages is left unchanged.
306 * The buffer most not be on any hash - use xfs_buf_rele instead for 307 * The buffer must not be on any hash - use xfs_buf_rele instead for
307 * hashed and refcounted buffers 308 * hashed and refcounted buffers
308 */ 309 */
309void 310void
@@ -1621,7 +1622,7 @@ xfs_setsize_buftarg_flags(
1621/* 1622/*
1622 * When allocating the initial buffer target we have not yet 1623 * When allocating the initial buffer target we have not yet
1623 * read in the superblock, so don't know what sized sectors 1624 * read in the superblock, so don't know what sized sectors
1624 * are being used is at this early stage. Play safe. 1625 * are being used at this early stage. Play safe.
1625 */ 1626 */
1626STATIC int 1627STATIC int
1627xfs_setsize_buftarg_early( 1628xfs_setsize_buftarg_early(
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index bfc4e0c26fd3..3a944b198e35 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -39,6 +39,14 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
39 39
40STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); 40STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
41 41
42static inline int
43xfs_buf_log_format_size(
44 struct xfs_buf_log_format *blfp)
45{
46 return offsetof(struct xfs_buf_log_format, blf_data_map) +
47 (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
48}
49
42/* 50/*
43 * This returns the number of log iovecs needed to log the 51 * This returns the number of log iovecs needed to log the
44 * given buf log item. 52 * given buf log item.
@@ -49,25 +57,27 @@ STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
49 * 57 *
50 * If the XFS_BLI_STALE flag has been set, then log nothing. 58 * If the XFS_BLI_STALE flag has been set, then log nothing.
51 */ 59 */
52STATIC uint 60STATIC void
53xfs_buf_item_size_segment( 61xfs_buf_item_size_segment(
54 struct xfs_buf_log_item *bip, 62 struct xfs_buf_log_item *bip,
55 struct xfs_buf_log_format *blfp) 63 struct xfs_buf_log_format *blfp,
64 int *nvecs,
65 int *nbytes)
56{ 66{
57 struct xfs_buf *bp = bip->bli_buf; 67 struct xfs_buf *bp = bip->bli_buf;
58 uint nvecs;
59 int next_bit; 68 int next_bit;
60 int last_bit; 69 int last_bit;
61 70
62 last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); 71 last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
63 if (last_bit == -1) 72 if (last_bit == -1)
64 return 0; 73 return;
65 74
66 /* 75 /*
67 * initial count for a dirty buffer is 2 vectors - the format structure 76 * initial count for a dirty buffer is 2 vectors - the format structure
68 * and the first dirty region. 77 * and the first dirty region.
69 */ 78 */
70 nvecs = 2; 79 *nvecs += 2;
80 *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK;
71 81
72 while (last_bit != -1) { 82 while (last_bit != -1) {
73 /* 83 /*
@@ -87,18 +97,17 @@ xfs_buf_item_size_segment(
87 break; 97 break;
88 } else if (next_bit != last_bit + 1) { 98 } else if (next_bit != last_bit + 1) {
89 last_bit = next_bit; 99 last_bit = next_bit;
90 nvecs++; 100 (*nvecs)++;
91 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != 101 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
92 (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + 102 (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
93 XFS_BLF_CHUNK)) { 103 XFS_BLF_CHUNK)) {
94 last_bit = next_bit; 104 last_bit = next_bit;
95 nvecs++; 105 (*nvecs)++;
96 } else { 106 } else {
97 last_bit++; 107 last_bit++;
98 } 108 }
109 *nbytes += XFS_BLF_CHUNK;
99 } 110 }
100
101 return nvecs;
102} 111}
103 112
104/* 113/*
@@ -118,12 +127,13 @@ xfs_buf_item_size_segment(
118 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log 127 * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
119 * format structures. 128 * format structures.
120 */ 129 */
121STATIC uint 130STATIC void
122xfs_buf_item_size( 131xfs_buf_item_size(
123 struct xfs_log_item *lip) 132 struct xfs_log_item *lip,
133 int *nvecs,
134 int *nbytes)
124{ 135{
125 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 136 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
126 uint nvecs;
127 int i; 137 int i;
128 138
129 ASSERT(atomic_read(&bip->bli_refcount) > 0); 139 ASSERT(atomic_read(&bip->bli_refcount) > 0);
@@ -135,7 +145,11 @@ xfs_buf_item_size(
135 */ 145 */
136 trace_xfs_buf_item_size_stale(bip); 146 trace_xfs_buf_item_size_stale(bip);
137 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); 147 ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
138 return bip->bli_format_count; 148 *nvecs += bip->bli_format_count;
149 for (i = 0; i < bip->bli_format_count; i++) {
150 *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]);
151 }
152 return;
139 } 153 }
140 154
141 ASSERT(bip->bli_flags & XFS_BLI_LOGGED); 155 ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
@@ -147,7 +161,8 @@ xfs_buf_item_size(
147 * commit, so no vectors are used at all. 161 * commit, so no vectors are used at all.
148 */ 162 */
149 trace_xfs_buf_item_size_ordered(bip); 163 trace_xfs_buf_item_size_ordered(bip);
150 return XFS_LOG_VEC_ORDERED; 164 *nvecs = XFS_LOG_VEC_ORDERED;
165 return;
151 } 166 }
152 167
153 /* 168 /*
@@ -159,13 +174,11 @@ xfs_buf_item_size(
159 * count for the extra buf log format structure that will need to be 174 * count for the extra buf log format structure that will need to be
160 * written. 175 * written.
161 */ 176 */
162 nvecs = 0;
163 for (i = 0; i < bip->bli_format_count; i++) { 177 for (i = 0; i < bip->bli_format_count; i++) {
164 nvecs += xfs_buf_item_size_segment(bip, &bip->bli_formats[i]); 178 xfs_buf_item_size_segment(bip, &bip->bli_formats[i],
179 nvecs, nbytes);
165 } 180 }
166
167 trace_xfs_buf_item_size(bip); 181 trace_xfs_buf_item_size(bip);
168 return nvecs;
169} 182}
170 183
171static struct xfs_log_iovec * 184static struct xfs_log_iovec *
@@ -192,8 +205,7 @@ xfs_buf_item_format_segment(
192 * the actual size of the dirty bitmap rather than the size of the in 205 * the actual size of the dirty bitmap rather than the size of the in
193 * memory structure. 206 * memory structure.
194 */ 207 */
195 base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + 208 base_size = xfs_buf_log_format_size(blfp);
196 (blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
197 209
198 nvecs = 0; 210 nvecs = 0;
199 first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); 211 first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
@@ -601,11 +613,9 @@ xfs_buf_item_unlock(
601 } 613 }
602 } 614 }
603 } 615 }
604 if (clean) 616 if (clean || aborted) {
605 xfs_buf_item_relse(bp);
606 else if (aborted) {
607 if (atomic_dec_and_test(&bip->bli_refcount)) { 617 if (atomic_dec_and_test(&bip->bli_refcount)) {
608 ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); 618 ASSERT(!aborted || XFS_FORCED_SHUTDOWN(lip->li_mountp));
609 xfs_buf_item_relse(bp); 619 xfs_buf_item_relse(bp);
610 } 620 }
611 } else 621 } else
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 0f1c247dc680..db6371087fe8 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -18,101 +18,9 @@
18#ifndef __XFS_BUF_ITEM_H__ 18#ifndef __XFS_BUF_ITEM_H__
19#define __XFS_BUF_ITEM_H__ 19#define __XFS_BUF_ITEM_H__
20 20
21extern kmem_zone_t *xfs_buf_item_zone; 21/* kernel only definitions */
22
23/*
24 * This flag indicates that the buffer contains on disk inodes
25 * and requires special recovery handling.
26 */
27#define XFS_BLF_INODE_BUF (1<<0)
28/*
29 * This flag indicates that the buffer should not be replayed
30 * during recovery because its blocks are being freed.
31 */
32#define XFS_BLF_CANCEL (1<<1)
33
34/*
35 * This flag indicates that the buffer contains on disk
36 * user or group dquots and may require special recovery handling.
37 */
38#define XFS_BLF_UDQUOT_BUF (1<<2)
39#define XFS_BLF_PDQUOT_BUF (1<<3)
40#define XFS_BLF_GDQUOT_BUF (1<<4)
41
42#define XFS_BLF_CHUNK 128
43#define XFS_BLF_SHIFT 7
44#define BIT_TO_WORD_SHIFT 5
45#define NBWORD (NBBY * sizeof(unsigned int))
46
47/*
48 * This is the structure used to lay out a buf log item in the
49 * log. The data map describes which 128 byte chunks of the buffer
50 * have been logged.
51 */
52#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
53 22
54typedef struct xfs_buf_log_format { 23/* buf log item flags */
55 unsigned short blf_type; /* buf log item type indicator */
56 unsigned short blf_size; /* size of this item */
57 ushort blf_flags; /* misc state */
58 ushort blf_len; /* number of blocks in this buf */
59 __int64_t blf_blkno; /* starting blkno of this buf */
60 unsigned int blf_map_size; /* used size of data bitmap in words */
61 unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
62} xfs_buf_log_format_t;
63
64/*
65 * All buffers now need to tell recovery where the magic number
66 * is so that it can verify and calculate the CRCs on the buffer correctly
67 * once the changes have been replayed into the buffer.
68 *
69 * The type value is held in the upper 5 bits of the blf_flags field, which is
70 * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
71 */
72#define XFS_BLFT_BITS 5
73#define XFS_BLFT_SHIFT 11
74#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
75
76enum xfs_blft {
77 XFS_BLFT_UNKNOWN_BUF = 0,
78 XFS_BLFT_UDQUOT_BUF,
79 XFS_BLFT_PDQUOT_BUF,
80 XFS_BLFT_GDQUOT_BUF,
81 XFS_BLFT_BTREE_BUF,
82 XFS_BLFT_AGF_BUF,
83 XFS_BLFT_AGFL_BUF,
84 XFS_BLFT_AGI_BUF,
85 XFS_BLFT_DINO_BUF,
86 XFS_BLFT_SYMLINK_BUF,
87 XFS_BLFT_DIR_BLOCK_BUF,
88 XFS_BLFT_DIR_DATA_BUF,
89 XFS_BLFT_DIR_FREE_BUF,
90 XFS_BLFT_DIR_LEAF1_BUF,
91 XFS_BLFT_DIR_LEAFN_BUF,
92 XFS_BLFT_DA_NODE_BUF,
93 XFS_BLFT_ATTR_LEAF_BUF,
94 XFS_BLFT_ATTR_RMT_BUF,
95 XFS_BLFT_SB_BUF,
96 XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
97};
98
99static inline void
100xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
101{
102 ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
103 blf->blf_flags &= ~XFS_BLFT_MASK;
104 blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
105}
106
107static inline __uint16_t
108xfs_blft_from_flags(struct xfs_buf_log_format *blf)
109{
110 return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
111}
112
113/*
114 * buf log item flags
115 */
116#define XFS_BLI_HOLD 0x01 24#define XFS_BLI_HOLD 0x01
117#define XFS_BLI_DIRTY 0x02 25#define XFS_BLI_DIRTY 0x02
118#define XFS_BLI_STALE 0x04 26#define XFS_BLI_STALE 0x04
@@ -133,8 +41,6 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
133 { XFS_BLI_ORDERED, "ORDERED" } 41 { XFS_BLI_ORDERED, "ORDERED" }
134 42
135 43
136#ifdef __KERNEL__
137
138struct xfs_buf; 44struct xfs_buf;
139struct xfs_mount; 45struct xfs_mount;
140struct xfs_buf_log_item; 46struct xfs_buf_log_item;
@@ -169,6 +75,6 @@ void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
169 enum xfs_blft); 75 enum xfs_blft);
170void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp); 76void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp);
171 77
172#endif /* __KERNEL__ */ 78extern kmem_zone_t *xfs_buf_item_zone;
173 79
174#endif /* __XFS_BUF_ITEM_H__ */ 80#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 0b8b2a13cd24..d4e59a4ff59f 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -27,8 +27,8 @@
27#include "xfs_mount.h" 27#include "xfs_mount.h"
28#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
29#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
30#include "xfs_dir2.h"
31#include "xfs_dir2_format.h" 30#include "xfs_dir2_format.h"
31#include "xfs_dir2.h"
32#include "xfs_dir2_priv.h" 32#include "xfs_dir2_priv.h"
33#include "xfs_dinode.h" 33#include "xfs_dinode.h"
34#include "xfs_inode.h" 34#include "xfs_inode.h"
@@ -399,7 +399,7 @@ xfs_da3_split(
399 struct xfs_da_intnode *node; 399 struct xfs_da_intnode *node;
400 struct xfs_buf *bp; 400 struct xfs_buf *bp;
401 int max; 401 int max;
402 int action; 402 int action = 0;
403 int error; 403 int error;
404 int i; 404 int i;
405 405
@@ -2454,9 +2454,9 @@ static int
2454xfs_buf_map_from_irec( 2454xfs_buf_map_from_irec(
2455 struct xfs_mount *mp, 2455 struct xfs_mount *mp,
2456 struct xfs_buf_map **mapp, 2456 struct xfs_buf_map **mapp,
2457 unsigned int *nmaps, 2457 int *nmaps,
2458 struct xfs_bmbt_irec *irecs, 2458 struct xfs_bmbt_irec *irecs,
2459 unsigned int nirecs) 2459 int nirecs)
2460{ 2460{
2461 struct xfs_buf_map *map; 2461 struct xfs_buf_map *map;
2462 int i; 2462 int i;
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 6fb3371c63cf..b1f267995dea 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -133,12 +133,19 @@ extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to,
133 struct xfs_da3_icnode_hdr *from); 133 struct xfs_da3_icnode_hdr *from);
134 134
135static inline int 135static inline int
136xfs_da3_node_hdr_size(struct xfs_da_intnode *dap) 136__xfs_da3_node_hdr_size(bool v3)
137{ 137{
138 if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) 138 if (v3)
139 return sizeof(struct xfs_da3_node_hdr); 139 return sizeof(struct xfs_da3_node_hdr);
140 return sizeof(struct xfs_da_node_hdr); 140 return sizeof(struct xfs_da_node_hdr);
141} 141}
142static inline int
143xfs_da3_node_hdr_size(struct xfs_da_intnode *dap)
144{
145 bool v3 = dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC);
146
147 return __xfs_da3_node_hdr_size(v3);
148}
142 149
143static inline struct xfs_da_node_entry * 150static inline struct xfs_da_node_entry *
144xfs_da3_node_tree_p(struct xfs_da_intnode *dap) 151xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
@@ -176,6 +183,7 @@ enum xfs_dacmp {
176typedef struct xfs_da_args { 183typedef struct xfs_da_args {
177 const __uint8_t *name; /* string (maybe not NULL terminated) */ 184 const __uint8_t *name; /* string (maybe not NULL terminated) */
178 int namelen; /* length of string (maybe no NULL) */ 185 int namelen; /* length of string (maybe no NULL) */
186 __uint8_t filetype; /* filetype of inode for directories */
179 __uint8_t *value; /* set of bytes (maybe contain NULLs) */ 187 __uint8_t *value; /* set of bytes (maybe contain NULLs) */
180 int valuelen; /* length of value */ 188 int valuelen; /* length of value */
181 int flags; /* argument flags (eg: ATTR_NOCREATE) */ 189 int flags; /* argument flags (eg: ATTR_NOCREATE) */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
deleted file mode 100644
index e36445ceaf80..000000000000
--- a/fs/xfs/xfs_dfrag.c
+++ /dev/null
@@ -1,459 +0,0 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_trans.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h"
26#include "xfs_bmap_btree.h"
27#include "xfs_alloc_btree.h"
28#include "xfs_ialloc_btree.h"
29#include "xfs_btree.h"
30#include "xfs_dinode.h"
31#include "xfs_inode.h"
32#include "xfs_inode_item.h"
33#include "xfs_bmap.h"
34#include "xfs_itable.h"
35#include "xfs_dfrag.h"
36#include "xfs_error.h"
37#include "xfs_vnodeops.h"
38#include "xfs_trace.h"
39
40
41static int xfs_swap_extents(
42 xfs_inode_t *ip, /* target inode */
43 xfs_inode_t *tip, /* tmp inode */
44 xfs_swapext_t *sxp);
45
46/*
47 * ioctl interface for swapext
48 */
49int
50xfs_swapext(
51 xfs_swapext_t *sxp)
52{
53 xfs_inode_t *ip, *tip;
54 struct fd f, tmp;
55 int error = 0;
56
57 /* Pull information for the target fd */
58 f = fdget((int)sxp->sx_fdtarget);
59 if (!f.file) {
60 error = XFS_ERROR(EINVAL);
61 goto out;
62 }
63
64 if (!(f.file->f_mode & FMODE_WRITE) ||
65 !(f.file->f_mode & FMODE_READ) ||
66 (f.file->f_flags & O_APPEND)) {
67 error = XFS_ERROR(EBADF);
68 goto out_put_file;
69 }
70
71 tmp = fdget((int)sxp->sx_fdtmp);
72 if (!tmp.file) {
73 error = XFS_ERROR(EINVAL);
74 goto out_put_file;
75 }
76
77 if (!(tmp.file->f_mode & FMODE_WRITE) ||
78 !(tmp.file->f_mode & FMODE_READ) ||
79 (tmp.file->f_flags & O_APPEND)) {
80 error = XFS_ERROR(EBADF);
81 goto out_put_tmp_file;
82 }
83
84 if (IS_SWAPFILE(file_inode(f.file)) ||
85 IS_SWAPFILE(file_inode(tmp.file))) {
86 error = XFS_ERROR(EINVAL);
87 goto out_put_tmp_file;
88 }
89
90 ip = XFS_I(file_inode(f.file));
91 tip = XFS_I(file_inode(tmp.file));
92
93 if (ip->i_mount != tip->i_mount) {
94 error = XFS_ERROR(EINVAL);
95 goto out_put_tmp_file;
96 }
97
98 if (ip->i_ino == tip->i_ino) {
99 error = XFS_ERROR(EINVAL);
100 goto out_put_tmp_file;
101 }
102
103 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
104 error = XFS_ERROR(EIO);
105 goto out_put_tmp_file;
106 }
107
108 error = xfs_swap_extents(ip, tip, sxp);
109
110 out_put_tmp_file:
111 fdput(tmp);
112 out_put_file:
113 fdput(f);
114 out:
115 return error;
116}
117
118/*
119 * We need to check that the format of the data fork in the temporary inode is
120 * valid for the target inode before doing the swap. This is not a problem with
121 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
122 * data fork depending on the space the attribute fork is taking so we can get
123 * invalid formats on the target inode.
124 *
125 * E.g. target has space for 7 extents in extent format, temp inode only has
126 * space for 6. If we defragment down to 7 extents, then the tmp format is a
127 * btree, but when swapped it needs to be in extent format. Hence we can't just
128 * blindly swap data forks on attr2 filesystems.
129 *
130 * Note that we check the swap in both directions so that we don't end up with
131 * a corrupt temporary inode, either.
132 *
133 * Note that fixing the way xfs_fsr sets up the attribute fork in the source
134 * inode will prevent this situation from occurring, so all we do here is
135 * reject and log the attempt. basically we are putting the responsibility on
136 * userspace to get this right.
137 */
138static int
139xfs_swap_extents_check_format(
140 xfs_inode_t *ip, /* target inode */
141 xfs_inode_t *tip) /* tmp inode */
142{
143
144 /* Should never get a local format */
145 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
146 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
147 return EINVAL;
148
149 /*
150 * if the target inode has less extents that then temporary inode then
151 * why did userspace call us?
152 */
153 if (ip->i_d.di_nextents < tip->i_d.di_nextents)
154 return EINVAL;
155
156 /*
157 * if the target inode is in extent form and the temp inode is in btree
158 * form then we will end up with the target inode in the wrong format
159 * as we already know there are less extents in the temp inode.
160 */
161 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
162 tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
163 return EINVAL;
164
165 /* Check temp in extent form to max in target */
166 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
167 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
168 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
169 return EINVAL;
170
171 /* Check target in extent form to max in temp */
172 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
173 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
174 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
175 return EINVAL;
176
177 /*
178 * If we are in a btree format, check that the temp root block will fit
179 * in the target and that it has enough extents to be in btree format
180 * in the target.
181 *
182 * Note that we have to be careful to allow btree->extent conversions
183 * (a common defrag case) which will occur when the temp inode is in
184 * extent format...
185 */
186 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
187 if (XFS_IFORK_BOFF(ip) &&
188 XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
189 return EINVAL;
190 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
191 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
192 return EINVAL;
193 }
194
195 /* Reciprocal target->temp btree format checks */
196 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
197 if (XFS_IFORK_BOFF(tip) &&
198 XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
199 return EINVAL;
200 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
201 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
202 return EINVAL;
203 }
204
205 return 0;
206}
207
208static int
209xfs_swap_extents(
210 xfs_inode_t *ip, /* target inode */
211 xfs_inode_t *tip, /* tmp inode */
212 xfs_swapext_t *sxp)
213{
214 xfs_mount_t *mp = ip->i_mount;
215 xfs_trans_t *tp;
216 xfs_bstat_t *sbp = &sxp->sx_stat;
217 xfs_ifork_t *tempifp, *ifp, *tifp;
218 int src_log_flags, target_log_flags;
219 int error = 0;
220 int aforkblks = 0;
221 int taforkblks = 0;
222 __uint64_t tmp;
223
224 /*
225 * We have no way of updating owner information in the BMBT blocks for
226 * each inode on CRC enabled filesystems, so to avoid corrupting the
227 * this metadata we simply don't allow extent swaps to occur.
228 */
229 if (xfs_sb_version_hascrc(&mp->m_sb))
230 return XFS_ERROR(EINVAL);
231
232 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
233 if (!tempifp) {
234 error = XFS_ERROR(ENOMEM);
235 goto out;
236 }
237
238 /*
239 * we have to do two separate lock calls here to keep lockdep
240 * happy. If we try to get all the locks in one call, lock will
241 * report false positives when we drop the ILOCK and regain them
242 * below.
243 */
244 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
245 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
246
247 /* Verify that both files have the same format */
248 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
249 error = XFS_ERROR(EINVAL);
250 goto out_unlock;
251 }
252
253 /* Verify both files are either real-time or non-realtime */
254 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
255 error = XFS_ERROR(EINVAL);
256 goto out_unlock;
257 }
258
259 error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
260 if (error)
261 goto out_unlock;
262 truncate_pagecache_range(VFS_I(tip), 0, -1);
263
264 /* Verify O_DIRECT for ftmp */
265 if (VN_CACHED(VFS_I(tip)) != 0) {
266 error = XFS_ERROR(EINVAL);
267 goto out_unlock;
268 }
269
270 /* Verify all data are being swapped */
271 if (sxp->sx_offset != 0 ||
272 sxp->sx_length != ip->i_d.di_size ||
273 sxp->sx_length != tip->i_d.di_size) {
274 error = XFS_ERROR(EFAULT);
275 goto out_unlock;
276 }
277
278 trace_xfs_swap_extent_before(ip, 0);
279 trace_xfs_swap_extent_before(tip, 1);
280
281 /* check inode formats now that data is flushed */
282 error = xfs_swap_extents_check_format(ip, tip);
283 if (error) {
284 xfs_notice(mp,
285 "%s: inode 0x%llx format is incompatible for exchanging.",
286 __func__, ip->i_ino);
287 goto out_unlock;
288 }
289
290 /*
291 * Compare the current change & modify times with that
292 * passed in. If they differ, we abort this swap.
293 * This is the mechanism used to ensure the calling
294 * process that the file was not changed out from
295 * under it.
296 */
297 if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
298 (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
299 (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
300 (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
301 error = XFS_ERROR(EBUSY);
302 goto out_unlock;
303 }
304
305 /* We need to fail if the file is memory mapped. Once we have tossed
306 * all existing pages, the page fault will have no option
307 * but to go to the filesystem for pages. By making the page fault call
308 * vop_read (or write in the case of autogrow) they block on the iolock
309 * until we have switched the extents.
310 */
311 if (VN_MAPPED(VFS_I(ip))) {
312 error = XFS_ERROR(EBUSY);
313 goto out_unlock;
314 }
315
316 xfs_iunlock(ip, XFS_ILOCK_EXCL);
317 xfs_iunlock(tip, XFS_ILOCK_EXCL);
318
319 /*
320 * There is a race condition here since we gave up the
321 * ilock. However, the data fork will not change since
322 * we have the iolock (locked for truncation too) so we
323 * are safe. We don't really care if non-io related
324 * fields change.
325 */
326 truncate_pagecache_range(VFS_I(ip), 0, -1);
327
328 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
329 if ((error = xfs_trans_reserve(tp, 0,
330 XFS_ICHANGE_LOG_RES(mp), 0,
331 0, 0))) {
332 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
333 xfs_iunlock(tip, XFS_IOLOCK_EXCL);
334 xfs_trans_cancel(tp, 0);
335 goto out;
336 }
337 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
338
339 /*
340 * Count the number of extended attribute blocks
341 */
342 if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
343 (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
344 error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
345 if (error)
346 goto out_trans_cancel;
347 }
348 if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
349 (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
350 error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
351 &taforkblks);
352 if (error)
353 goto out_trans_cancel;
354 }
355
356 /*
357 * Swap the data forks of the inodes
358 */
359 ifp = &ip->i_df;
360 tifp = &tip->i_df;
361 *tempifp = *ifp; /* struct copy */
362 *ifp = *tifp; /* struct copy */
363 *tifp = *tempifp; /* struct copy */
364
365 /*
366 * Fix the on-disk inode values
367 */
368 tmp = (__uint64_t)ip->i_d.di_nblocks;
369 ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
370 tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
371
372 tmp = (__uint64_t) ip->i_d.di_nextents;
373 ip->i_d.di_nextents = tip->i_d.di_nextents;
374 tip->i_d.di_nextents = tmp;
375
376 tmp = (__uint64_t) ip->i_d.di_format;
377 ip->i_d.di_format = tip->i_d.di_format;
378 tip->i_d.di_format = tmp;
379
380 /*
381 * The extents in the source inode could still contain speculative
382 * preallocation beyond EOF (e.g. the file is open but not modified
383 * while defrag is in progress). In that case, we need to copy over the
384 * number of delalloc blocks the data fork in the source inode is
385 * tracking beyond EOF so that when the fork is truncated away when the
386 * temporary inode is unlinked we don't underrun the i_delayed_blks
387 * counter on that inode.
388 */
389 ASSERT(tip->i_delayed_blks == 0);
390 tip->i_delayed_blks = ip->i_delayed_blks;
391 ip->i_delayed_blks = 0;
392
393 src_log_flags = XFS_ILOG_CORE;
394 switch (ip->i_d.di_format) {
395 case XFS_DINODE_FMT_EXTENTS:
396 /* If the extents fit in the inode, fix the
397 * pointer. Otherwise it's already NULL or
398 * pointing to the extent.
399 */
400 if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
401 ifp->if_u1.if_extents =
402 ifp->if_u2.if_inline_ext;
403 }
404 src_log_flags |= XFS_ILOG_DEXT;
405 break;
406 case XFS_DINODE_FMT_BTREE:
407 src_log_flags |= XFS_ILOG_DBROOT;
408 break;
409 }
410
411 target_log_flags = XFS_ILOG_CORE;
412 switch (tip->i_d.di_format) {
413 case XFS_DINODE_FMT_EXTENTS:
414 /* If the extents fit in the inode, fix the
415 * pointer. Otherwise it's already NULL or
416 * pointing to the extent.
417 */
418 if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
419 tifp->if_u1.if_extents =
420 tifp->if_u2.if_inline_ext;
421 }
422 target_log_flags |= XFS_ILOG_DEXT;
423 break;
424 case XFS_DINODE_FMT_BTREE:
425 target_log_flags |= XFS_ILOG_DBROOT;
426 break;
427 }
428
429
430 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
431 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
432
433 xfs_trans_log_inode(tp, ip, src_log_flags);
434 xfs_trans_log_inode(tp, tip, target_log_flags);
435
436 /*
437 * If this is a synchronous mount, make sure that the
438 * transaction goes to disk before returning to the user.
439 */
440 if (mp->m_flags & XFS_MOUNT_WSYNC)
441 xfs_trans_set_sync(tp);
442
443 error = xfs_trans_commit(tp, 0);
444
445 trace_xfs_swap_extent_after(ip, 0);
446 trace_xfs_swap_extent_after(tip, 1);
447out:
448 kmem_free(tempifp);
449 return error;
450
451out_unlock:
452 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
453 xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
454 goto out;
455
456out_trans_cancel:
457 xfs_trans_cancel(tp, 0);
458 goto out_unlock;
459}
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
deleted file mode 100644
index 20bdd935c121..000000000000
--- a/fs/xfs/xfs_dfrag.h
+++ /dev/null
@@ -1,53 +0,0 @@
1/*
2 * Copyright (c) 2000,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_DFRAG_H__
19#define __XFS_DFRAG_H__
20
21/*
22 * Structure passed to xfs_swapext
23 */
24
25typedef struct xfs_swapext
26{
27 __int64_t sx_version; /* version */
28 __int64_t sx_fdtarget; /* fd of target file */
29 __int64_t sx_fdtmp; /* fd of tmp file */
30 xfs_off_t sx_offset; /* offset into file */
31 xfs_off_t sx_length; /* leng from offset */
32 char sx_pad[16]; /* pad space, unused */
33 xfs_bstat_t sx_stat; /* stat of target b4 copy */
34} xfs_swapext_t;
35
36/*
37 * Version flag
38 */
39#define XFS_SX_VERSION 0
40
41#ifdef __KERNEL__
42/*
43 * Prototypes for visible xfs_dfrag.c routines.
44 */
45
46/*
47 * Syscall interface for xfs_swapext
48 */
49int xfs_swapext(struct xfs_swapext *sx);
50
51#endif /* __KERNEL__ */
52
53#endif /* __XFS_DFRAG_H__ */
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 8f023dee404d..edf203ab50af 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -31,14 +31,14 @@
31#include "xfs_inode.h" 31#include "xfs_inode.h"
32#include "xfs_inode_item.h" 32#include "xfs_inode_item.h"
33#include "xfs_bmap.h" 33#include "xfs_bmap.h"
34#include "xfs_dir2.h"
35#include "xfs_dir2_format.h" 34#include "xfs_dir2_format.h"
35#include "xfs_dir2.h"
36#include "xfs_dir2_priv.h" 36#include "xfs_dir2_priv.h"
37#include "xfs_error.h" 37#include "xfs_error.h"
38#include "xfs_vnodeops.h"
39#include "xfs_trace.h" 38#include "xfs_trace.h"
40 39
41struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2}; 40struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
41
42 42
43/* 43/*
44 * ASCII case-insensitive (ie. A-Z) support for directories that was 44 * ASCII case-insensitive (ie. A-Z) support for directories that was
@@ -90,6 +90,9 @@ void
90xfs_dir_mount( 90xfs_dir_mount(
91 xfs_mount_t *mp) 91 xfs_mount_t *mp)
92{ 92{
93 int nodehdr_size;
94
95
93 ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb)); 96 ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb));
94 ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= 97 ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
95 XFS_MAX_BLOCKSIZE); 98 XFS_MAX_BLOCKSIZE);
@@ -98,12 +101,13 @@ xfs_dir_mount(
98 mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp)); 101 mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp));
99 mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); 102 mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
100 mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp)); 103 mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp));
101 mp->m_attr_node_ents = 104
102 (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) / 105 nodehdr_size = __xfs_da3_node_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
103 (uint)sizeof(xfs_da_node_entry_t); 106 mp->m_attr_node_ents = (mp->m_sb.sb_blocksize - nodehdr_size) /
104 mp->m_dir_node_ents = 107 (uint)sizeof(xfs_da_node_entry_t);
105 (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) / 108 mp->m_dir_node_ents = (mp->m_dirblksize - nodehdr_size) /
106 (uint)sizeof(xfs_da_node_entry_t); 109 (uint)sizeof(xfs_da_node_entry_t);
110
107 mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; 111 mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
108 if (xfs_sb_version_hasasciici(&mp->m_sb)) 112 if (xfs_sb_version_hasasciici(&mp->m_sb))
109 mp->m_dirnameops = &xfs_ascii_ci_nameops; 113 mp->m_dirnameops = &xfs_ascii_ci_nameops;
@@ -209,6 +213,7 @@ xfs_dir_createname(
209 memset(&args, 0, sizeof(xfs_da_args_t)); 213 memset(&args, 0, sizeof(xfs_da_args_t));
210 args.name = name->name; 214 args.name = name->name;
211 args.namelen = name->len; 215 args.namelen = name->len;
216 args.filetype = name->type;
212 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 217 args.hashval = dp->i_mount->m_dirnameops->hashname(name);
213 args.inumber = inum; 218 args.inumber = inum;
214 args.dp = dp; 219 args.dp = dp;
@@ -283,6 +288,7 @@ xfs_dir_lookup(
283 memset(&args, 0, sizeof(xfs_da_args_t)); 288 memset(&args, 0, sizeof(xfs_da_args_t));
284 args.name = name->name; 289 args.name = name->name;
285 args.namelen = name->len; 290 args.namelen = name->len;
291 args.filetype = name->type;
286 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 292 args.hashval = dp->i_mount->m_dirnameops->hashname(name);
287 args.dp = dp; 293 args.dp = dp;
288 args.whichfork = XFS_DATA_FORK; 294 args.whichfork = XFS_DATA_FORK;
@@ -338,6 +344,7 @@ xfs_dir_removename(
338 memset(&args, 0, sizeof(xfs_da_args_t)); 344 memset(&args, 0, sizeof(xfs_da_args_t));
339 args.name = name->name; 345 args.name = name->name;
340 args.namelen = name->len; 346 args.namelen = name->len;
347 args.filetype = name->type;
341 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 348 args.hashval = dp->i_mount->m_dirnameops->hashname(name);
342 args.inumber = ino; 349 args.inumber = ino;
343 args.dp = dp; 350 args.dp = dp;
@@ -363,37 +370,6 @@ xfs_dir_removename(
363} 370}
364 371
365/* 372/*
366 * Read a directory.
367 */
368int
369xfs_readdir(
370 xfs_inode_t *dp,
371 struct dir_context *ctx,
372 size_t bufsize)
373{
374 int rval; /* return value */
375 int v; /* type-checking value */
376
377 trace_xfs_readdir(dp);
378
379 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
380 return XFS_ERROR(EIO);
381
382 ASSERT(S_ISDIR(dp->i_d.di_mode));
383 XFS_STATS_INC(xs_dir_getdents);
384
385 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
386 rval = xfs_dir2_sf_getdents(dp, ctx);
387 else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
388 ;
389 else if (v)
390 rval = xfs_dir2_block_getdents(dp, ctx);
391 else
392 rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
393 return rval;
394}
395
396/*
397 * Replace the inode number of a directory entry. 373 * Replace the inode number of a directory entry.
398 */ 374 */
399int 375int
@@ -418,6 +394,7 @@ xfs_dir_replace(
418 memset(&args, 0, sizeof(xfs_da_args_t)); 394 memset(&args, 0, sizeof(xfs_da_args_t));
419 args.name = name->name; 395 args.name = name->name;
420 args.namelen = name->len; 396 args.namelen = name->len;
397 args.filetype = name->type;
421 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 398 args.hashval = dp->i_mount->m_dirnameops->hashname(name);
422 args.inumber = inum; 399 args.inumber = inum;
423 args.dp = dp; 400 args.dp = dp;
@@ -465,6 +442,7 @@ xfs_dir_canenter(
465 memset(&args, 0, sizeof(xfs_da_args_t)); 442 memset(&args, 0, sizeof(xfs_da_args_t));
466 args.name = name->name; 443 args.name = name->name;
467 args.namelen = name->len; 444 args.namelen = name->len;
445 args.filetype = name->type;
468 args.hashval = dp->i_mount->m_dirnameops->hashname(name); 446 args.hashval = dp->i_mount->m_dirnameops->hashname(name);
469 args.dp = dp; 447 args.dp = dp;
470 args.whichfork = XFS_DATA_FORK; 448 args.whichfork = XFS_DATA_FORK;
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index e937d9991c18..9910401327d4 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -23,6 +23,11 @@ struct xfs_da_args;
23struct xfs_inode; 23struct xfs_inode;
24struct xfs_mount; 24struct xfs_mount;
25struct xfs_trans; 25struct xfs_trans;
26struct xfs_dir2_sf_hdr;
27struct xfs_dir2_sf_entry;
28struct xfs_dir2_data_hdr;
29struct xfs_dir2_data_entry;
30struct xfs_dir2_data_unused;
26 31
27extern struct xfs_name xfs_name_dotdot; 32extern struct xfs_name xfs_name_dotdot;
28 33
@@ -57,4 +62,45 @@ extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
57 */ 62 */
58extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); 63extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
59 64
65/*
66 * Interface routines used by userspace utilities
67 */
68extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
69extern void xfs_dir2_sf_put_parent_ino(struct xfs_dir2_sf_hdr *sfp,
70 xfs_ino_t ino);
71extern xfs_ino_t xfs_dir3_sfe_get_ino(struct xfs_mount *mp,
72 struct xfs_dir2_sf_hdr *sfp, struct xfs_dir2_sf_entry *sfep);
73extern void xfs_dir3_sfe_put_ino(struct xfs_mount *mp,
74 struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep,
75 xfs_ino_t ino);
76
77extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
78extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
79extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
80 struct xfs_buf *bp);
81
82extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
83 struct xfs_dir2_data_hdr *hdr, int *loghead);
84extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp,
85 struct xfs_dir2_data_entry *dep);
86extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
87 struct xfs_buf *bp);
88extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp,
89 struct xfs_dir2_data_unused *dup);
90extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp,
91 xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
92 int *needlogp, int *needscanp);
93extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
94 struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset,
95 xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
96
97extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
98 struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup);
99
100extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
101extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
102extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
103extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
104extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
105
60#endif /* __XFS_DIR2_H__ */ 106#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 5e7fbd72cf52..0957aa98b6c0 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -31,8 +31,8 @@
31#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
32#include "xfs_bmap.h" 32#include "xfs_bmap.h"
33#include "xfs_buf_item.h" 33#include "xfs_buf_item.h"
34#include "xfs_dir2.h"
35#include "xfs_dir2_format.h" 34#include "xfs_dir2_format.h"
35#include "xfs_dir2.h"
36#include "xfs_dir2_priv.h" 36#include "xfs_dir2_priv.h"
37#include "xfs_error.h" 37#include "xfs_error.h"
38#include "xfs_trace.h" 38#include "xfs_trace.h"
@@ -126,7 +126,7 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
126 .verify_write = xfs_dir3_block_write_verify, 126 .verify_write = xfs_dir3_block_write_verify,
127}; 127};
128 128
129static int 129int
130xfs_dir3_block_read( 130xfs_dir3_block_read(
131 struct xfs_trans *tp, 131 struct xfs_trans *tp,
132 struct xfs_inode *dp, 132 struct xfs_inode *dp,
@@ -369,7 +369,7 @@ xfs_dir2_block_addname(
369 if (error) 369 if (error)
370 return error; 370 return error;
371 371
372 len = xfs_dir2_data_entsize(args->namelen); 372 len = xfs_dir3_data_entsize(mp, args->namelen);
373 373
374 /* 374 /*
375 * Set up pointers to parts of the block. 375 * Set up pointers to parts of the block.
@@ -549,7 +549,8 @@ xfs_dir2_block_addname(
549 dep->inumber = cpu_to_be64(args->inumber); 549 dep->inumber = cpu_to_be64(args->inumber);
550 dep->namelen = args->namelen; 550 dep->namelen = args->namelen;
551 memcpy(dep->name, args->name, args->namelen); 551 memcpy(dep->name, args->name, args->namelen);
552 tagp = xfs_dir2_data_entry_tag_p(dep); 552 xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
553 tagp = xfs_dir3_data_entry_tag_p(mp, dep);
553 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 554 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
554 /* 555 /*
555 * Clean up the bestfree array and log the header, tail, and entry. 556 * Clean up the bestfree array and log the header, tail, and entry.
@@ -565,101 +566,6 @@ xfs_dir2_block_addname(
565} 566}
566 567
567/* 568/*
568 * Readdir for block directories.
569 */
570int /* error */
571xfs_dir2_block_getdents(
572 xfs_inode_t *dp, /* incore inode */
573 struct dir_context *ctx)
574{
575 xfs_dir2_data_hdr_t *hdr; /* block header */
576 struct xfs_buf *bp; /* buffer for block */
577 xfs_dir2_block_tail_t *btp; /* block tail */
578 xfs_dir2_data_entry_t *dep; /* block data entry */
579 xfs_dir2_data_unused_t *dup; /* block unused entry */
580 char *endptr; /* end of the data entries */
581 int error; /* error return value */
582 xfs_mount_t *mp; /* filesystem mount point */
583 char *ptr; /* current data entry */
584 int wantoff; /* starting block offset */
585 xfs_off_t cook;
586
587 mp = dp->i_mount;
588 /*
589 * If the block number in the offset is out of range, we're done.
590 */
591 if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
592 return 0;
593
594 error = xfs_dir3_block_read(NULL, dp, &bp);
595 if (error)
596 return error;
597
598 /*
599 * Extract the byte offset we start at from the seek pointer.
600 * We'll skip entries before this.
601 */
602 wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
603 hdr = bp->b_addr;
604 xfs_dir3_data_check(dp, bp);
605 /*
606 * Set up values for the loop.
607 */
608 btp = xfs_dir2_block_tail_p(mp, hdr);
609 ptr = (char *)xfs_dir3_data_entry_p(hdr);
610 endptr = (char *)xfs_dir2_block_leaf_p(btp);
611
612 /*
613 * Loop over the data portion of the block.
614 * Each object is a real entry (dep) or an unused one (dup).
615 */
616 while (ptr < endptr) {
617 dup = (xfs_dir2_data_unused_t *)ptr;
618 /*
619 * Unused, skip it.
620 */
621 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
622 ptr += be16_to_cpu(dup->length);
623 continue;
624 }
625
626 dep = (xfs_dir2_data_entry_t *)ptr;
627
628 /*
629 * Bump pointer for the next iteration.
630 */
631 ptr += xfs_dir2_data_entsize(dep->namelen);
632 /*
633 * The entry is before the desired starting point, skip it.
634 */
635 if ((char *)dep - (char *)hdr < wantoff)
636 continue;
637
638 cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
639 (char *)dep - (char *)hdr);
640
641 ctx->pos = cook & 0x7fffffff;
642 /*
643 * If it didn't fit, set the final offset to here & return.
644 */
645 if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
646 be64_to_cpu(dep->inumber), DT_UNKNOWN)) {
647 xfs_trans_brelse(NULL, bp);
648 return 0;
649 }
650 }
651
652 /*
653 * Reached the end of the block.
654 * Set the offset to a non-existent block 1 and return.
655 */
656 ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
657 0x7fffffff;
658 xfs_trans_brelse(NULL, bp);
659 return 0;
660}
661
662/*
663 * Log leaf entries from the block. 569 * Log leaf entries from the block.
664 */ 570 */
665static void 571static void
@@ -736,6 +642,7 @@ xfs_dir2_block_lookup(
736 * Fill in inode number, CI name if appropriate, release the block. 642 * Fill in inode number, CI name if appropriate, release the block.
737 */ 643 */
738 args->inumber = be64_to_cpu(dep->inumber); 644 args->inumber = be64_to_cpu(dep->inumber);
645 args->filetype = xfs_dir3_dirent_get_ftype(mp, dep);
739 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); 646 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
740 xfs_trans_brelse(args->trans, bp); 647 xfs_trans_brelse(args->trans, bp);
741 return XFS_ERROR(error); 648 return XFS_ERROR(error);
@@ -894,7 +801,7 @@ xfs_dir2_block_removename(
894 needlog = needscan = 0; 801 needlog = needscan = 0;
895 xfs_dir2_data_make_free(tp, bp, 802 xfs_dir2_data_make_free(tp, bp,
896 (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), 803 (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
897 xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); 804 xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
898 /* 805 /*
899 * Fix up the block tail. 806 * Fix up the block tail.
900 */ 807 */
@@ -968,6 +875,7 @@ xfs_dir2_block_replace(
968 * Change the inode number to the new value. 875 * Change the inode number to the new value.
969 */ 876 */
970 dep->inumber = cpu_to_be64(args->inumber); 877 dep->inumber = cpu_to_be64(args->inumber);
878 xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
971 xfs_dir2_data_log_entry(args->trans, bp, dep); 879 xfs_dir2_data_log_entry(args->trans, bp, dep);
972 xfs_dir3_data_check(dp, bp); 880 xfs_dir3_data_check(dp, bp);
973 return 0; 881 return 0;
@@ -1254,7 +1162,8 @@ xfs_dir2_sf_to_block(
1254 dep->inumber = cpu_to_be64(dp->i_ino); 1162 dep->inumber = cpu_to_be64(dp->i_ino);
1255 dep->namelen = 1; 1163 dep->namelen = 1;
1256 dep->name[0] = '.'; 1164 dep->name[0] = '.';
1257 tagp = xfs_dir2_data_entry_tag_p(dep); 1165 xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
1166 tagp = xfs_dir3_data_entry_tag_p(mp, dep);
1258 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 1167 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
1259 xfs_dir2_data_log_entry(tp, bp, dep); 1168 xfs_dir2_data_log_entry(tp, bp, dep);
1260 blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); 1169 blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
@@ -1267,7 +1176,8 @@ xfs_dir2_sf_to_block(
1267 dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp)); 1176 dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
1268 dep->namelen = 2; 1177 dep->namelen = 2;
1269 dep->name[0] = dep->name[1] = '.'; 1178 dep->name[0] = dep->name[1] = '.';
1270 tagp = xfs_dir2_data_entry_tag_p(dep); 1179 xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
1180 tagp = xfs_dir3_data_entry_tag_p(mp, dep);
1271 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 1181 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
1272 xfs_dir2_data_log_entry(tp, bp, dep); 1182 xfs_dir2_data_log_entry(tp, bp, dep);
1273 blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); 1183 blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
@@ -1312,10 +1222,12 @@ xfs_dir2_sf_to_block(
1312 * Copy a real entry. 1222 * Copy a real entry.
1313 */ 1223 */
1314 dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset); 1224 dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
1315 dep->inumber = cpu_to_be64(xfs_dir2_sfe_get_ino(sfp, sfep)); 1225 dep->inumber = cpu_to_be64(xfs_dir3_sfe_get_ino(mp, sfp, sfep));
1316 dep->namelen = sfep->namelen; 1226 dep->namelen = sfep->namelen;
1227 xfs_dir3_dirent_put_ftype(mp, dep,
1228 xfs_dir3_sfe_get_ftype(mp, sfp, sfep));
1317 memcpy(dep->name, sfep->name, dep->namelen); 1229 memcpy(dep->name, sfep->name, dep->namelen);
1318 tagp = xfs_dir2_data_entry_tag_p(dep); 1230 tagp = xfs_dir3_data_entry_tag_p(mp, dep);
1319 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 1231 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
1320 xfs_dir2_data_log_entry(tp, bp, dep); 1232 xfs_dir2_data_log_entry(tp, bp, dep);
1321 name.name = sfep->name; 1233 name.name = sfep->name;
@@ -1328,7 +1240,7 @@ xfs_dir2_sf_to_block(
1328 if (++i == sfp->count) 1240 if (++i == sfp->count)
1329 sfep = NULL; 1241 sfep = NULL;
1330 else 1242 else
1331 sfep = xfs_dir2_sf_nextentry(sfp, sfep); 1243 sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
1332 } 1244 }
1333 /* Done with the temporary buffer */ 1245 /* Done with the temporary buffer */
1334 kmem_free(sfp); 1246 kmem_free(sfp);
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index c2930238005c..47e1326c169a 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -29,14 +29,12 @@
29#include "xfs_dinode.h" 29#include "xfs_dinode.h"
30#include "xfs_inode.h" 30#include "xfs_inode.h"
31#include "xfs_dir2_format.h" 31#include "xfs_dir2_format.h"
32#include "xfs_dir2.h"
32#include "xfs_dir2_priv.h" 33#include "xfs_dir2_priv.h"
33#include "xfs_error.h" 34#include "xfs_error.h"
34#include "xfs_buf_item.h" 35#include "xfs_buf_item.h"
35#include "xfs_cksum.h" 36#include "xfs_cksum.h"
36 37
37STATIC xfs_dir2_data_free_t *
38xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
39
40/* 38/*
41 * Check the consistency of the data block. 39 * Check the consistency of the data block.
42 * The input can also be a block-format directory. 40 * The input can also be a block-format directory.
@@ -149,8 +147,10 @@ __xfs_dir3_data_check(
149 XFS_WANT_CORRUPTED_RETURN( 147 XFS_WANT_CORRUPTED_RETURN(
150 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); 148 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
151 XFS_WANT_CORRUPTED_RETURN( 149 XFS_WANT_CORRUPTED_RETURN(
152 be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == 150 be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep)) ==
153 (char *)dep - (char *)hdr); 151 (char *)dep - (char *)hdr);
152 XFS_WANT_CORRUPTED_RETURN(
153 xfs_dir3_dirent_get_ftype(mp, dep) < XFS_DIR3_FT_MAX);
154 count++; 154 count++;
155 lastfree = 0; 155 lastfree = 0;
156 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || 156 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
@@ -168,7 +168,7 @@ __xfs_dir3_data_check(
168 } 168 }
169 XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); 169 XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
170 } 170 }
171 p += xfs_dir2_data_entsize(dep->namelen); 171 p += xfs_dir3_data_entsize(mp, dep->namelen);
172 } 172 }
173 /* 173 /*
174 * Need to have seen all the entries and all the bestfree slots. 174 * Need to have seen all the entries and all the bestfree slots.
@@ -325,7 +325,7 @@ xfs_dir3_data_readahead(
325 * Given a data block and an unused entry from that block, 325 * Given a data block and an unused entry from that block,
326 * return the bestfree entry if any that corresponds to it. 326 * return the bestfree entry if any that corresponds to it.
327 */ 327 */
328STATIC xfs_dir2_data_free_t * 328xfs_dir2_data_free_t *
329xfs_dir2_data_freefind( 329xfs_dir2_data_freefind(
330 xfs_dir2_data_hdr_t *hdr, /* data block */ 330 xfs_dir2_data_hdr_t *hdr, /* data block */
331 xfs_dir2_data_unused_t *dup) /* data unused entry */ 331 xfs_dir2_data_unused_t *dup) /* data unused entry */
@@ -333,7 +333,7 @@ xfs_dir2_data_freefind(
333 xfs_dir2_data_free_t *dfp; /* bestfree entry */ 333 xfs_dir2_data_free_t *dfp; /* bestfree entry */
334 xfs_dir2_data_aoff_t off; /* offset value needed */ 334 xfs_dir2_data_aoff_t off; /* offset value needed */
335 struct xfs_dir2_data_free *bf; 335 struct xfs_dir2_data_free *bf;
336#if defined(DEBUG) && defined(__KERNEL__) 336#ifdef DEBUG
337 int matched; /* matched the value */ 337 int matched; /* matched the value */
338 int seenzero; /* saw a 0 bestfree entry */ 338 int seenzero; /* saw a 0 bestfree entry */
339#endif 339#endif
@@ -341,7 +341,7 @@ xfs_dir2_data_freefind(
341 off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); 341 off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
342 bf = xfs_dir3_data_bestfree_p(hdr); 342 bf = xfs_dir3_data_bestfree_p(hdr);
343 343
344#if defined(DEBUG) && defined(__KERNEL__) 344#ifdef DEBUG
345 /* 345 /*
346 * Validate some consistency in the bestfree table. 346 * Validate some consistency in the bestfree table.
347 * Check order, non-overlapping entries, and if we find the 347 * Check order, non-overlapping entries, and if we find the
@@ -538,8 +538,8 @@ xfs_dir2_data_freescan(
538 else { 538 else {
539 dep = (xfs_dir2_data_entry_t *)p; 539 dep = (xfs_dir2_data_entry_t *)p;
540 ASSERT((char *)dep - (char *)hdr == 540 ASSERT((char *)dep - (char *)hdr ==
541 be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep))); 541 be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep)));
542 p += xfs_dir2_data_entsize(dep->namelen); 542 p += xfs_dir3_data_entsize(mp, dep->namelen);
543 } 543 }
544 } 544 }
545} 545}
@@ -629,7 +629,8 @@ xfs_dir2_data_log_entry(
629 struct xfs_buf *bp, 629 struct xfs_buf *bp,
630 xfs_dir2_data_entry_t *dep) /* data entry pointer */ 630 xfs_dir2_data_entry_t *dep) /* data entry pointer */
631{ 631{
632 xfs_dir2_data_hdr_t *hdr = bp->b_addr; 632 struct xfs_dir2_data_hdr *hdr = bp->b_addr;
633 struct xfs_mount *mp = tp->t_mountp;
633 634
634 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || 635 ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
635 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || 636 hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
@@ -637,7 +638,7 @@ xfs_dir2_data_log_entry(
637 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); 638 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
638 639
639 xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr), 640 xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr),
640 (uint)((char *)(xfs_dir2_data_entry_tag_p(dep) + 1) - 641 (uint)((char *)(xfs_dir3_data_entry_tag_p(mp, dep) + 1) -
641 (char *)hdr - 1)); 642 (char *)hdr - 1));
642} 643}
643 644
diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h
index 7826782b8d78..a0961a61ac1a 100644
--- a/fs/xfs/xfs_dir2_format.h
+++ b/fs/xfs/xfs_dir2_format.h
@@ -69,6 +69,23 @@
69#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */ 69#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */
70 70
71/* 71/*
72 * Dirents in version 3 directories have a file type field. Additions to this
73 * list are an on-disk format change, requiring feature bits. Valid values
74 * are as follows:
75 */
76#define XFS_DIR3_FT_UNKNOWN 0
77#define XFS_DIR3_FT_REG_FILE 1
78#define XFS_DIR3_FT_DIR 2
79#define XFS_DIR3_FT_CHRDEV 3
80#define XFS_DIR3_FT_BLKDEV 4
81#define XFS_DIR3_FT_FIFO 5
82#define XFS_DIR3_FT_SOCK 6
83#define XFS_DIR3_FT_SYMLINK 7
84#define XFS_DIR3_FT_WHT 8
85
86#define XFS_DIR3_FT_MAX 9
87
88/*
72 * Byte offset in data block and shortform entry. 89 * Byte offset in data block and shortform entry.
73 */ 90 */
74typedef __uint16_t xfs_dir2_data_off_t; 91typedef __uint16_t xfs_dir2_data_off_t;
@@ -138,6 +155,9 @@ typedef struct xfs_dir2_sf_entry {
138 xfs_dir2_sf_off_t offset; /* saved offset */ 155 xfs_dir2_sf_off_t offset; /* saved offset */
139 __u8 name[]; /* name, variable size */ 156 __u8 name[]; /* name, variable size */
140 /* 157 /*
158 * A single byte containing the file type field follows the inode
159 * number for version 3 directory entries.
160 *
141 * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a 161 * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a
142 * variable offset after the name. 162 * variable offset after the name.
143 */ 163 */
@@ -162,16 +182,6 @@ xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
162 put_unaligned_be16(off, &sfep->offset.i); 182 put_unaligned_be16(off, &sfep->offset.i);
163} 183}
164 184
165static inline int
166xfs_dir2_sf_entsize(struct xfs_dir2_sf_hdr *hdr, int len)
167{
168 return sizeof(struct xfs_dir2_sf_entry) + /* namelen + offset */
169 len + /* name */
170 (hdr->i8count ? /* ino */
171 sizeof(xfs_dir2_ino8_t) :
172 sizeof(xfs_dir2_ino4_t));
173}
174
175static inline struct xfs_dir2_sf_entry * 185static inline struct xfs_dir2_sf_entry *
176xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) 186xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
177{ 187{
@@ -179,14 +189,78 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
179 ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count)); 189 ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
180} 190}
181 191
192static inline int
193xfs_dir3_sf_entsize(
194 struct xfs_mount *mp,
195 struct xfs_dir2_sf_hdr *hdr,
196 int len)
197{
198 int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
199
200 count += len; /* name */
201 count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
202 sizeof(xfs_dir2_ino4_t); /* ino # */
203 if (xfs_sb_version_hasftype(&mp->m_sb))
204 count += sizeof(__uint8_t); /* file type */
205 return count;
206}
207
182static inline struct xfs_dir2_sf_entry * 208static inline struct xfs_dir2_sf_entry *
183xfs_dir2_sf_nextentry(struct xfs_dir2_sf_hdr *hdr, 209xfs_dir3_sf_nextentry(
184 struct xfs_dir2_sf_entry *sfep) 210 struct xfs_mount *mp,
211 struct xfs_dir2_sf_hdr *hdr,
212 struct xfs_dir2_sf_entry *sfep)
185{ 213{
186 return (struct xfs_dir2_sf_entry *) 214 return (struct xfs_dir2_sf_entry *)
187 ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen)); 215 ((char *)sfep + xfs_dir3_sf_entsize(mp, hdr, sfep->namelen));
188} 216}
189 217
218/*
219 * in dir3 shortform directories, the file type field is stored at a variable
220 * offset after the inode number. Because it's only a single byte, endian
221 * conversion is not necessary.
222 */
223static inline __uint8_t *
224xfs_dir3_sfe_ftypep(
225 struct xfs_dir2_sf_hdr *hdr,
226 struct xfs_dir2_sf_entry *sfep)
227{
228 return (__uint8_t *)&sfep->name[sfep->namelen];
229}
230
231static inline __uint8_t
232xfs_dir3_sfe_get_ftype(
233 struct xfs_mount *mp,
234 struct xfs_dir2_sf_hdr *hdr,
235 struct xfs_dir2_sf_entry *sfep)
236{
237 __uint8_t *ftp;
238
239 if (!xfs_sb_version_hasftype(&mp->m_sb))
240 return XFS_DIR3_FT_UNKNOWN;
241
242 ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
243 if (*ftp >= XFS_DIR3_FT_MAX)
244 return XFS_DIR3_FT_UNKNOWN;
245 return *ftp;
246}
247
248static inline void
249xfs_dir3_sfe_put_ftype(
250 struct xfs_mount *mp,
251 struct xfs_dir2_sf_hdr *hdr,
252 struct xfs_dir2_sf_entry *sfep,
253 __uint8_t ftype)
254{
255 __uint8_t *ftp;
256
257 ASSERT(ftype < XFS_DIR3_FT_MAX);
258
259 if (!xfs_sb_version_hasftype(&mp->m_sb))
260 return;
261 ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
262 *ftp = ftype;
263}
190 264
191/* 265/*
192 * Data block structures. 266 * Data block structures.
@@ -286,12 +360,18 @@ xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
286 * Active entry in a data block. 360 * Active entry in a data block.
287 * 361 *
288 * Aligned to 8 bytes. After the variable length name field there is a 362 * Aligned to 8 bytes. After the variable length name field there is a
289 * 2 byte tag field, which can be accessed using xfs_dir2_data_entry_tag_p. 363 * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p.
364 *
365 * For dir3 structures, there is file type field between the name and the tag.
366 * This can only be manipulated by helper functions. It is packed hard against
367 * the end of the name so any padding for rounding is between the file type and
368 * the tag.
290 */ 369 */
291typedef struct xfs_dir2_data_entry { 370typedef struct xfs_dir2_data_entry {
292 __be64 inumber; /* inode number */ 371 __be64 inumber; /* inode number */
293 __u8 namelen; /* name length */ 372 __u8 namelen; /* name length */
294 __u8 name[]; /* name bytes, no null */ 373 __u8 name[]; /* name bytes, no null */
374 /* __u8 filetype; */ /* type of inode we point to */
295 /* __be16 tag; */ /* starting offset of us */ 375 /* __be16 tag; */ /* starting offset of us */
296} xfs_dir2_data_entry_t; 376} xfs_dir2_data_entry_t;
297 377
@@ -311,20 +391,67 @@ typedef struct xfs_dir2_data_unused {
311/* 391/*
312 * Size of a data entry. 392 * Size of a data entry.
313 */ 393 */
314static inline int xfs_dir2_data_entsize(int n) 394static inline int
395__xfs_dir3_data_entsize(
396 bool ftype,
397 int n)
315{ 398{
316 return (int)roundup(offsetof(struct xfs_dir2_data_entry, name[0]) + n + 399 int size = offsetof(struct xfs_dir2_data_entry, name[0]);
317 (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN); 400
401 size += n;
402 size += sizeof(xfs_dir2_data_off_t);
403 if (ftype)
404 size += sizeof(__uint8_t);
405 return roundup(size, XFS_DIR2_DATA_ALIGN);
406}
407static inline int
408xfs_dir3_data_entsize(
409 struct xfs_mount *mp,
410 int n)
411{
412 bool ftype = xfs_sb_version_hasftype(&mp->m_sb) ? true : false;
413 return __xfs_dir3_data_entsize(ftype, n);
414}
415
416static inline __uint8_t
417xfs_dir3_dirent_get_ftype(
418 struct xfs_mount *mp,
419 struct xfs_dir2_data_entry *dep)
420{
421 if (xfs_sb_version_hasftype(&mp->m_sb)) {
422 __uint8_t type = dep->name[dep->namelen];
423
424 ASSERT(type < XFS_DIR3_FT_MAX);
425 if (type < XFS_DIR3_FT_MAX)
426 return type;
427
428 }
429 return XFS_DIR3_FT_UNKNOWN;
430}
431
432static inline void
433xfs_dir3_dirent_put_ftype(
434 struct xfs_mount *mp,
435 struct xfs_dir2_data_entry *dep,
436 __uint8_t type)
437{
438 ASSERT(type < XFS_DIR3_FT_MAX);
439 ASSERT(dep->namelen != 0);
440
441 if (xfs_sb_version_hasftype(&mp->m_sb))
442 dep->name[dep->namelen] = type;
318} 443}
319 444
320/* 445/*
321 * Pointer to an entry's tag word. 446 * Pointer to an entry's tag word.
322 */ 447 */
323static inline __be16 * 448static inline __be16 *
324xfs_dir2_data_entry_tag_p(struct xfs_dir2_data_entry *dep) 449xfs_dir3_data_entry_tag_p(
450 struct xfs_mount *mp,
451 struct xfs_dir2_data_entry *dep)
325{ 452{
326 return (__be16 *)((char *)dep + 453 return (__be16 *)((char *)dep +
327 xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); 454 xfs_dir3_data_entsize(mp, dep->namelen) - sizeof(__be16));
328} 455}
329 456
330/* 457/*
@@ -375,13 +502,17 @@ xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
375 * data block header because the sfe embeds the block offset of the entry into 502 * data block header because the sfe embeds the block offset of the entry into
376 * it so that it doesn't change when format conversion occurs. Bad Things Happen 503 * it so that it doesn't change when format conversion occurs. Bad Things Happen
377 * if we don't follow this rule. 504 * if we don't follow this rule.
505 *
506 * XXX: there is scope for significant optimisation of the logic here. Right
507 * now we are checking for "dir3 format" over and over again. Ideally we should
508 * only do it once for each operation.
378 */ 509 */
379#define XFS_DIR3_DATA_DOT_OFFSET(mp) \ 510#define XFS_DIR3_DATA_DOT_OFFSET(mp) \
380 xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb)) 511 xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb))
381#define XFS_DIR3_DATA_DOTDOT_OFFSET(mp) \ 512#define XFS_DIR3_DATA_DOTDOT_OFFSET(mp) \
382 (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir2_data_entsize(1)) 513 (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 1))
383#define XFS_DIR3_DATA_FIRST_OFFSET(mp) \ 514#define XFS_DIR3_DATA_FIRST_OFFSET(mp) \
384 (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir2_data_entsize(2)) 515 (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 2))
385 516
386static inline xfs_dir2_data_aoff_t 517static inline xfs_dir2_data_aoff_t
387xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr) 518xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr)
@@ -392,13 +523,19 @@ xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr)
392static inline xfs_dir2_data_aoff_t 523static inline xfs_dir2_data_aoff_t
393xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr) 524xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr)
394{ 525{
395 return xfs_dir3_data_dot_offset(hdr) + xfs_dir2_data_entsize(1); 526 bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
527 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
528 return xfs_dir3_data_dot_offset(hdr) +
529 __xfs_dir3_data_entsize(dir3, 1);
396} 530}
397 531
398static inline xfs_dir2_data_aoff_t 532static inline xfs_dir2_data_aoff_t
399xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr) 533xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr)
400{ 534{
401 return xfs_dir3_data_dotdot_offset(hdr) + xfs_dir2_data_entsize(2); 535 bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
536 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
537 return xfs_dir3_data_dotdot_offset(hdr) +
538 __xfs_dir3_data_entsize(dir3, 2);
402} 539}
403 540
404/* 541/*
@@ -519,6 +656,9 @@ struct xfs_dir3_leaf {
519 656
520#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc) 657#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc)
521 658
659extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to,
660 struct xfs_dir2_leaf *from);
661
522static inline int 662static inline int
523xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp) 663xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp)
524{ 664{
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 2aed25cae04d..08984eeee159 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -31,6 +31,7 @@
31#include "xfs_inode.h" 31#include "xfs_inode.h"
32#include "xfs_bmap.h" 32#include "xfs_bmap.h"
33#include "xfs_dir2_format.h" 33#include "xfs_dir2_format.h"
34#include "xfs_dir2.h"
34#include "xfs_dir2_priv.h" 35#include "xfs_dir2_priv.h"
35#include "xfs_error.h" 36#include "xfs_error.h"
36#include "xfs_trace.h" 37#include "xfs_trace.h"
@@ -695,7 +696,7 @@ xfs_dir2_leaf_addname(
695 ents = xfs_dir3_leaf_ents_p(leaf); 696 ents = xfs_dir3_leaf_ents_p(leaf);
696 xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); 697 xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
697 bestsp = xfs_dir2_leaf_bests_p(ltp); 698 bestsp = xfs_dir2_leaf_bests_p(ltp);
698 length = xfs_dir2_data_entsize(args->namelen); 699 length = xfs_dir3_data_entsize(mp, args->namelen);
699 700
700 /* 701 /*
701 * See if there are any entries with the same hash value 702 * See if there are any entries with the same hash value
@@ -896,7 +897,8 @@ xfs_dir2_leaf_addname(
896 dep->inumber = cpu_to_be64(args->inumber); 897 dep->inumber = cpu_to_be64(args->inumber);
897 dep->namelen = args->namelen; 898 dep->namelen = args->namelen;
898 memcpy(dep->name, args->name, dep->namelen); 899 memcpy(dep->name, args->name, dep->namelen);
899 tagp = xfs_dir2_data_entry_tag_p(dep); 900 xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
901 tagp = xfs_dir3_data_entry_tag_p(mp, dep);
900 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 902 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
901 /* 903 /*
902 * Need to scan fix up the bestfree table. 904 * Need to scan fix up the bestfree table.
@@ -1083,396 +1085,6 @@ xfs_dir3_leaf_compact_x1(
1083 *highstalep = highstale; 1085 *highstalep = highstale;
1084} 1086}
1085 1087
1086struct xfs_dir2_leaf_map_info {
1087 xfs_extlen_t map_blocks; /* number of fsbs in map */
1088 xfs_dablk_t map_off; /* last mapped file offset */
1089 int map_size; /* total entries in *map */
1090 int map_valid; /* valid entries in *map */
1091 int nmap; /* mappings to ask xfs_bmapi */
1092 xfs_dir2_db_t curdb; /* db for current block */
1093 int ra_current; /* number of read-ahead blks */
1094 int ra_index; /* *map index for read-ahead */
1095 int ra_offset; /* map entry offset for ra */
1096 int ra_want; /* readahead count wanted */
1097 struct xfs_bmbt_irec map[]; /* map vector for blocks */
1098};
1099
1100STATIC int
1101xfs_dir2_leaf_readbuf(
1102 struct xfs_inode *dp,
1103 size_t bufsize,
1104 struct xfs_dir2_leaf_map_info *mip,
1105 xfs_dir2_off_t *curoff,
1106 struct xfs_buf **bpp)
1107{
1108 struct xfs_mount *mp = dp->i_mount;
1109 struct xfs_buf *bp = *bpp;
1110 struct xfs_bmbt_irec *map = mip->map;
1111 struct blk_plug plug;
1112 int error = 0;
1113 int length;
1114 int i;
1115 int j;
1116
1117 /*
1118 * If we have a buffer, we need to release it and
1119 * take it out of the mapping.
1120 */
1121
1122 if (bp) {
1123 xfs_trans_brelse(NULL, bp);
1124 bp = NULL;
1125 mip->map_blocks -= mp->m_dirblkfsbs;
1126 /*
1127 * Loop to get rid of the extents for the
1128 * directory block.
1129 */
1130 for (i = mp->m_dirblkfsbs; i > 0; ) {
1131 j = min_t(int, map->br_blockcount, i);
1132 map->br_blockcount -= j;
1133 map->br_startblock += j;
1134 map->br_startoff += j;
1135 /*
1136 * If mapping is done, pitch it from
1137 * the table.
1138 */
1139 if (!map->br_blockcount && --mip->map_valid)
1140 memmove(&map[0], &map[1],
1141 sizeof(map[0]) * mip->map_valid);
1142 i -= j;
1143 }
1144 }
1145
1146 /*
1147 * Recalculate the readahead blocks wanted.
1148 */
1149 mip->ra_want = howmany(bufsize + mp->m_dirblksize,
1150 mp->m_sb.sb_blocksize) - 1;
1151 ASSERT(mip->ra_want >= 0);
1152
1153 /*
1154 * If we don't have as many as we want, and we haven't
1155 * run out of data blocks, get some more mappings.
1156 */
1157 if (1 + mip->ra_want > mip->map_blocks &&
1158 mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
1159 /*
1160 * Get more bmaps, fill in after the ones
1161 * we already have in the table.
1162 */
1163 mip->nmap = mip->map_size - mip->map_valid;
1164 error = xfs_bmapi_read(dp, mip->map_off,
1165 xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) -
1166 mip->map_off,
1167 &map[mip->map_valid], &mip->nmap, 0);
1168
1169 /*
1170 * Don't know if we should ignore this or try to return an
1171 * error. The trouble with returning errors is that readdir
1172 * will just stop without actually passing the error through.
1173 */
1174 if (error)
1175 goto out; /* XXX */
1176
1177 /*
1178 * If we got all the mappings we asked for, set the final map
1179 * offset based on the last bmap value received. Otherwise,
1180 * we've reached the end.
1181 */
1182 if (mip->nmap == mip->map_size - mip->map_valid) {
1183 i = mip->map_valid + mip->nmap - 1;
1184 mip->map_off = map[i].br_startoff + map[i].br_blockcount;
1185 } else
1186 mip->map_off = xfs_dir2_byte_to_da(mp,
1187 XFS_DIR2_LEAF_OFFSET);
1188
1189 /*
1190 * Look for holes in the mapping, and eliminate them. Count up
1191 * the valid blocks.
1192 */
1193 for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
1194 if (map[i].br_startblock == HOLESTARTBLOCK) {
1195 mip->nmap--;
1196 length = mip->map_valid + mip->nmap - i;
1197 if (length)
1198 memmove(&map[i], &map[i + 1],
1199 sizeof(map[i]) * length);
1200 } else {
1201 mip->map_blocks += map[i].br_blockcount;
1202 i++;
1203 }
1204 }
1205 mip->map_valid += mip->nmap;
1206 }
1207
1208 /*
1209 * No valid mappings, so no more data blocks.
1210 */
1211 if (!mip->map_valid) {
1212 *curoff = xfs_dir2_da_to_byte(mp, mip->map_off);
1213 goto out;
1214 }
1215
1216 /*
1217 * Read the directory block starting at the first mapping.
1218 */
1219 mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
1220 error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
1221 map->br_blockcount >= mp->m_dirblkfsbs ?
1222 XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
1223
1224 /*
1225 * Should just skip over the data block instead of giving up.
1226 */
1227 if (error)
1228 goto out; /* XXX */
1229
1230 /*
1231 * Adjust the current amount of read-ahead: we just read a block that
1232 * was previously ra.
1233 */
1234 if (mip->ra_current)
1235 mip->ra_current -= mp->m_dirblkfsbs;
1236
1237 /*
1238 * Do we need more readahead?
1239 */
1240 blk_start_plug(&plug);
1241 for (mip->ra_index = mip->ra_offset = i = 0;
1242 mip->ra_want > mip->ra_current && i < mip->map_blocks;
1243 i += mp->m_dirblkfsbs) {
1244 ASSERT(mip->ra_index < mip->map_valid);
1245 /*
1246 * Read-ahead a contiguous directory block.
1247 */
1248 if (i > mip->ra_current &&
1249 map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
1250 xfs_dir3_data_readahead(NULL, dp,
1251 map[mip->ra_index].br_startoff + mip->ra_offset,
1252 XFS_FSB_TO_DADDR(mp,
1253 map[mip->ra_index].br_startblock +
1254 mip->ra_offset));
1255 mip->ra_current = i;
1256 }
1257
1258 /*
1259 * Read-ahead a non-contiguous directory block. This doesn't
1260 * use our mapping, but this is a very rare case.
1261 */
1262 else if (i > mip->ra_current) {
1263 xfs_dir3_data_readahead(NULL, dp,
1264 map[mip->ra_index].br_startoff +
1265 mip->ra_offset, -1);
1266 mip->ra_current = i;
1267 }
1268
1269 /*
1270 * Advance offset through the mapping table.
1271 */
1272 for (j = 0; j < mp->m_dirblkfsbs; j++) {
1273 /*
1274 * The rest of this extent but not more than a dir
1275 * block.
1276 */
1277 length = min_t(int, mp->m_dirblkfsbs,
1278 map[mip->ra_index].br_blockcount -
1279 mip->ra_offset);
1280 j += length;
1281 mip->ra_offset += length;
1282
1283 /*
1284 * Advance to the next mapping if this one is used up.
1285 */
1286 if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
1287 mip->ra_offset = 0;
1288 mip->ra_index++;
1289 }
1290 }
1291 }
1292 blk_finish_plug(&plug);
1293
1294out:
1295 *bpp = bp;
1296 return error;
1297}
1298
1299/*
1300 * Getdents (readdir) for leaf and node directories.
1301 * This reads the data blocks only, so is the same for both forms.
1302 */
1303int /* error */
1304xfs_dir2_leaf_getdents(
1305 xfs_inode_t *dp, /* incore directory inode */
1306 struct dir_context *ctx,
1307 size_t bufsize)
1308{
1309 struct xfs_buf *bp = NULL; /* data block buffer */
1310 xfs_dir2_data_hdr_t *hdr; /* data block header */
1311 xfs_dir2_data_entry_t *dep; /* data entry */
1312 xfs_dir2_data_unused_t *dup; /* unused entry */
1313 int error = 0; /* error return value */
1314 int length; /* temporary length value */
1315 xfs_mount_t *mp; /* filesystem mount point */
1316 int byteoff; /* offset in current block */
1317 xfs_dir2_off_t curoff; /* current overall offset */
1318 xfs_dir2_off_t newoff; /* new curoff after new blk */
1319 char *ptr = NULL; /* pointer to current data */
1320 struct xfs_dir2_leaf_map_info *map_info;
1321
1322 /*
1323 * If the offset is at or past the largest allowed value,
1324 * give up right away.
1325 */
1326 if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
1327 return 0;
1328
1329 mp = dp->i_mount;
1330
1331 /*
1332 * Set up to bmap a number of blocks based on the caller's
1333 * buffer size, the directory block size, and the filesystem
1334 * block size.
1335 */
1336 length = howmany(bufsize + mp->m_dirblksize,
1337 mp->m_sb.sb_blocksize);
1338 map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
1339 (length * sizeof(struct xfs_bmbt_irec)),
1340 KM_SLEEP | KM_NOFS);
1341 map_info->map_size = length;
1342
1343 /*
1344 * Inside the loop we keep the main offset value as a byte offset
1345 * in the directory file.
1346 */
1347 curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
1348
1349 /*
1350 * Force this conversion through db so we truncate the offset
1351 * down to get the start of the data block.
1352 */
1353 map_info->map_off = xfs_dir2_db_to_da(mp,
1354 xfs_dir2_byte_to_db(mp, curoff));
1355
1356 /*
1357 * Loop over directory entries until we reach the end offset.
1358 * Get more blocks and readahead as necessary.
1359 */
1360 while (curoff < XFS_DIR2_LEAF_OFFSET) {
1361 /*
1362 * If we have no buffer, or we're off the end of the
1363 * current buffer, need to get another one.
1364 */
1365 if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) {
1366
1367 error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info,
1368 &curoff, &bp);
1369 if (error || !map_info->map_valid)
1370 break;
1371
1372 /*
1373 * Having done a read, we need to set a new offset.
1374 */
1375 newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0);
1376 /*
1377 * Start of the current block.
1378 */
1379 if (curoff < newoff)
1380 curoff = newoff;
1381 /*
1382 * Make sure we're in the right block.
1383 */
1384 else if (curoff > newoff)
1385 ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
1386 map_info->curdb);
1387 hdr = bp->b_addr;
1388 xfs_dir3_data_check(dp, bp);
1389 /*
1390 * Find our position in the block.
1391 */
1392 ptr = (char *)xfs_dir3_data_entry_p(hdr);
1393 byteoff = xfs_dir2_byte_to_off(mp, curoff);
1394 /*
1395 * Skip past the header.
1396 */
1397 if (byteoff == 0)
1398 curoff += xfs_dir3_data_entry_offset(hdr);
1399 /*
1400 * Skip past entries until we reach our offset.
1401 */
1402 else {
1403 while ((char *)ptr - (char *)hdr < byteoff) {
1404 dup = (xfs_dir2_data_unused_t *)ptr;
1405
1406 if (be16_to_cpu(dup->freetag)
1407 == XFS_DIR2_DATA_FREE_TAG) {
1408
1409 length = be16_to_cpu(dup->length);
1410 ptr += length;
1411 continue;
1412 }
1413 dep = (xfs_dir2_data_entry_t *)ptr;
1414 length =
1415 xfs_dir2_data_entsize(dep->namelen);
1416 ptr += length;
1417 }
1418 /*
1419 * Now set our real offset.
1420 */
1421 curoff =
1422 xfs_dir2_db_off_to_byte(mp,
1423 xfs_dir2_byte_to_db(mp, curoff),
1424 (char *)ptr - (char *)hdr);
1425 if (ptr >= (char *)hdr + mp->m_dirblksize) {
1426 continue;
1427 }
1428 }
1429 }
1430 /*
1431 * We have a pointer to an entry.
1432 * Is it a live one?
1433 */
1434 dup = (xfs_dir2_data_unused_t *)ptr;
1435 /*
1436 * No, it's unused, skip over it.
1437 */
1438 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
1439 length = be16_to_cpu(dup->length);
1440 ptr += length;
1441 curoff += length;
1442 continue;
1443 }
1444
1445 dep = (xfs_dir2_data_entry_t *)ptr;
1446 length = xfs_dir2_data_entsize(dep->namelen);
1447
1448 ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
1449 if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
1450 be64_to_cpu(dep->inumber), DT_UNKNOWN))
1451 break;
1452
1453 /*
1454 * Advance to next entry in the block.
1455 */
1456 ptr += length;
1457 curoff += length;
1458 /* bufsize may have just been a guess; don't go negative */
1459 bufsize = bufsize > length ? bufsize - length : 0;
1460 }
1461
1462 /*
1463 * All done. Set output offset value to current offset.
1464 */
1465 if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
1466 ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
1467 else
1468 ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
1469 kmem_free(map_info);
1470 if (bp)
1471 xfs_trans_brelse(NULL, bp);
1472 return error;
1473}
1474
1475
1476/* 1088/*
1477 * Log the bests entries indicated from a leaf1 block. 1089 * Log the bests entries indicated from a leaf1 block.
1478 */ 1090 */
@@ -1614,6 +1226,7 @@ xfs_dir2_leaf_lookup(
1614 * Return the found inode number & CI name if appropriate 1226 * Return the found inode number & CI name if appropriate
1615 */ 1227 */
1616 args->inumber = be64_to_cpu(dep->inumber); 1228 args->inumber = be64_to_cpu(dep->inumber);
1229 args->filetype = xfs_dir3_dirent_get_ftype(dp->i_mount, dep);
1617 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); 1230 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
1618 xfs_trans_brelse(tp, dbp); 1231 xfs_trans_brelse(tp, dbp);
1619 xfs_trans_brelse(tp, lbp); 1232 xfs_trans_brelse(tp, lbp);
@@ -1816,7 +1429,7 @@ xfs_dir2_leaf_removename(
1816 */ 1429 */
1817 xfs_dir2_data_make_free(tp, dbp, 1430 xfs_dir2_data_make_free(tp, dbp,
1818 (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), 1431 (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
1819 xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); 1432 xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
1820 /* 1433 /*
1821 * We just mark the leaf entry stale by putting a null in it. 1434 * We just mark the leaf entry stale by putting a null in it.
1822 */ 1435 */
@@ -1944,6 +1557,7 @@ xfs_dir2_leaf_replace(
1944 * Put the new inode number in, log it. 1557 * Put the new inode number in, log it.
1945 */ 1558 */
1946 dep->inumber = cpu_to_be64(args->inumber); 1559 dep->inumber = cpu_to_be64(args->inumber);
1560 xfs_dir3_dirent_put_ftype(dp->i_mount, dep, args->filetype);
1947 tp = args->trans; 1561 tp = args->trans;
1948 xfs_dir2_data_log_entry(tp, dbp, dep); 1562 xfs_dir2_data_log_entry(tp, dbp, dep);
1949 xfs_dir3_leaf_check(dp->i_mount, lbp); 1563 xfs_dir3_leaf_check(dp->i_mount, lbp);
@@ -1975,10 +1589,6 @@ xfs_dir2_leaf_search_hash(
1975 ents = xfs_dir3_leaf_ents_p(leaf); 1589 ents = xfs_dir3_leaf_ents_p(leaf);
1976 xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf); 1590 xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
1977 1591
1978#ifndef __KERNEL__
1979 if (!leafhdr.count)
1980 return 0;
1981#endif
1982 /* 1592 /*
1983 * Note, the table cannot be empty, so we have to go through the loop. 1593 * Note, the table cannot be empty, so we have to go through the loop.
1984 * Binary search the leaf entries looking for our hash value. 1594 * Binary search the leaf entries looking for our hash value.
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 2226a00acd15..4c3dba7ffb74 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -30,6 +30,7 @@
30#include "xfs_inode.h" 30#include "xfs_inode.h"
31#include "xfs_bmap.h" 31#include "xfs_bmap.h"
32#include "xfs_dir2_format.h" 32#include "xfs_dir2_format.h"
33#include "xfs_dir2.h"
33#include "xfs_dir2_priv.h" 34#include "xfs_dir2_priv.h"
34#include "xfs_error.h" 35#include "xfs_error.h"
35#include "xfs_trace.h" 36#include "xfs_trace.h"
@@ -312,11 +313,13 @@ xfs_dir2_free_log_header(
312 struct xfs_trans *tp, 313 struct xfs_trans *tp,
313 struct xfs_buf *bp) 314 struct xfs_buf *bp)
314{ 315{
316#ifdef DEBUG
315 xfs_dir2_free_t *free; /* freespace structure */ 317 xfs_dir2_free_t *free; /* freespace structure */
316 318
317 free = bp->b_addr; 319 free = bp->b_addr;
318 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || 320 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
319 free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); 321 free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
322#endif
320 xfs_trans_log_buf(tp, bp, 0, xfs_dir3_free_hdr_size(tp->t_mountp) - 1); 323 xfs_trans_log_buf(tp, bp, 0, xfs_dir3_free_hdr_size(tp->t_mountp) - 1);
321} 324}
322 325
@@ -602,7 +605,7 @@ xfs_dir2_leafn_lookup_for_addname(
602 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || 605 ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
603 free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); 606 free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
604 } 607 }
605 length = xfs_dir2_data_entsize(args->namelen); 608 length = xfs_dir3_data_entsize(mp, args->namelen);
606 /* 609 /*
607 * Loop over leaf entries with the right hash value. 610 * Loop over leaf entries with the right hash value.
608 */ 611 */
@@ -813,6 +816,7 @@ xfs_dir2_leafn_lookup_for_entry(
813 xfs_trans_brelse(tp, state->extrablk.bp); 816 xfs_trans_brelse(tp, state->extrablk.bp);
814 args->cmpresult = cmp; 817 args->cmpresult = cmp;
815 args->inumber = be64_to_cpu(dep->inumber); 818 args->inumber = be64_to_cpu(dep->inumber);
819 args->filetype = xfs_dir3_dirent_get_ftype(mp, dep);
816 *indexp = index; 820 *indexp = index;
817 state->extravalid = 1; 821 state->extravalid = 1;
818 state->extrablk.bp = curbp; 822 state->extrablk.bp = curbp;
@@ -1256,7 +1260,7 @@ xfs_dir2_leafn_remove(
1256 longest = be16_to_cpu(bf[0].length); 1260 longest = be16_to_cpu(bf[0].length);
1257 needlog = needscan = 0; 1261 needlog = needscan = 0;
1258 xfs_dir2_data_make_free(tp, dbp, off, 1262 xfs_dir2_data_make_free(tp, dbp, off,
1259 xfs_dir2_data_entsize(dep->namelen), &needlog, &needscan); 1263 xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
1260 /* 1264 /*
1261 * Rescan the data block freespaces for bestfree. 1265 * Rescan the data block freespaces for bestfree.
1262 * Log the data block header if needed. 1266 * Log the data block header if needed.
@@ -1708,7 +1712,7 @@ xfs_dir2_node_addname_int(
1708 dp = args->dp; 1712 dp = args->dp;
1709 mp = dp->i_mount; 1713 mp = dp->i_mount;
1710 tp = args->trans; 1714 tp = args->trans;
1711 length = xfs_dir2_data_entsize(args->namelen); 1715 length = xfs_dir3_data_entsize(mp, args->namelen);
1712 /* 1716 /*
1713 * If we came in with a freespace block that means that lookup 1717 * If we came in with a freespace block that means that lookup
1714 * found an entry with our hash value. This is the freespace 1718 * found an entry with our hash value. This is the freespace
@@ -2004,7 +2008,8 @@ xfs_dir2_node_addname_int(
2004 dep->inumber = cpu_to_be64(args->inumber); 2008 dep->inumber = cpu_to_be64(args->inumber);
2005 dep->namelen = args->namelen; 2009 dep->namelen = args->namelen;
2006 memcpy(dep->name, args->name, dep->namelen); 2010 memcpy(dep->name, args->name, dep->namelen);
2007 tagp = xfs_dir2_data_entry_tag_p(dep); 2011 xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
2012 tagp = xfs_dir3_data_entry_tag_p(mp, dep);
2008 *tagp = cpu_to_be16((char *)dep - (char *)hdr); 2013 *tagp = cpu_to_be16((char *)dep - (char *)hdr);
2009 xfs_dir2_data_log_entry(tp, dbp, dep); 2014 xfs_dir2_data_log_entry(tp, dbp, dep);
2010 /* 2015 /*
@@ -2224,6 +2229,7 @@ xfs_dir2_node_replace(
2224 * Fill in the new inode number and log the entry. 2229 * Fill in the new inode number and log the entry.
2225 */ 2230 */
2226 dep->inumber = cpu_to_be64(inum); 2231 dep->inumber = cpu_to_be64(inum);
2232 xfs_dir3_dirent_put_ftype(state->mp, dep, args->filetype);
2227 xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep); 2233 xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
2228 rval = 0; 2234 rval = 0;
2229 } 2235 }
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 0511cda4a712..1bad84c40829 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -18,23 +18,26 @@
18#ifndef __XFS_DIR2_PRIV_H__ 18#ifndef __XFS_DIR2_PRIV_H__
19#define __XFS_DIR2_PRIV_H__ 19#define __XFS_DIR2_PRIV_H__
20 20
21struct dir_context;
22
21/* xfs_dir2.c */ 23/* xfs_dir2.c */
22extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); 24extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
23extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
24extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
25extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, 25extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
26 xfs_dir2_db_t *dbp); 26 xfs_dir2_db_t *dbp);
27extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
28 struct xfs_buf *bp);
29extern int xfs_dir_cilookup_result(struct xfs_da_args *args, 27extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
30 const unsigned char *name, int len); 28 const unsigned char *name, int len);
31 29
32/* xfs_dir2_block.c */ 30#define S_SHIFT 12
33extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; 31extern const unsigned char xfs_mode_to_ftype[];
32
33extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp,
34 __uint8_t filetype);
34 35
36
37/* xfs_dir2_block.c */
38extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp,
39 struct xfs_buf **bpp);
35extern int xfs_dir2_block_addname(struct xfs_da_args *args); 40extern int xfs_dir2_block_addname(struct xfs_da_args *args);
36extern int xfs_dir2_block_getdents(struct xfs_inode *dp,
37 struct dir_context *ctx);
38extern int xfs_dir2_block_lookup(struct xfs_da_args *args); 41extern int xfs_dir2_block_lookup(struct xfs_da_args *args);
39extern int xfs_dir2_block_removename(struct xfs_da_args *args); 42extern int xfs_dir2_block_removename(struct xfs_da_args *args);
40extern int xfs_dir2_block_replace(struct xfs_da_args *args); 43extern int xfs_dir2_block_replace(struct xfs_da_args *args);
@@ -48,9 +51,6 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
48#define xfs_dir3_data_check(dp,bp) 51#define xfs_dir3_data_check(dp,bp)
49#endif 52#endif
50 53
51extern const struct xfs_buf_ops xfs_dir3_data_buf_ops;
52extern const struct xfs_buf_ops xfs_dir3_free_buf_ops;
53
54extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); 54extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
55extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, 55extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
56 xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); 56 xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
@@ -60,27 +60,10 @@ extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
60extern struct xfs_dir2_data_free * 60extern struct xfs_dir2_data_free *
61xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, 61xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
62 struct xfs_dir2_data_unused *dup, int *loghead); 62 struct xfs_dir2_data_unused *dup, int *loghead);
63extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
64 struct xfs_dir2_data_hdr *hdr, int *loghead);
65extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, 63extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
66 struct xfs_buf **bpp); 64 struct xfs_buf **bpp);
67extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp,
68 struct xfs_dir2_data_entry *dep);
69extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
70 struct xfs_buf *bp);
71extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp,
72 struct xfs_dir2_data_unused *dup);
73extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp,
74 xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
75 int *needlogp, int *needscanp);
76extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
77 struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset,
78 xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
79 65
80/* xfs_dir2_leaf.c */ 66/* xfs_dir2_leaf.c */
81extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops;
82extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
83
84extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, 67extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
85 xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); 68 xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
86extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, 69extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
@@ -91,8 +74,6 @@ extern void xfs_dir3_leaf_compact(struct xfs_da_args *args,
91extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, 74extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
92 struct xfs_dir2_leaf_entry *ents, int *indexp, 75 struct xfs_dir2_leaf_entry *ents, int *indexp,
93 int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); 76 int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
94extern int xfs_dir2_leaf_getdents(struct xfs_inode *dp, struct dir_context *ctx,
95 size_t bufsize);
96extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, 77extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
97 struct xfs_buf **bpp, __uint16_t magic); 78 struct xfs_buf **bpp, __uint16_t magic);
98extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp, 79extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
@@ -144,18 +125,18 @@ extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
144 xfs_dablk_t fbno, struct xfs_buf **bpp); 125 xfs_dablk_t fbno, struct xfs_buf **bpp);
145 126
146/* xfs_dir2_sf.c */ 127/* xfs_dir2_sf.c */
147extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
148extern xfs_ino_t xfs_dir2_sfe_get_ino(struct xfs_dir2_sf_hdr *sfp,
149 struct xfs_dir2_sf_entry *sfep);
150extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, 128extern int xfs_dir2_block_sfsize(struct xfs_inode *dp,
151 struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp); 129 struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp);
152extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp, 130extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp,
153 int size, xfs_dir2_sf_hdr_t *sfhp); 131 int size, xfs_dir2_sf_hdr_t *sfhp);
154extern int xfs_dir2_sf_addname(struct xfs_da_args *args); 132extern int xfs_dir2_sf_addname(struct xfs_da_args *args);
155extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); 133extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
156extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct dir_context *ctx);
157extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); 134extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
158extern int xfs_dir2_sf_removename(struct xfs_da_args *args); 135extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
159extern int xfs_dir2_sf_replace(struct xfs_da_args *args); 136extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
160 137
138/* xfs_dir2_readdir.c */
139extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
140 size_t bufsize);
141
161#endif /* __XFS_DIR2_PRIV_H__ */ 142#endif /* __XFS_DIR2_PRIV_H__ */
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
new file mode 100644
index 000000000000..8993ec17452c
--- /dev/null
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -0,0 +1,695 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * Copyright (c) 2013 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_types.h"
22#include "xfs_bit.h"
23#include "xfs_log.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_da_btree.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_dinode.h"
31#include "xfs_inode.h"
32#include "xfs_dir2_format.h"
33#include "xfs_dir2.h"
34#include "xfs_dir2_priv.h"
35#include "xfs_error.h"
36#include "xfs_trace.h"
37#include "xfs_bmap.h"
38
39/*
40 * Directory file type support functions
41 */
42static unsigned char xfs_dir3_filetype_table[] = {
43 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK,
44 DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
45};
46
47unsigned char
48xfs_dir3_get_dtype(
49 struct xfs_mount *mp,
50 __uint8_t filetype)
51{
52 if (!xfs_sb_version_hasftype(&mp->m_sb))
53 return DT_UNKNOWN;
54
55 if (filetype >= XFS_DIR3_FT_MAX)
56 return DT_UNKNOWN;
57
58 return xfs_dir3_filetype_table[filetype];
59}
60/*
61 * @mode, if set, indicates that the type field needs to be set up.
62 * This uses the transformation from file mode to DT_* as defined in linux/fs.h
63 * for file type specification. This will be propagated into the directory
64 * structure if appropriate for the given operation and filesystem config.
65 */
66const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
67 [0] = XFS_DIR3_FT_UNKNOWN,
68 [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE,
69 [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR,
70 [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV,
71 [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV,
72 [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO,
73 [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK,
74 [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK,
75};
76
/*
 * Readdir for short-form (inline) directories.  The whole directory lives
 * in the inode data fork, so no disk buffers are read; "." and ".." are
 * synthesized from the inode and the shortform parent pointer, then the
 * packed shortform entries are walked in order.  ctx->pos is encoded as a
 * dataptr cookie within virtual data block mp->m_dirdatablk.
 */
STATIC int
xfs_dir2_sf_getdents(
	xfs_inode_t		*dp,		/* incore directory inode */
	struct dir_context	*ctx)
{
	int			i;		/* shortform entry number */
	xfs_mount_t		*mp;		/* filesystem mount point */
	xfs_dir2_dataptr_t	off;		/* current entry's offset */
	xfs_dir2_sf_entry_t	*sfep;		/* shortform directory entry */
	xfs_dir2_sf_hdr_t	*sfp;		/* shortform structure */
	xfs_dir2_dataptr_t	dot_offset;	/* readdir cookie for "." */
	xfs_dir2_dataptr_t	dotdot_offset;	/* readdir cookie for ".." */
	xfs_ino_t		ino;

	mp = dp->i_mount;

	ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
	/*
	 * Give up if the directory is way too short.
	 */
	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
		ASSERT(XFS_FORCED_SHUTDOWN(mp));
		return XFS_ERROR(EIO);
	}

	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
	ASSERT(dp->i_df.if_u1.if_data != NULL);

	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;

	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));

	/*
	 * If the block number in the offset is out of range, we're done.
	 * All shortform cookies are built on block mp->m_dirdatablk.
	 */
	if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
		return 0;

	/*
	 * Precalculate offsets for . and .. as we will always need them.
	 *
	 * XXX(hch): the second argument is sometimes 0 and sometimes
	 * mp->m_dirdatablk.
	 */
	dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
						XFS_DIR3_DATA_DOT_OFFSET(mp));
	dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
						XFS_DIR3_DATA_DOTDOT_OFFSET(mp));

	/*
	 * Put . entry unless we're starting past it.  Positions are masked
	 * to 31 bits before being stored in ctx->pos.
	 */
	if (ctx->pos <= dot_offset) {
		ctx->pos = dot_offset & 0x7fffffff;
		if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
			return 0;
	}

	/*
	 * Put .. entry unless we're starting past it.
	 */
	if (ctx->pos <= dotdot_offset) {
		ino = xfs_dir2_sf_get_parent_ino(sfp);
		ctx->pos = dotdot_offset & 0x7fffffff;
		if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
			return 0;
	}

	/*
	 * Loop while there are more entries and put'ing works.
	 */
	sfep = xfs_dir2_sf_firstentry(sfp);
	for (i = 0; i < sfp->count; i++) {
		__uint8_t filetype;

		off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
				xfs_dir2_sf_get_offset(sfep));

		/* Skip entries before the caller's resume position. */
		if (ctx->pos > off) {
			sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
			continue;
		}

		ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep);
		filetype = xfs_dir3_sfe_get_ftype(mp, sfp, sfep);
		ctx->pos = off & 0x7fffffff;
		if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
			    xfs_dir3_get_dtype(mp, filetype)))
			return 0;
		sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
	}

	/* All entries emitted: park the position just past this block. */
	ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
			0x7fffffff;
	return 0;
}
173
/*
 * Readdir for block directories: the directory occupies a single data
 * block (read and verified via xfs_dir3_block_read()), which is walked
 * entry by entry.  ctx->pos is a dataptr cookie inside that block, used
 * both to skip already-returned entries and to record the resume point.
 */
STATIC int
xfs_dir2_block_getdents(
	xfs_inode_t		*dp,	/* incore inode */
	struct dir_context	*ctx)
{
	xfs_dir2_data_hdr_t	*hdr;		/* block header */
	struct xfs_buf		*bp;		/* buffer for block */
	xfs_dir2_block_tail_t	*btp;		/* block tail */
	xfs_dir2_data_entry_t	*dep;		/* block data entry */
	xfs_dir2_data_unused_t	*dup;		/* block unused entry */
	char			*endptr;	/* end of the data entries */
	int			error;		/* error return value */
	xfs_mount_t		*mp;		/* filesystem mount point */
	char			*ptr;		/* current data entry */
	int			wantoff;	/* starting block offset */
	xfs_off_t		cook;		/* readdir cookie for entry */

	mp = dp->i_mount;
	/*
	 * If the block number in the offset is out of range, we're done.
	 */
	if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
		return 0;

	/* Read (and verify) the one and only directory data block. */
	error = xfs_dir3_block_read(NULL, dp, &bp);
	if (error)
		return error;

	/*
	 * Extract the byte offset we start at from the seek pointer.
	 * We'll skip entries before this.
	 */
	wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
	hdr = bp->b_addr;
	xfs_dir3_data_check(dp, bp);
	/*
	 * Set up values for the loop: entries run from the first data
	 * entry up to the leaf array in the block tail.
	 */
	btp = xfs_dir2_block_tail_p(mp, hdr);
	ptr = (char *)xfs_dir3_data_entry_p(hdr);
	endptr = (char *)xfs_dir2_block_leaf_p(btp);

	/*
	 * Loop over the data portion of the block.
	 * Each object is a real entry (dep) or an unused one (dup).
	 */
	while (ptr < endptr) {
		__uint8_t filetype;

		dup = (xfs_dir2_data_unused_t *)ptr;
		/*
		 * Unused, skip it.
		 */
		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
			ptr += be16_to_cpu(dup->length);
			continue;
		}

		dep = (xfs_dir2_data_entry_t *)ptr;

		/*
		 * Bump pointer for the next iteration.
		 */
		ptr += xfs_dir3_data_entsize(mp, dep->namelen);
		/*
		 * The entry is before the desired starting point, skip it.
		 */
		if ((char *)dep - (char *)hdr < wantoff)
			continue;

		cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
					    (char *)dep - (char *)hdr);

		ctx->pos = cook & 0x7fffffff;
		filetype = xfs_dir3_dirent_get_ftype(mp, dep);
		/*
		 * If it didn't fit, set the final offset to here & return.
		 * The buffer must be released before returning.
		 */
		if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
			    be64_to_cpu(dep->inumber),
			    xfs_dir3_get_dtype(mp, filetype))) {
			xfs_trans_brelse(NULL, bp);
			return 0;
		}
	}

	/*
	 * Reached the end of the block.
	 * Set the offset to a non-existent block 1 and return.
	 */
	ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
			0x7fffffff;
	xfs_trans_brelse(NULL, bp);
	return 0;
}
272
/*
 * State carried across calls to xfs_dir2_leaf_readbuf() while walking the
 * data blocks of a leaf/node directory: a table of bmap extents for the
 * data space plus readahead bookkeeping.  Allocated with a variable-size
 * trailing map[] array and freed by xfs_dir2_leaf_getdents().
 */
struct xfs_dir2_leaf_map_info {
	xfs_extlen_t	map_blocks;	/* number of fsbs in map */
	xfs_dablk_t	map_off;	/* last mapped file offset */
	int		map_size;	/* total entries in *map */
	int		map_valid;	/* valid entries in *map */
	int		nmap;		/* mappings to ask xfs_bmapi */
	xfs_dir2_db_t	curdb;		/* db for current block */
	int		ra_current;	/* number of read-ahead blks */
	int		ra_index;	/* *map index for read-ahead */
	int		ra_offset;	/* map entry offset for ra */
	int		ra_want;	/* readahead count wanted */
	struct xfs_bmbt_irec map[];	/* map vector for blocks */
};
286
/*
 * Advance to and read the next directory data block.
 *
 * Releases the previously returned buffer (if any) and trims its extents
 * out of the mapping table, tops the table up via xfs_bmapi_read() when
 * running low, reads the data block at the head of the map, and issues
 * plugged readahead for blocks beyond it.  On success *bpp carries the
 * new buffer; *curoff is only rewritten when the directory has no more
 * mapped data blocks.
 */
STATIC int
xfs_dir2_leaf_readbuf(
	struct xfs_inode	*dp,
	size_t			bufsize,
	struct xfs_dir2_leaf_map_info *mip,
	xfs_dir2_off_t		*curoff,
	struct xfs_buf		**bpp)
{
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_buf		*bp = *bpp;
	struct xfs_bmbt_irec	*map = mip->map;
	struct blk_plug		plug;
	int			error = 0;
	int			length;
	int			i;
	int			j;

	/*
	 * If we have a buffer, we need to release it and
	 * take it out of the mapping.
	 */

	if (bp) {
		xfs_trans_brelse(NULL, bp);
		bp = NULL;
		mip->map_blocks -= mp->m_dirblkfsbs;
		/*
		 * Loop to get rid of the extents for the
		 * directory block.
		 */
		for (i = mp->m_dirblkfsbs; i > 0; ) {
			j = min_t(int, map->br_blockcount, i);
			map->br_blockcount -= j;
			map->br_startblock += j;
			map->br_startoff += j;
			/*
			 * If mapping is done, pitch it from
			 * the table.
			 */
			if (!map->br_blockcount && --mip->map_valid)
				memmove(&map[0], &map[1],
					sizeof(map[0]) * mip->map_valid);
			i -= j;
		}
	}

	/*
	 * Recalculate the readahead blocks wanted.
	 */
	mip->ra_want = howmany(bufsize + mp->m_dirblksize,
			       mp->m_sb.sb_blocksize) - 1;
	ASSERT(mip->ra_want >= 0);

	/*
	 * If we don't have as many as we want, and we haven't
	 * run out of data blocks, get some more mappings.
	 */
	if (1 + mip->ra_want > mip->map_blocks &&
	    mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
		/*
		 * Get more bmaps, fill in after the ones
		 * we already have in the table.
		 */
		mip->nmap = mip->map_size - mip->map_valid;
		error = xfs_bmapi_read(dp, mip->map_off,
				xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) -
								mip->map_off,
				&map[mip->map_valid], &mip->nmap, 0);

		/*
		 * Don't know if we should ignore this or try to return an
		 * error.  The trouble with returning errors is that readdir
		 * will just stop without actually passing the error through.
		 */
		if (error)
			goto out;	/* XXX */

		/*
		 * If we got all the mappings we asked for, set the final map
		 * offset based on the last bmap value received.  Otherwise,
		 * we've reached the end.
		 */
		if (mip->nmap == mip->map_size - mip->map_valid) {
			i = mip->map_valid + mip->nmap - 1;
			mip->map_off = map[i].br_startoff + map[i].br_blockcount;
		} else
			mip->map_off = xfs_dir2_byte_to_da(mp,
							XFS_DIR2_LEAF_OFFSET);

		/*
		 * Look for holes in the mapping, and eliminate them.  Count up
		 * the valid blocks.
		 */
		for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
			if (map[i].br_startblock == HOLESTARTBLOCK) {
				mip->nmap--;
				length = mip->map_valid + mip->nmap - i;
				if (length)
					memmove(&map[i], &map[i + 1],
						sizeof(map[i]) * length);
			} else {
				mip->map_blocks += map[i].br_blockcount;
				i++;
			}
		}
		mip->map_valid += mip->nmap;
	}

	/*
	 * No valid mappings, so no more data blocks.  Report how far we
	 * got back to the caller through *curoff.
	 */
	if (!mip->map_valid) {
		*curoff = xfs_dir2_da_to_byte(mp, mip->map_off);
		goto out;
	}

	/*
	 * Read the directory block starting at the first mapping.
	 * A contiguous mapping lets us pass the disk address directly;
	 * otherwise -1 forces the read path to map it itself.
	 */
	mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
	error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
			map->br_blockcount >= mp->m_dirblkfsbs ?
			    XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);

	/*
	 * Should just skip over the data block instead of giving up.
	 */
	if (error)
		goto out;	/* XXX */

	/*
	 * Adjust the current amount of read-ahead: we just read a block that
	 * was previously ra.
	 */
	if (mip->ra_current)
		mip->ra_current -= mp->m_dirblkfsbs;

	/*
	 * Do we need more readahead?  Plug the queue so the readahead
	 * requests can be merged before submission.
	 */
	blk_start_plug(&plug);
	for (mip->ra_index = mip->ra_offset = i = 0;
	     mip->ra_want > mip->ra_current && i < mip->map_blocks;
	     i += mp->m_dirblkfsbs) {
		ASSERT(mip->ra_index < mip->map_valid);
		/*
		 * Read-ahead a contiguous directory block.
		 */
		if (i > mip->ra_current &&
		    map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
			xfs_dir3_data_readahead(NULL, dp,
				map[mip->ra_index].br_startoff + mip->ra_offset,
				XFS_FSB_TO_DADDR(mp,
					map[mip->ra_index].br_startblock +
							mip->ra_offset));
			mip->ra_current = i;
		}

		/*
		 * Read-ahead a non-contiguous directory block.  This doesn't
		 * use our mapping, but this is a very rare case.
		 */
		else if (i > mip->ra_current) {
			xfs_dir3_data_readahead(NULL, dp,
					map[mip->ra_index].br_startoff +
							mip->ra_offset, -1);
			mip->ra_current = i;
		}

		/*
		 * Advance offset through the mapping table.
		 *
		 * NOTE(review): j advances by length + 1 per pass (the
		 * loop's j++ plus the explicit j += length below), so it
		 * steps one fsb beyond each extent chunk -- verify this
		 * matches the intended per-dir-block advance.
		 */
		for (j = 0; j < mp->m_dirblkfsbs; j++) {
			/*
			 * The rest of this extent but not more than a dir
			 * block.
			 */
			length = min_t(int, mp->m_dirblkfsbs,
					map[mip->ra_index].br_blockcount -
							mip->ra_offset);
			j += length;
			mip->ra_offset += length;

			/*
			 * Advance to the next mapping if this one is used up.
			 */
			if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
				mip->ra_offset = 0;
				mip->ra_index++;
			}
		}
	}
	blk_finish_plug(&plug);

out:
	*bpp = bp;
	return error;
}
485
/*
 * Getdents (readdir) for leaf and node directories.
 * This reads the data blocks only, so is the same for both forms.
 *
 * Walks the directory's data space a block at a time, pulling blocks in
 * via xfs_dir2_leaf_readbuf(), locating the resume position inside each
 * block from the ctx->pos cookie, and emitting live entries until the
 * caller's buffer is full or the data space (XFS_DIR2_LEAF_OFFSET) ends.
 */
STATIC int
xfs_dir2_leaf_getdents(
	xfs_inode_t		*dp,		/* incore directory inode */
	struct dir_context	*ctx,
	size_t			bufsize)
{
	struct xfs_buf		*bp = NULL;	/* data block buffer */
	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
	xfs_dir2_data_entry_t	*dep;		/* data entry */
	xfs_dir2_data_unused_t	*dup;		/* unused entry */
	int			error = 0;	/* error return value */
	int			length;		/* temporary length value */
	xfs_mount_t		*mp;		/* filesystem mount point */
	int			byteoff;	/* offset in current block */
	xfs_dir2_off_t		curoff;		/* current overall offset */
	xfs_dir2_off_t		newoff;		/* new curoff after new blk */
	char			*ptr = NULL;	/* pointer to current data */
	struct xfs_dir2_leaf_map_info *map_info;

	/*
	 * If the offset is at or past the largest allowed value,
	 * give up right away.
	 */
	if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
		return 0;

	mp = dp->i_mount;

	/*
	 * Set up to bmap a number of blocks based on the caller's
	 * buffer size, the directory block size, and the filesystem
	 * block size.
	 */
	length = howmany(bufsize + mp->m_dirblksize,
				     mp->m_sb.sb_blocksize);
	map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
				(length * sizeof(struct xfs_bmbt_irec)),
			       KM_SLEEP | KM_NOFS);
	map_info->map_size = length;

	/*
	 * Inside the loop we keep the main offset value as a byte offset
	 * in the directory file.
	 */
	curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);

	/*
	 * Force this conversion through db so we truncate the offset
	 * down to get the start of the data block.
	 */
	map_info->map_off = xfs_dir2_db_to_da(mp,
					      xfs_dir2_byte_to_db(mp, curoff));

	/*
	 * Loop over directory entries until we reach the end offset.
	 * Get more blocks and readahead as necessary.
	 */
	while (curoff < XFS_DIR2_LEAF_OFFSET) {
		__uint8_t filetype;

		/*
		 * If we have no buffer, or we're off the end of the
		 * current buffer, need to get another one.
		 */
		if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) {

			error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info,
						      &curoff, &bp);
			if (error || !map_info->map_valid)
				break;

			/*
			 * Having done a read, we need to set a new offset.
			 */
			newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0);
			/*
			 * Start of the current block.
			 */
			if (curoff < newoff)
				curoff = newoff;
			/*
			 * Make sure we're in the right block.
			 */
			else if (curoff > newoff)
				ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
				       map_info->curdb);
			hdr = bp->b_addr;
			xfs_dir3_data_check(dp, bp);
			/*
			 * Find our position in the block.
			 */
			ptr = (char *)xfs_dir3_data_entry_p(hdr);
			byteoff = xfs_dir2_byte_to_off(mp, curoff);
			/*
			 * Skip past the header.
			 */
			if (byteoff == 0)
				curoff += xfs_dir3_data_entry_offset(hdr);
			/*
			 * Skip past entries until we reach our offset.
			 */
			else {
				while ((char *)ptr - (char *)hdr < byteoff) {
					dup = (xfs_dir2_data_unused_t *)ptr;

					if (be16_to_cpu(dup->freetag)
						  == XFS_DIR2_DATA_FREE_TAG) {

						length = be16_to_cpu(dup->length);
						ptr += length;
						continue;
					}
					dep = (xfs_dir2_data_entry_t *)ptr;
					length =
					   xfs_dir3_data_entsize(mp, dep->namelen);
					ptr += length;
				}
				/*
				 * Now set our real offset.
				 */
				curoff =
					xfs_dir2_db_off_to_byte(mp,
					    xfs_dir2_byte_to_db(mp, curoff),
					    (char *)ptr - (char *)hdr);
				/* Resume point past the block: fetch next. */
				if (ptr >= (char *)hdr + mp->m_dirblksize) {
					continue;
				}
			}
		}
		/*
		 * We have a pointer to an entry.
		 * Is it a live one?
		 */
		dup = (xfs_dir2_data_unused_t *)ptr;
		/*
		 * No, it's unused, skip over it.
		 */
		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
			length = be16_to_cpu(dup->length);
			ptr += length;
			curoff += length;
			continue;
		}

		dep = (xfs_dir2_data_entry_t *)ptr;
		length = xfs_dir3_data_entsize(mp, dep->namelen);
		filetype = xfs_dir3_dirent_get_ftype(mp, dep);

		ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
		if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
			    be64_to_cpu(dep->inumber),
			    xfs_dir3_get_dtype(mp, filetype)))
			break;

		/*
		 * Advance to next entry in the block.
		 */
		ptr += length;
		curoff += length;
		/* bufsize may have just been a guess; don't go negative */
		bufsize = bufsize > length ? bufsize - length : 0;
	}

	/*
	 * All done.  Set output offset value to current offset.
	 */
	if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
	else
		ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
	kmem_free(map_info);
	if (bp)
		xfs_trans_brelse(NULL, bp);
	return error;
}
665
666/*
667 * Read a directory.
668 */
669int
670xfs_readdir(
671 xfs_inode_t *dp,
672 struct dir_context *ctx,
673 size_t bufsize)
674{
675 int rval; /* return value */
676 int v; /* type-checking value */
677
678 trace_xfs_readdir(dp);
679
680 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
681 return XFS_ERROR(EIO);
682
683 ASSERT(S_ISDIR(dp->i_d.di_mode));
684 XFS_STATS_INC(xs_dir_getdents);
685
686 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
687 rval = xfs_dir2_sf_getdents(dp, ctx);
688 else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
689 ;
690 else if (v)
691 rval = xfs_dir2_block_getdents(dp, ctx);
692 else
693 rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
694 return rval;
695}
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 97676a347da1..bb6e2848f473 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -29,8 +29,8 @@
29#include "xfs_inode.h" 29#include "xfs_inode.h"
30#include "xfs_inode_item.h" 30#include "xfs_inode_item.h"
31#include "xfs_error.h" 31#include "xfs_error.h"
32#include "xfs_dir2.h"
33#include "xfs_dir2_format.h" 32#include "xfs_dir2_format.h"
33#include "xfs_dir2.h"
34#include "xfs_dir2_priv.h" 34#include "xfs_dir2_priv.h"
35#include "xfs_trace.h" 35#include "xfs_trace.h"
36 36
@@ -95,7 +95,7 @@ xfs_dir2_sf_get_parent_ino(
95 return xfs_dir2_sf_get_ino(hdr, &hdr->parent); 95 return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
96} 96}
97 97
98static void 98void
99xfs_dir2_sf_put_parent_ino( 99xfs_dir2_sf_put_parent_ino(
100 struct xfs_dir2_sf_hdr *hdr, 100 struct xfs_dir2_sf_hdr *hdr,
101 xfs_ino_t ino) 101 xfs_ino_t ino)
@@ -105,31 +105,38 @@ xfs_dir2_sf_put_parent_ino(
105 105
106/* 106/*
107 * In short-form directory entries the inode numbers are stored at variable 107 * In short-form directory entries the inode numbers are stored at variable
108 * offset behind the entry name. The inode numbers may only be accessed 108 * offset behind the entry name. If the entry stores a filetype value, then it
109 * through the helpers below. 109 * sits between the name and the inode number. Hence the inode numbers may only
110 * be accessed through the helpers below.
110 */ 111 */
111static xfs_dir2_inou_t * 112static xfs_dir2_inou_t *
112xfs_dir2_sfe_inop( 113xfs_dir3_sfe_inop(
114 struct xfs_mount *mp,
113 struct xfs_dir2_sf_entry *sfep) 115 struct xfs_dir2_sf_entry *sfep)
114{ 116{
115 return (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]; 117 __uint8_t *ptr = &sfep->name[sfep->namelen];
118 if (xfs_sb_version_hasftype(&mp->m_sb))
119 ptr++;
120 return (xfs_dir2_inou_t *)ptr;
116} 121}
117 122
118xfs_ino_t 123xfs_ino_t
119xfs_dir2_sfe_get_ino( 124xfs_dir3_sfe_get_ino(
125 struct xfs_mount *mp,
120 struct xfs_dir2_sf_hdr *hdr, 126 struct xfs_dir2_sf_hdr *hdr,
121 struct xfs_dir2_sf_entry *sfep) 127 struct xfs_dir2_sf_entry *sfep)
122{ 128{
123 return xfs_dir2_sf_get_ino(hdr, xfs_dir2_sfe_inop(sfep)); 129 return xfs_dir2_sf_get_ino(hdr, xfs_dir3_sfe_inop(mp, sfep));
124} 130}
125 131
126static void 132void
127xfs_dir2_sfe_put_ino( 133xfs_dir3_sfe_put_ino(
134 struct xfs_mount *mp,
128 struct xfs_dir2_sf_hdr *hdr, 135 struct xfs_dir2_sf_hdr *hdr,
129 struct xfs_dir2_sf_entry *sfep, 136 struct xfs_dir2_sf_entry *sfep,
130 xfs_ino_t ino) 137 xfs_ino_t ino)
131{ 138{
132 xfs_dir2_sf_put_ino(hdr, xfs_dir2_sfe_inop(sfep), ino); 139 xfs_dir2_sf_put_ino(hdr, xfs_dir3_sfe_inop(mp, sfep), ino);
133} 140}
134 141
135/* 142/*
@@ -157,9 +164,16 @@ xfs_dir2_block_sfsize(
157 int namelen; /* total name bytes */ 164 int namelen; /* total name bytes */
158 xfs_ino_t parent = 0; /* parent inode number */ 165 xfs_ino_t parent = 0; /* parent inode number */
159 int size=0; /* total computed size */ 166 int size=0; /* total computed size */
167 int has_ftype;
160 168
161 mp = dp->i_mount; 169 mp = dp->i_mount;
162 170
171 /*
172 * if there is a filetype field, add the extra byte to the namelen
173 * for each entry that we see.
174 */
175 has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
176
163 count = i8count = namelen = 0; 177 count = i8count = namelen = 0;
164 btp = xfs_dir2_block_tail_p(mp, hdr); 178 btp = xfs_dir2_block_tail_p(mp, hdr);
165 blp = xfs_dir2_block_leaf_p(btp); 179 blp = xfs_dir2_block_leaf_p(btp);
@@ -188,9 +202,10 @@ xfs_dir2_block_sfsize(
188 if (!isdot) 202 if (!isdot)
189 i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM; 203 i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
190#endif 204#endif
205 /* take into account the file type field */
191 if (!isdot && !isdotdot) { 206 if (!isdot && !isdotdot) {
192 count++; 207 count++;
193 namelen += dep->namelen; 208 namelen += dep->namelen + has_ftype;
194 } else if (isdotdot) 209 } else if (isdotdot)
195 parent = be64_to_cpu(dep->inumber); 210 parent = be64_to_cpu(dep->inumber);
196 /* 211 /*
@@ -316,12 +331,14 @@ xfs_dir2_block_to_sf(
316 (xfs_dir2_data_aoff_t) 331 (xfs_dir2_data_aoff_t)
317 ((char *)dep - (char *)hdr)); 332 ((char *)dep - (char *)hdr));
318 memcpy(sfep->name, dep->name, dep->namelen); 333 memcpy(sfep->name, dep->name, dep->namelen);
319 xfs_dir2_sfe_put_ino(sfp, sfep, 334 xfs_dir3_sfe_put_ino(mp, sfp, sfep,
320 be64_to_cpu(dep->inumber)); 335 be64_to_cpu(dep->inumber));
336 xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
337 xfs_dir3_dirent_get_ftype(mp, dep));
321 338
322 sfep = xfs_dir2_sf_nextentry(sfp, sfep); 339 sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
323 } 340 }
324 ptr += xfs_dir2_data_entsize(dep->namelen); 341 ptr += xfs_dir3_data_entsize(mp, dep->namelen);
325 } 342 }
326 ASSERT((char *)sfep - (char *)sfp == size); 343 ASSERT((char *)sfep - (char *)sfp == size);
327 xfs_dir2_sf_check(args); 344 xfs_dir2_sf_check(args);
@@ -372,7 +389,7 @@ xfs_dir2_sf_addname(
372 /* 389 /*
373 * Compute entry (and change in) size. 390 * Compute entry (and change in) size.
374 */ 391 */
375 add_entsize = xfs_dir2_sf_entsize(sfp, args->namelen); 392 add_entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen);
376 incr_isize = add_entsize; 393 incr_isize = add_entsize;
377 objchange = 0; 394 objchange = 0;
378#if XFS_BIG_INUMS 395#if XFS_BIG_INUMS
@@ -466,8 +483,9 @@ xfs_dir2_sf_addname_easy(
466 /* 483 /*
467 * Grow the in-inode space. 484 * Grow the in-inode space.
468 */ 485 */
469 xfs_idata_realloc(dp, xfs_dir2_sf_entsize(sfp, args->namelen), 486 xfs_idata_realloc(dp,
470 XFS_DATA_FORK); 487 xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen),
488 XFS_DATA_FORK);
471 /* 489 /*
472 * Need to set up again due to realloc of the inode data. 490 * Need to set up again due to realloc of the inode data.
473 */ 491 */
@@ -479,7 +497,9 @@ xfs_dir2_sf_addname_easy(
479 sfep->namelen = args->namelen; 497 sfep->namelen = args->namelen;
480 xfs_dir2_sf_put_offset(sfep, offset); 498 xfs_dir2_sf_put_offset(sfep, offset);
481 memcpy(sfep->name, args->name, sfep->namelen); 499 memcpy(sfep->name, args->name, sfep->namelen);
482 xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber); 500 xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep, args->inumber);
501 xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep, args->filetype);
502
483 /* 503 /*
484 * Update the header and inode. 504 * Update the header and inode.
485 */ 505 */
@@ -519,11 +539,13 @@ xfs_dir2_sf_addname_hard(
519 xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */ 539 xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */
520 xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ 540 xfs_dir2_sf_entry_t *sfep; /* entry in new dir */
521 xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */ 541 xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */
542 struct xfs_mount *mp;
522 543
523 /* 544 /*
524 * Copy the old directory to the stack buffer. 545 * Copy the old directory to the stack buffer.
525 */ 546 */
526 dp = args->dp; 547 dp = args->dp;
548 mp = dp->i_mount;
527 549
528 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 550 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
529 old_isize = (int)dp->i_d.di_size; 551 old_isize = (int)dp->i_d.di_size;
@@ -535,13 +557,13 @@ xfs_dir2_sf_addname_hard(
535 * to insert the new entry. 557 * to insert the new entry.
536 * If it's going to end up at the end then oldsfep will point there. 558 * If it's going to end up at the end then oldsfep will point there.
537 */ 559 */
538 for (offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount), 560 for (offset = XFS_DIR3_DATA_FIRST_OFFSET(mp),
539 oldsfep = xfs_dir2_sf_firstentry(oldsfp), 561 oldsfep = xfs_dir2_sf_firstentry(oldsfp),
540 add_datasize = xfs_dir2_data_entsize(args->namelen), 562 add_datasize = xfs_dir3_data_entsize(mp, args->namelen),
541 eof = (char *)oldsfep == &buf[old_isize]; 563 eof = (char *)oldsfep == &buf[old_isize];
542 !eof; 564 !eof;
543 offset = new_offset + xfs_dir2_data_entsize(oldsfep->namelen), 565 offset = new_offset + xfs_dir3_data_entsize(mp, oldsfep->namelen),
544 oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep), 566 oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep),
545 eof = (char *)oldsfep == &buf[old_isize]) { 567 eof = (char *)oldsfep == &buf[old_isize]) {
546 new_offset = xfs_dir2_sf_get_offset(oldsfep); 568 new_offset = xfs_dir2_sf_get_offset(oldsfep);
547 if (offset + add_datasize <= new_offset) 569 if (offset + add_datasize <= new_offset)
@@ -570,7 +592,8 @@ xfs_dir2_sf_addname_hard(
570 sfep->namelen = args->namelen; 592 sfep->namelen = args->namelen;
571 xfs_dir2_sf_put_offset(sfep, offset); 593 xfs_dir2_sf_put_offset(sfep, offset);
572 memcpy(sfep->name, args->name, sfep->namelen); 594 memcpy(sfep->name, args->name, sfep->namelen);
573 xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber); 595 xfs_dir3_sfe_put_ino(mp, sfp, sfep, args->inumber);
596 xfs_dir3_sfe_put_ftype(mp, sfp, sfep, args->filetype);
574 sfp->count++; 597 sfp->count++;
575#if XFS_BIG_INUMS 598#if XFS_BIG_INUMS
576 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) 599 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
@@ -580,7 +603,7 @@ xfs_dir2_sf_addname_hard(
580 * If there's more left to copy, do that. 603 * If there's more left to copy, do that.
581 */ 604 */
582 if (!eof) { 605 if (!eof) {
583 sfep = xfs_dir2_sf_nextentry(sfp, sfep); 606 sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
584 memcpy(sfep, oldsfep, old_isize - nbytes); 607 memcpy(sfep, oldsfep, old_isize - nbytes);
585 } 608 }
586 kmem_free(buf); 609 kmem_free(buf);
@@ -616,7 +639,7 @@ xfs_dir2_sf_addname_pick(
616 mp = dp->i_mount; 639 mp = dp->i_mount;
617 640
618 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 641 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
619 size = xfs_dir2_data_entsize(args->namelen); 642 size = xfs_dir3_data_entsize(mp, args->namelen);
620 offset = XFS_DIR3_DATA_FIRST_OFFSET(mp); 643 offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
621 sfep = xfs_dir2_sf_firstentry(sfp); 644 sfep = xfs_dir2_sf_firstentry(sfp);
622 holefit = 0; 645 holefit = 0;
@@ -629,8 +652,8 @@ xfs_dir2_sf_addname_pick(
629 if (!holefit) 652 if (!holefit)
630 holefit = offset + size <= xfs_dir2_sf_get_offset(sfep); 653 holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
631 offset = xfs_dir2_sf_get_offset(sfep) + 654 offset = xfs_dir2_sf_get_offset(sfep) +
632 xfs_dir2_data_entsize(sfep->namelen); 655 xfs_dir3_data_entsize(mp, sfep->namelen);
633 sfep = xfs_dir2_sf_nextentry(sfp, sfep); 656 sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
634 } 657 }
635 /* 658 /*
636 * Calculate data bytes used excluding the new entry, if this 659 * Calculate data bytes used excluding the new entry, if this
@@ -684,31 +707,34 @@ xfs_dir2_sf_check(
684 int offset; /* data offset */ 707 int offset; /* data offset */
685 xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ 708 xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
686 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ 709 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
710 struct xfs_mount *mp;
687 711
688 dp = args->dp; 712 dp = args->dp;
713 mp = dp->i_mount;
689 714
690 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 715 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
691 offset = XFS_DIR3_DATA_FIRST_OFFSET(dp->i_mount); 716 offset = XFS_DIR3_DATA_FIRST_OFFSET(mp);
692 ino = xfs_dir2_sf_get_parent_ino(sfp); 717 ino = xfs_dir2_sf_get_parent_ino(sfp);
693 i8count = ino > XFS_DIR2_MAX_SHORT_INUM; 718 i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
694 719
695 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); 720 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
696 i < sfp->count; 721 i < sfp->count;
697 i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { 722 i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep)) {
698 ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset); 723 ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
699 ino = xfs_dir2_sfe_get_ino(sfp, sfep); 724 ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep);
700 i8count += ino > XFS_DIR2_MAX_SHORT_INUM; 725 i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
701 offset = 726 offset =
702 xfs_dir2_sf_get_offset(sfep) + 727 xfs_dir2_sf_get_offset(sfep) +
703 xfs_dir2_data_entsize(sfep->namelen); 728 xfs_dir3_data_entsize(mp, sfep->namelen);
729 ASSERT(xfs_dir3_sfe_get_ftype(mp, sfp, sfep) <
730 XFS_DIR3_FT_MAX);
704 } 731 }
705 ASSERT(i8count == sfp->i8count); 732 ASSERT(i8count == sfp->i8count);
706 ASSERT(XFS_BIG_INUMS || i8count == 0); 733 ASSERT(XFS_BIG_INUMS || i8count == 0);
707 ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); 734 ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
708 ASSERT(offset + 735 ASSERT(offset +
709 (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + 736 (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
710 (uint)sizeof(xfs_dir2_block_tail_t) <= 737 (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dirblksize);
711 dp->i_mount->m_dirblksize);
712} 738}
713#endif /* DEBUG */ 739#endif /* DEBUG */
714 740
@@ -765,100 +791,6 @@ xfs_dir2_sf_create(
765 return 0; 791 return 0;
766} 792}
767 793
768int /* error */
769xfs_dir2_sf_getdents(
770 xfs_inode_t *dp, /* incore directory inode */
771 struct dir_context *ctx)
772{
773 int i; /* shortform entry number */
774 xfs_mount_t *mp; /* filesystem mount point */
775 xfs_dir2_dataptr_t off; /* current entry's offset */
776 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
777 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
778 xfs_dir2_dataptr_t dot_offset;
779 xfs_dir2_dataptr_t dotdot_offset;
780 xfs_ino_t ino;
781
782 mp = dp->i_mount;
783
784 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
785 /*
786 * Give up if the directory is way too short.
787 */
788 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
789 ASSERT(XFS_FORCED_SHUTDOWN(mp));
790 return XFS_ERROR(EIO);
791 }
792
793 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
794 ASSERT(dp->i_df.if_u1.if_data != NULL);
795
796 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
797
798 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
799
800 /*
801 * If the block number in the offset is out of range, we're done.
802 */
803 if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
804 return 0;
805
806 /*
807 * Precalculate offsets for . and .. as we will always need them.
808 *
809 * XXX(hch): the second argument is sometimes 0 and sometimes
810 * mp->m_dirdatablk.
811 */
812 dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
813 XFS_DIR3_DATA_DOT_OFFSET(mp));
814 dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
815 XFS_DIR3_DATA_DOTDOT_OFFSET(mp));
816
817 /*
818 * Put . entry unless we're starting past it.
819 */
820 if (ctx->pos <= dot_offset) {
821 ctx->pos = dot_offset & 0x7fffffff;
822 if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
823 return 0;
824 }
825
826 /*
827 * Put .. entry unless we're starting past it.
828 */
829 if (ctx->pos <= dotdot_offset) {
830 ino = xfs_dir2_sf_get_parent_ino(sfp);
831 ctx->pos = dotdot_offset & 0x7fffffff;
832 if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
833 return 0;
834 }
835
836 /*
837 * Loop while there are more entries and put'ing works.
838 */
839 sfep = xfs_dir2_sf_firstentry(sfp);
840 for (i = 0; i < sfp->count; i++) {
841 off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
842 xfs_dir2_sf_get_offset(sfep));
843
844 if (ctx->pos > off) {
845 sfep = xfs_dir2_sf_nextentry(sfp, sfep);
846 continue;
847 }
848
849 ino = xfs_dir2_sfe_get_ino(sfp, sfep);
850 ctx->pos = off & 0x7fffffff;
851 if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen,
852 ino, DT_UNKNOWN))
853 return 0;
854 sfep = xfs_dir2_sf_nextentry(sfp, sfep);
855 }
856
857 ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
858 0x7fffffff;
859 return 0;
860}
861
862/* 794/*
863 * Lookup an entry in a shortform directory. 795 * Lookup an entry in a shortform directory.
864 * Returns EEXIST if found, ENOENT if not found. 796 * Returns EEXIST if found, ENOENT if not found.
@@ -898,6 +830,7 @@ xfs_dir2_sf_lookup(
898 if (args->namelen == 1 && args->name[0] == '.') { 830 if (args->namelen == 1 && args->name[0] == '.') {
899 args->inumber = dp->i_ino; 831 args->inumber = dp->i_ino;
900 args->cmpresult = XFS_CMP_EXACT; 832 args->cmpresult = XFS_CMP_EXACT;
833 args->filetype = XFS_DIR3_FT_DIR;
901 return XFS_ERROR(EEXIST); 834 return XFS_ERROR(EEXIST);
902 } 835 }
903 /* 836 /*
@@ -907,6 +840,7 @@ xfs_dir2_sf_lookup(
907 args->name[0] == '.' && args->name[1] == '.') { 840 args->name[0] == '.' && args->name[1] == '.') {
908 args->inumber = xfs_dir2_sf_get_parent_ino(sfp); 841 args->inumber = xfs_dir2_sf_get_parent_ino(sfp);
909 args->cmpresult = XFS_CMP_EXACT; 842 args->cmpresult = XFS_CMP_EXACT;
843 args->filetype = XFS_DIR3_FT_DIR;
910 return XFS_ERROR(EEXIST); 844 return XFS_ERROR(EEXIST);
911 } 845 }
912 /* 846 /*
@@ -914,7 +848,7 @@ xfs_dir2_sf_lookup(
914 */ 848 */
915 ci_sfep = NULL; 849 ci_sfep = NULL;
916 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; 850 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
917 i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { 851 i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
918 /* 852 /*
919 * Compare name and if it's an exact match, return the inode 853 * Compare name and if it's an exact match, return the inode
920 * number. If it's the first case-insensitive match, store the 854 * number. If it's the first case-insensitive match, store the
@@ -924,7 +858,10 @@ xfs_dir2_sf_lookup(
924 sfep->namelen); 858 sfep->namelen);
925 if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { 859 if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
926 args->cmpresult = cmp; 860 args->cmpresult = cmp;
927 args->inumber = xfs_dir2_sfe_get_ino(sfp, sfep); 861 args->inumber = xfs_dir3_sfe_get_ino(dp->i_mount,
862 sfp, sfep);
863 args->filetype = xfs_dir3_sfe_get_ftype(dp->i_mount,
864 sfp, sfep);
928 if (cmp == XFS_CMP_EXACT) 865 if (cmp == XFS_CMP_EXACT)
929 return XFS_ERROR(EEXIST); 866 return XFS_ERROR(EEXIST);
930 ci_sfep = sfep; 867 ci_sfep = sfep;
@@ -980,10 +917,10 @@ xfs_dir2_sf_removename(
980 * Find the one we're deleting. 917 * Find the one we're deleting.
981 */ 918 */
982 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; 919 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
983 i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) { 920 i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
984 if (xfs_da_compname(args, sfep->name, sfep->namelen) == 921 if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
985 XFS_CMP_EXACT) { 922 XFS_CMP_EXACT) {
986 ASSERT(xfs_dir2_sfe_get_ino(sfp, sfep) == 923 ASSERT(xfs_dir3_sfe_get_ino(dp->i_mount, sfp, sfep) ==
987 args->inumber); 924 args->inumber);
988 break; 925 break;
989 } 926 }
@@ -997,7 +934,7 @@ xfs_dir2_sf_removename(
997 * Calculate sizes. 934 * Calculate sizes.
998 */ 935 */
999 byteoff = (int)((char *)sfep - (char *)sfp); 936 byteoff = (int)((char *)sfep - (char *)sfp);
1000 entsize = xfs_dir2_sf_entsize(sfp, args->namelen); 937 entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen);
1001 newsize = oldsize - entsize; 938 newsize = oldsize - entsize;
1002 /* 939 /*
1003 * Copy the part if any after the removed entry, sliding it down. 940 * Copy the part if any after the removed entry, sliding it down.
@@ -1113,16 +1050,19 @@ xfs_dir2_sf_replace(
1113 * Normal entry, look for the name. 1050 * Normal entry, look for the name.
1114 */ 1051 */
1115 else { 1052 else {
1116 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); 1053 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
1117 i < sfp->count; 1054 i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
1118 i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
1119 if (xfs_da_compname(args, sfep->name, sfep->namelen) == 1055 if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
1120 XFS_CMP_EXACT) { 1056 XFS_CMP_EXACT) {
1121#if XFS_BIG_INUMS || defined(DEBUG) 1057#if XFS_BIG_INUMS || defined(DEBUG)
1122 ino = xfs_dir2_sfe_get_ino(sfp, sfep); 1058 ino = xfs_dir3_sfe_get_ino(dp->i_mount,
1059 sfp, sfep);
1123 ASSERT(args->inumber != ino); 1060 ASSERT(args->inumber != ino);
1124#endif 1061#endif
1125 xfs_dir2_sfe_put_ino(sfp, sfep, args->inumber); 1062 xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep,
1063 args->inumber);
1064 xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep,
1065 args->filetype);
1126 break; 1066 break;
1127 } 1067 }
1128 } 1068 }
@@ -1189,10 +1129,12 @@ xfs_dir2_sf_toino4(
1189 int oldsize; /* old inode size */ 1129 int oldsize; /* old inode size */
1190 xfs_dir2_sf_entry_t *sfep; /* new sf entry */ 1130 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1191 xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ 1131 xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
1132 struct xfs_mount *mp;
1192 1133
1193 trace_xfs_dir2_sf_toino4(args); 1134 trace_xfs_dir2_sf_toino4(args);
1194 1135
1195 dp = args->dp; 1136 dp = args->dp;
1137 mp = dp->i_mount;
1196 1138
1197 /* 1139 /*
1198 * Copy the old directory to the buffer. 1140 * Copy the old directory to the buffer.
@@ -1230,13 +1172,15 @@ xfs_dir2_sf_toino4(
1230 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), 1172 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
1231 oldsfep = xfs_dir2_sf_firstentry(oldsfp); 1173 oldsfep = xfs_dir2_sf_firstentry(oldsfp);
1232 i < sfp->count; 1174 i < sfp->count;
1233 i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), 1175 i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep),
1234 oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { 1176 oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) {
1235 sfep->namelen = oldsfep->namelen; 1177 sfep->namelen = oldsfep->namelen;
1236 sfep->offset = oldsfep->offset; 1178 sfep->offset = oldsfep->offset;
1237 memcpy(sfep->name, oldsfep->name, sfep->namelen); 1179 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1238 xfs_dir2_sfe_put_ino(sfp, sfep, 1180 xfs_dir3_sfe_put_ino(mp, sfp, sfep,
1239 xfs_dir2_sfe_get_ino(oldsfp, oldsfep)); 1181 xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep));
1182 xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
1183 xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep));
1240 } 1184 }
1241 /* 1185 /*
1242 * Clean up the inode. 1186 * Clean up the inode.
@@ -1264,10 +1208,12 @@ xfs_dir2_sf_toino8(
1264 int oldsize; /* old inode size */ 1208 int oldsize; /* old inode size */
1265 xfs_dir2_sf_entry_t *sfep; /* new sf entry */ 1209 xfs_dir2_sf_entry_t *sfep; /* new sf entry */
1266 xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ 1210 xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
1211 struct xfs_mount *mp;
1267 1212
1268 trace_xfs_dir2_sf_toino8(args); 1213 trace_xfs_dir2_sf_toino8(args);
1269 1214
1270 dp = args->dp; 1215 dp = args->dp;
1216 mp = dp->i_mount;
1271 1217
1272 /* 1218 /*
1273 * Copy the old directory to the buffer. 1219 * Copy the old directory to the buffer.
@@ -1305,13 +1251,15 @@ xfs_dir2_sf_toino8(
1305 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), 1251 for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
1306 oldsfep = xfs_dir2_sf_firstentry(oldsfp); 1252 oldsfep = xfs_dir2_sf_firstentry(oldsfp);
1307 i < sfp->count; 1253 i < sfp->count;
1308 i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep), 1254 i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep),
1309 oldsfep = xfs_dir2_sf_nextentry(oldsfp, oldsfep)) { 1255 oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) {
1310 sfep->namelen = oldsfep->namelen; 1256 sfep->namelen = oldsfep->namelen;
1311 sfep->offset = oldsfep->offset; 1257 sfep->offset = oldsfep->offset;
1312 memcpy(sfep->name, oldsfep->name, sfep->namelen); 1258 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1313 xfs_dir2_sfe_put_ino(sfp, sfep, 1259 xfs_dir3_sfe_put_ino(mp, sfp, sfep,
1314 xfs_dir2_sfe_get_ino(oldsfp, oldsfep)); 1260 xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep));
1261 xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
1262 xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep));
1315 } 1263 }
1316 /* 1264 /*
1317 * Clean up the inode. 1265 * Clean up the inode.
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 69cf4fcde03e..45560ee1a4ba 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -16,12 +16,13 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_sb.h" 19#include "xfs_format.h"
20#include "xfs_log.h" 20#include "xfs_log.h"
21#include "xfs_trans.h"
22#include "xfs_sb.h"
21#include "xfs_ag.h" 23#include "xfs_ag.h"
22#include "xfs_mount.h" 24#include "xfs_mount.h"
23#include "xfs_quota.h" 25#include "xfs_quota.h"
24#include "xfs_trans.h"
25#include "xfs_alloc_btree.h" 26#include "xfs_alloc_btree.h"
26#include "xfs_bmap_btree.h" 27#include "xfs_bmap_btree.h"
27#include "xfs_ialloc_btree.h" 28#include "xfs_ialloc_btree.h"
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 0adf27ecf3f1..251c66632e5e 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_bit.h" 21#include "xfs_bit.h"
21#include "xfs_log.h" 22#include "xfs_log.h"
22#include "xfs_trans.h" 23#include "xfs_trans.h"
@@ -28,6 +29,7 @@
28#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
29#include "xfs_inode.h" 30#include "xfs_inode.h"
30#include "xfs_bmap.h" 31#include "xfs_bmap.h"
32#include "xfs_bmap_util.h"
31#include "xfs_rtalloc.h" 33#include "xfs_rtalloc.h"
32#include "xfs_error.h" 34#include "xfs_error.h"
33#include "xfs_itable.h" 35#include "xfs_itable.h"
@@ -710,10 +712,8 @@ xfs_qm_dqread(
710 712
711 if (flags & XFS_QMOPT_DQALLOC) { 713 if (flags & XFS_QMOPT_DQALLOC) {
712 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); 714 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
713 error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), 715 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_attrsetm,
714 XFS_QM_DQALLOC_LOG_RES(mp), 0, 716 XFS_QM_DQALLOC_SPACE_RES(mp), 0);
715 XFS_TRANS_PERM_LOG_RES,
716 XFS_WRITE_LOG_COUNT);
717 if (error) 717 if (error)
718 goto error1; 718 goto error1;
719 cancelflags = XFS_TRANS_RELEASE_LOG_RES; 719 cancelflags = XFS_TRANS_RELEASE_LOG_RES;
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 57aa4b03720c..60c6e1f12695 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_trans.h" 22#include "xfs_trans.h"
22#include "xfs_sb.h" 23#include "xfs_sb.h"
@@ -43,14 +44,15 @@ static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
43/* 44/*
44 * returns the number of iovecs needed to log the given dquot item. 45 * returns the number of iovecs needed to log the given dquot item.
45 */ 46 */
46STATIC uint 47STATIC void
47xfs_qm_dquot_logitem_size( 48xfs_qm_dquot_logitem_size(
48 struct xfs_log_item *lip) 49 struct xfs_log_item *lip,
50 int *nvecs,
51 int *nbytes)
49{ 52{
50 /* 53 *nvecs += 2;
51 * we need only two iovecs, one for the format, one for the real thing 54 *nbytes += sizeof(struct xfs_dq_logformat) +
52 */ 55 sizeof(struct xfs_disk_dquot);
53 return 2;
54} 56}
55 57
56/* 58/*
@@ -285,11 +287,14 @@ static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
285 * We only need 1 iovec for an quotaoff item. It just logs the 287 * We only need 1 iovec for an quotaoff item. It just logs the
286 * quotaoff_log_format structure. 288 * quotaoff_log_format structure.
287 */ 289 */
288STATIC uint 290STATIC void
289xfs_qm_qoff_logitem_size( 291xfs_qm_qoff_logitem_size(
290 struct xfs_log_item *lip) 292 struct xfs_log_item *lip,
293 int *nvecs,
294 int *nbytes)
291{ 295{
292 return 1; 296 *nvecs += 1;
297 *nbytes += sizeof(struct xfs_qoff_logitem);
293} 298}
294 299
295/* 300/*
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 35d3f5b041dd..1123d93ff795 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -26,7 +26,6 @@
26#include "xfs_bmap_btree.h" 26#include "xfs_bmap_btree.h"
27#include "xfs_dinode.h" 27#include "xfs_dinode.h"
28#include "xfs_inode.h" 28#include "xfs_inode.h"
29#include "xfs_utils.h"
30#include "xfs_error.h" 29#include "xfs_error.h"
31 30
32#ifdef DEBUG 31#ifdef DEBUG
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index c585bc646395..066df425c14f 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -21,10 +21,11 @@
21#include "xfs_trans.h" 21#include "xfs_trans.h"
22#include "xfs_sb.h" 22#include "xfs_sb.h"
23#include "xfs_ag.h" 23#include "xfs_ag.h"
24#include "xfs_dir2.h"
25#include "xfs_mount.h" 24#include "xfs_mount.h"
25#include "xfs_da_btree.h"
26#include "xfs_dir2_format.h"
27#include "xfs_dir2.h"
26#include "xfs_export.h" 28#include "xfs_export.h"
27#include "xfs_vnodeops.h"
28#include "xfs_bmap_btree.h" 29#include "xfs_bmap_btree.h"
29#include "xfs_inode.h" 30#include "xfs_inode.h"
30#include "xfs_inode_item.h" 31#include "xfs_inode_item.h"
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 85e9f87a1a7c..86f559f6e5d3 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -147,7 +147,7 @@ xfs_extent_busy_search(
147 * extent. If the overlap covers the beginning, the end, or all of the busy 147 * extent. If the overlap covers the beginning, the end, or all of the busy
148 * extent, the overlapping portion can be made unbusy and used for the 148 * extent, the overlapping portion can be made unbusy and used for the
149 * allocation. We can't split a busy extent because we can't modify a 149 * allocation. We can't split a busy extent because we can't modify a
150 * transaction/CIL context busy list, but we can update an entries block 150 * transaction/CIL context busy list, but we can update an entry's block
151 * number or length. 151 * number or length.
152 * 152 *
153 * Returns true if the extent can safely be reused, or false if the search 153 * Returns true if the extent can safely be reused, or false if the search
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 452920a3f03f..dc53e8febbbe 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -73,11 +73,22 @@ __xfs_efi_release(
73 * We only need 1 iovec for an efi item. It just logs the efi_log_format 73 * We only need 1 iovec for an efi item. It just logs the efi_log_format
74 * structure. 74 * structure.
75 */ 75 */
76STATIC uint 76static inline int
77xfs_efi_item_sizeof(
78 struct xfs_efi_log_item *efip)
79{
80 return sizeof(struct xfs_efi_log_format) +
81 (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
82}
83
84STATIC void
77xfs_efi_item_size( 85xfs_efi_item_size(
78 struct xfs_log_item *lip) 86 struct xfs_log_item *lip,
87 int *nvecs,
88 int *nbytes)
79{ 89{
80 return 1; 90 *nvecs += 1;
91 *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip));
81} 92}
82 93
83/* 94/*
@@ -93,21 +104,17 @@ xfs_efi_item_format(
93 struct xfs_log_iovec *log_vector) 104 struct xfs_log_iovec *log_vector)
94{ 105{
95 struct xfs_efi_log_item *efip = EFI_ITEM(lip); 106 struct xfs_efi_log_item *efip = EFI_ITEM(lip);
96 uint size;
97 107
98 ASSERT(atomic_read(&efip->efi_next_extent) == 108 ASSERT(atomic_read(&efip->efi_next_extent) ==
99 efip->efi_format.efi_nextents); 109 efip->efi_format.efi_nextents);
100 110
101 efip->efi_format.efi_type = XFS_LI_EFI; 111 efip->efi_format.efi_type = XFS_LI_EFI;
102
103 size = sizeof(xfs_efi_log_format_t);
104 size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t);
105 efip->efi_format.efi_size = 1; 112 efip->efi_format.efi_size = 1;
106 113
107 log_vector->i_addr = &efip->efi_format; 114 log_vector->i_addr = &efip->efi_format;
108 log_vector->i_len = size; 115 log_vector->i_len = xfs_efi_item_sizeof(efip);
109 log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT; 116 log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
110 ASSERT(size >= sizeof(xfs_efi_log_format_t)); 117 ASSERT(log_vector->i_len >= sizeof(xfs_efi_log_format_t));
111} 118}
112 119
113 120
@@ -333,11 +340,22 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp)
333 * We only need 1 iovec for an efd item. It just logs the efd_log_format 340 * We only need 1 iovec for an efd item. It just logs the efd_log_format
334 * structure. 341 * structure.
335 */ 342 */
336STATIC uint 343static inline int
344xfs_efd_item_sizeof(
345 struct xfs_efd_log_item *efdp)
346{
347 return sizeof(xfs_efd_log_format_t) +
348 (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
349}
350
351STATIC void
337xfs_efd_item_size( 352xfs_efd_item_size(
338 struct xfs_log_item *lip) 353 struct xfs_log_item *lip,
354 int *nvecs,
355 int *nbytes)
339{ 356{
340 return 1; 357 *nvecs += 1;
358 *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip));
341} 359}
342 360
343/* 361/*
@@ -353,20 +371,16 @@ xfs_efd_item_format(
353 struct xfs_log_iovec *log_vector) 371 struct xfs_log_iovec *log_vector)
354{ 372{
355 struct xfs_efd_log_item *efdp = EFD_ITEM(lip); 373 struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
356 uint size;
357 374
358 ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); 375 ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
359 376
360 efdp->efd_format.efd_type = XFS_LI_EFD; 377 efdp->efd_format.efd_type = XFS_LI_EFD;
361
362 size = sizeof(xfs_efd_log_format_t);
363 size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t);
364 efdp->efd_format.efd_size = 1; 378 efdp->efd_format.efd_size = 1;
365 379
366 log_vector->i_addr = &efdp->efd_format; 380 log_vector->i_addr = &efdp->efd_format;
367 log_vector->i_len = size; 381 log_vector->i_len = xfs_efd_item_sizeof(efdp);
368 log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT; 382 log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
369 ASSERT(size >= sizeof(xfs_efd_log_format_t)); 383 ASSERT(log_vector->i_len >= sizeof(xfs_efd_log_format_t));
370} 384}
371 385
372/* 386/*
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 432222418c56..0ffbce32d569 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -18,93 +18,11 @@
18#ifndef __XFS_EXTFREE_ITEM_H__ 18#ifndef __XFS_EXTFREE_ITEM_H__
19#define __XFS_EXTFREE_ITEM_H__ 19#define __XFS_EXTFREE_ITEM_H__
20 20
21/* kernel only EFI/EFD definitions */
22
21struct xfs_mount; 23struct xfs_mount;
22struct kmem_zone; 24struct kmem_zone;
23 25
24typedef struct xfs_extent {
25 xfs_dfsbno_t ext_start;
26 xfs_extlen_t ext_len;
27} xfs_extent_t;
28
29/*
30 * Since an xfs_extent_t has types (start:64, len: 32)
31 * there are different alignments on 32 bit and 64 bit kernels.
32 * So we provide the different variants for use by a
33 * conversion routine.
34 */
35
36typedef struct xfs_extent_32 {
37 __uint64_t ext_start;
38 __uint32_t ext_len;
39} __attribute__((packed)) xfs_extent_32_t;
40
41typedef struct xfs_extent_64 {
42 __uint64_t ext_start;
43 __uint32_t ext_len;
44 __uint32_t ext_pad;
45} xfs_extent_64_t;
46
47/*
48 * This is the structure used to lay out an efi log item in the
49 * log. The efi_extents field is a variable size array whose
50 * size is given by efi_nextents.
51 */
52typedef struct xfs_efi_log_format {
53 __uint16_t efi_type; /* efi log item type */
54 __uint16_t efi_size; /* size of this item */
55 __uint32_t efi_nextents; /* # extents to free */
56 __uint64_t efi_id; /* efi identifier */
57 xfs_extent_t efi_extents[1]; /* array of extents to free */
58} xfs_efi_log_format_t;
59
60typedef struct xfs_efi_log_format_32 {
61 __uint16_t efi_type; /* efi log item type */
62 __uint16_t efi_size; /* size of this item */
63 __uint32_t efi_nextents; /* # extents to free */
64 __uint64_t efi_id; /* efi identifier */
65 xfs_extent_32_t efi_extents[1]; /* array of extents to free */
66} __attribute__((packed)) xfs_efi_log_format_32_t;
67
68typedef struct xfs_efi_log_format_64 {
69 __uint16_t efi_type; /* efi log item type */
70 __uint16_t efi_size; /* size of this item */
71 __uint32_t efi_nextents; /* # extents to free */
72 __uint64_t efi_id; /* efi identifier */
73 xfs_extent_64_t efi_extents[1]; /* array of extents to free */
74} xfs_efi_log_format_64_t;
75
76/*
77 * This is the structure used to lay out an efd log item in the
78 * log. The efd_extents array is a variable size array whose
79 * size is given by efd_nextents;
80 */
81typedef struct xfs_efd_log_format {
82 __uint16_t efd_type; /* efd log item type */
83 __uint16_t efd_size; /* size of this item */
84 __uint32_t efd_nextents; /* # of extents freed */
85 __uint64_t efd_efi_id; /* id of corresponding efi */
86 xfs_extent_t efd_extents[1]; /* array of extents freed */
87} xfs_efd_log_format_t;
88
89typedef struct xfs_efd_log_format_32 {
90 __uint16_t efd_type; /* efd log item type */
91 __uint16_t efd_size; /* size of this item */
92 __uint32_t efd_nextents; /* # of extents freed */
93 __uint64_t efd_efi_id; /* id of corresponding efi */
94 xfs_extent_32_t efd_extents[1]; /* array of extents freed */
95} __attribute__((packed)) xfs_efd_log_format_32_t;
96
97typedef struct xfs_efd_log_format_64 {
98 __uint16_t efd_type; /* efd log item type */
99 __uint16_t efd_size; /* size of this item */
100 __uint32_t efd_nextents; /* # of extents freed */
101 __uint64_t efd_efi_id; /* id of corresponding efi */
102 xfs_extent_64_t efd_extents[1]; /* array of extents freed */
103} xfs_efd_log_format_64_t;
104
105
106#ifdef __KERNEL__
107
108/* 26/*
109 * Max number of extents in fast allocation path. 27 * Max number of extents in fast allocation path.
110 */ 28 */
@@ -160,6 +78,4 @@ int xfs_efi_copy_format(xfs_log_iovec_t *buf,
160 xfs_efi_log_format_t *dst_efi_fmt); 78 xfs_efi_log_format_t *dst_efi_fmt);
161void xfs_efi_item_free(xfs_efi_log_item_t *); 79void xfs_efi_item_free(xfs_efi_log_item_t *);
162 80
163#endif /* __KERNEL__ */
164
165#endif /* __XFS_EXTFREE_ITEM_H__ */ 81#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index de3dc98f4e8f..4c749ab543d0 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -28,10 +28,11 @@
28#include "xfs_inode.h" 28#include "xfs_inode.h"
29#include "xfs_inode_item.h" 29#include "xfs_inode_item.h"
30#include "xfs_bmap.h" 30#include "xfs_bmap.h"
31#include "xfs_bmap_util.h"
31#include "xfs_error.h" 32#include "xfs_error.h"
32#include "xfs_vnodeops.h"
33#include "xfs_da_btree.h" 33#include "xfs_da_btree.h"
34#include "xfs_dir2_format.h" 34#include "xfs_dir2_format.h"
35#include "xfs_dir2.h"
35#include "xfs_dir2_priv.h" 36#include "xfs_dir2_priv.h"
36#include "xfs_ioctl.h" 37#include "xfs_ioctl.h"
37#include "xfs_trace.h" 38#include "xfs_trace.h"
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 5170306a1009..ce78e654d37b 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -16,18 +16,18 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_log.h"
19#include "xfs_bmap_btree.h" 20#include "xfs_bmap_btree.h"
20#include "xfs_inum.h" 21#include "xfs_inum.h"
21#include "xfs_dinode.h" 22#include "xfs_dinode.h"
22#include "xfs_inode.h" 23#include "xfs_inode.h"
23#include "xfs_ag.h" 24#include "xfs_ag.h"
24#include "xfs_log.h"
25#include "xfs_trans.h" 25#include "xfs_trans.h"
26#include "xfs_sb.h" 26#include "xfs_sb.h"
27#include "xfs_mount.h" 27#include "xfs_mount.h"
28#include "xfs_bmap.h" 28#include "xfs_bmap.h"
29#include "xfs_bmap_util.h"
29#include "xfs_alloc.h" 30#include "xfs_alloc.h"
30#include "xfs_utils.h"
31#include "xfs_mru_cache.h" 31#include "xfs_mru_cache.h"
32#include "xfs_filestream.h" 32#include "xfs_filestream.h"
33#include "xfs_trace.h" 33#include "xfs_trace.h"
@@ -668,8 +668,8 @@ exit:
668 */ 668 */
669int 669int
670xfs_filestream_new_ag( 670xfs_filestream_new_ag(
671 xfs_bmalloca_t *ap, 671 struct xfs_bmalloca *ap,
672 xfs_agnumber_t *agp) 672 xfs_agnumber_t *agp)
673{ 673{
674 int flags, err; 674 int flags, err;
675 xfs_inode_t *ip, *pip = NULL; 675 xfs_inode_t *ip, *pip = NULL;
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 09dd9af45434..6d61dbee8564 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -18,8 +18,6 @@
18#ifndef __XFS_FILESTREAM_H__ 18#ifndef __XFS_FILESTREAM_H__
19#define __XFS_FILESTREAM_H__ 19#define __XFS_FILESTREAM_H__
20 20
21#ifdef __KERNEL__
22
23struct xfs_mount; 21struct xfs_mount;
24struct xfs_inode; 22struct xfs_inode;
25struct xfs_perag; 23struct xfs_perag;
@@ -69,6 +67,4 @@ xfs_inode_is_filestream(
69 (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); 67 (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM);
70} 68}
71 69
72#endif /* __KERNEL__ */
73
74#endif /* __XFS_FILESTREAM_H__ */ 70#endif /* __XFS_FILESTREAM_H__ */
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h
new file mode 100644
index 000000000000..35c08ff54ca0
--- /dev/null
+++ b/fs/xfs/xfs_format.h
@@ -0,0 +1,169 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_FORMAT_H__
19#define __XFS_FORMAT_H__
20
21/*
22 * XFS On Disk Format Definitions
23 *
24 * This header file defines all the on-disk format definitions for
25 * general XFS objects. Directory and attribute related objects are defined in
 26 * xfs_da_format.h, while log and log item formats are defined in
27 * xfs_log_format.h. Everything else goes here.
28 */
29
30struct xfs_mount;
31struct xfs_trans;
32struct xfs_inode;
33struct xfs_buf;
34struct xfs_ifork;
35
36/*
37 * RealTime Device format definitions
38 */
39
40/* Min and max rt extent sizes, specified in bytes */
41#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
42#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
43#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
44
45#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
46#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
47#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
48#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
49
50/*
51 * RT Summary and bit manipulation macros.
52 */
53#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
54#define XFS_SUMOFFSTOBLOCK(mp,s) \
55 (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
56#define XFS_SUMPTR(mp,bp,so) \
57 ((xfs_suminfo_t *)((bp)->b_addr + \
58 (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
59
60#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
61#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
62#define XFS_BITTOWORD(mp,bi) \
63 ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
64
65#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
66#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
67
68#define XFS_RTLOBIT(w) xfs_lowbit32(w)
69#define XFS_RTHIBIT(w) xfs_highbit32(w)
70
71#if XFS_BIG_BLKNOS
72#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
73#else
74#define XFS_RTBLOCKLOG(b) xfs_highbit32(b)
75#endif
76
77/*
78 * Dquot and dquot block format definitions
79 */
80#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
81#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */
82
83/*
84 * This is the main portion of the on-disk representation of quota
85 * information for a user. This is the q_core of the xfs_dquot_t that
86 * is kept in kernel memory. We pad this with some more expansion room
87 * to construct the on disk structure.
88 */
89typedef struct xfs_disk_dquot {
90 __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
91 __u8 d_version; /* dquot version */
92 __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */
93 __be32 d_id; /* user,project,group id */
94 __be64 d_blk_hardlimit;/* absolute limit on disk blks */
95 __be64 d_blk_softlimit;/* preferred limit on disk blks */
96 __be64 d_ino_hardlimit;/* maximum # allocated inodes */
97 __be64 d_ino_softlimit;/* preferred inode limit */
98 __be64 d_bcount; /* disk blocks owned by the user */
99 __be64 d_icount; /* inodes owned by the user */
 100 __be32 d_itimer; /* zero if within inode limits; if not,
 101 this is when we refuse service */
102 __be32 d_btimer; /* similar to above; for disk blocks */
103 __be16 d_iwarns; /* warnings issued wrt num inodes */
104 __be16 d_bwarns; /* warnings issued wrt disk blocks */
105 __be32 d_pad0; /* 64 bit align */
106 __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */
107 __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */
108 __be64 d_rtbcount; /* realtime blocks owned */
109 __be32 d_rtbtimer; /* similar to above; for RT disk blocks */
110 __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
111 __be16 d_pad;
112} xfs_disk_dquot_t;
113
114/*
115 * This is what goes on disk. This is separated from the xfs_disk_dquot because
116 * carrying the unnecessary padding would be a waste of memory.
117 */
118typedef struct xfs_dqblk {
119 xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
120 char dd_fill[4]; /* filling for posterity */
121
122 /*
123 * These two are only present on filesystems with the CRC bits set.
124 */
125 __be32 dd_crc; /* checksum */
126 __be64 dd_lsn; /* last modification in log */
127 uuid_t dd_uuid; /* location information */
128} xfs_dqblk_t;
129
130#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
131
132/*
133 * Remote symlink format and access functions.
134 */
135#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */
136
137struct xfs_dsymlink_hdr {
138 __be32 sl_magic;
139 __be32 sl_offset;
140 __be32 sl_bytes;
141 __be32 sl_crc;
142 uuid_t sl_uuid;
143 __be64 sl_owner;
144 __be64 sl_blkno;
145 __be64 sl_lsn;
146};
147
148/*
149 * The maximum pathlen is 1024 bytes. Since the minimum file system
150 * blocksize is 512 bytes, we can get a max of 3 extents back from
151 * bmapi when crc headers are taken into account.
152 */
153#define XFS_SYMLINK_MAPS 3
154
155#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \
156 ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
157 sizeof(struct xfs_dsymlink_hdr) : 0))
158
159int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
160int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
161 uint32_t size, struct xfs_buf *bp);
162bool xfs_symlink_hdr_ok(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
163 uint32_t size, struct xfs_buf *bp);
164void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
165 struct xfs_inode *ip, struct xfs_ifork *ifp);
166
167extern const struct xfs_buf_ops xfs_symlink_buf_ops;
168
169#endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index d04695545397..1edb5cc3e5f4 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -240,7 +240,9 @@ typedef struct xfs_fsop_resblks {
240 240
241 241
242/* 242/*
 243 * Minimum and maximum sizes needed for growth checks 243 * Minimum and maximum sizes needed for growth checks.
244 *
245 * Block counts are in units of filesystem blocks, not basic blocks.
244 */ 246 */
245#define XFS_MIN_AG_BLOCKS 64 247#define XFS_MIN_AG_BLOCKS 64
246#define XFS_MIN_LOG_BLOCKS 512ULL 248#define XFS_MIN_LOG_BLOCKS 512ULL
@@ -311,6 +313,17 @@ typedef struct xfs_bstat {
311} xfs_bstat_t; 313} xfs_bstat_t;
312 314
313/* 315/*
316 * Project quota id helpers (previously projid was 16bit only
 317 * and using two 16bit values to hold new 32bit projid was chosen
318 * to retain compatibility with "old" filesystems).
319 */
320static inline __uint32_t
321bstat_get_projid(struct xfs_bstat *bs)
322{
323 return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
324}
325
326/*
314 * The user-level BulkStat Request interface structure. 327 * The user-level BulkStat Request interface structure.
315 */ 328 */
316typedef struct xfs_fsop_bulkreq { 329typedef struct xfs_fsop_bulkreq {
@@ -344,7 +357,7 @@ typedef struct xfs_error_injection {
344 * Speculative preallocation trimming. 357 * Speculative preallocation trimming.
345 */ 358 */
346#define XFS_EOFBLOCKS_VERSION 1 359#define XFS_EOFBLOCKS_VERSION 1
347struct xfs_eofblocks { 360struct xfs_fs_eofblocks {
348 __u32 eof_version; 361 __u32 eof_version;
349 __u32 eof_flags; 362 __u32 eof_flags;
350 uid_t eof_uid; 363 uid_t eof_uid;
@@ -450,6 +463,21 @@ typedef struct xfs_handle {
450 + (handle).ha_fid.fid_len) 463 + (handle).ha_fid.fid_len)
451 464
452/* 465/*
466 * Structure passed to XFS_IOC_SWAPEXT
467 */
468typedef struct xfs_swapext
469{
470 __int64_t sx_version; /* version */
471#define XFS_SX_VERSION 0
472 __int64_t sx_fdtarget; /* fd of target file */
473 __int64_t sx_fdtmp; /* fd of tmp file */
474 xfs_off_t sx_offset; /* offset into file */
 475 xfs_off_t sx_length; /* length from offset */
476 char sx_pad[16]; /* pad space, unused */
477 xfs_bstat_t sx_stat; /* stat of target b4 copy */
478} xfs_swapext_t;
479
480/*
453 * Flags for going down operation 481 * Flags for going down operation
454 */ 482 */
455#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ 483#define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */
@@ -511,8 +539,14 @@ typedef struct xfs_handle {
511#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) 539#define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection)
512#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) 540#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
513/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ 541/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
542
514/* XFS_IOC_FREEZE -- FIFREEZE 119 */ 543/* XFS_IOC_FREEZE -- FIFREEZE 119 */
515/* XFS_IOC_THAW -- FITHAW 120 */ 544/* XFS_IOC_THAW -- FITHAW 120 */
545#ifndef FIFREEZE
546#define XFS_IOC_FREEZE _IOWR('X', 119, int)
547#define XFS_IOC_THAW _IOWR('X', 120, int)
548#endif
549
516#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) 550#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
517#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) 551#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
518#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) 552#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 614eb0cc3608..e64ee5288b86 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -203,8 +203,9 @@ xfs_growfs_data_private(
203 203
204 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); 204 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
205 tp->t_flags |= XFS_TRANS_RESERVE; 205 tp->t_flags |= XFS_TRANS_RESERVE;
206 if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), 206 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
207 XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) { 207 XFS_GROWFS_SPACE_RES(mp), 0);
208 if (error) {
208 xfs_trans_cancel(tp, 0); 209 xfs_trans_cancel(tp, 0);
209 return error; 210 return error;
210 } 211 }
@@ -739,8 +740,7 @@ xfs_fs_log_dummy(
739 int error; 740 int error;
740 741
741 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); 742 tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
742 error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, 743 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
743 XFS_DEFAULT_LOG_COUNT);
744 if (error) { 744 if (error) {
745 xfs_trans_cancel(tp, 0); 745 xfs_trans_cancel(tp, 0);
746 return error; 746 return error;
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 7a0c17d7ec09..ccf2fb143962 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -39,6 +39,7 @@
39#include "xfs_cksum.h" 39#include "xfs_cksum.h"
40#include "xfs_buf_item.h" 40#include "xfs_buf_item.h"
41#include "xfs_icreate_item.h" 41#include "xfs_icreate_item.h"
42#include "xfs_icache.h"
42 43
43 44
44/* 45/*
@@ -506,7 +507,7 @@ xfs_ialloc_next_ag(
506 507
507/* 508/*
508 * Select an allocation group to look for a free inode in, based on the parent 509 * Select an allocation group to look for a free inode in, based on the parent
509 * inode and then mode. Return the allocation group buffer. 510 * inode and the mode. Return the allocation group buffer.
510 */ 511 */
511STATIC xfs_agnumber_t 512STATIC xfs_agnumber_t
512xfs_ialloc_ag_select( 513xfs_ialloc_ag_select(
@@ -728,7 +729,7 @@ xfs_dialloc_ag(
728 error = xfs_inobt_get_rec(cur, &rec, &j); 729 error = xfs_inobt_get_rec(cur, &rec, &j);
729 if (error) 730 if (error)
730 goto error0; 731 goto error0;
731 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 732 XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
732 733
733 if (rec.ir_freecount > 0) { 734 if (rec.ir_freecount > 0) {
734 /* 735 /*
@@ -1341,7 +1342,7 @@ xfs_imap(
1341 xfs_agblock_t cluster_agbno; /* first block in inode cluster */ 1342 xfs_agblock_t cluster_agbno; /* first block in inode cluster */
1342 int error; /* error code */ 1343 int error; /* error code */
1343 int offset; /* index of inode in its buffer */ 1344 int offset; /* index of inode in its buffer */
1344 int offset_agbno; /* blks from chunk start to inode */ 1345 xfs_agblock_t offset_agbno; /* blks from chunk start to inode */
1345 1346
1346 ASSERT(ino != NULLFSINO); 1347 ASSERT(ino != NULLFSINO);
1347 1348
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 3f90e1ceb8d6..16219b9c6790 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_types.h" 21#include "xfs_types.h"
21#include "xfs_log.h" 22#include "xfs_log.h"
22#include "xfs_log_priv.h" 23#include "xfs_log_priv.h"
@@ -31,12 +32,12 @@
31#include "xfs_dinode.h" 32#include "xfs_dinode.h"
32#include "xfs_error.h" 33#include "xfs_error.h"
33#include "xfs_filestream.h" 34#include "xfs_filestream.h"
34#include "xfs_vnodeops.h"
35#include "xfs_inode_item.h" 35#include "xfs_inode_item.h"
36#include "xfs_quota.h" 36#include "xfs_quota.h"
37#include "xfs_trace.h" 37#include "xfs_trace.h"
38#include "xfs_fsops.h" 38#include "xfs_fsops.h"
39#include "xfs_icache.h" 39#include "xfs_icache.h"
40#include "xfs_bmap_util.h"
40 41
41#include <linux/kthread.h> 42#include <linux/kthread.h>
42#include <linux/freezer.h> 43#include <linux/freezer.h>
@@ -619,7 +620,7 @@ restart:
619 620
620/* 621/*
621 * Background scanning to trim post-EOF preallocated space. This is queued 622 * Background scanning to trim post-EOF preallocated space. This is queued
622 * based on the 'background_prealloc_discard_period' tunable (5m by default). 623 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
623 */ 624 */
624STATIC void 625STATIC void
625xfs_queue_eofblocks( 626xfs_queue_eofblocks(
@@ -1203,15 +1204,15 @@ xfs_inode_match_id(
1203 struct xfs_inode *ip, 1204 struct xfs_inode *ip,
1204 struct xfs_eofblocks *eofb) 1205 struct xfs_eofblocks *eofb)
1205{ 1206{
1206 if (eofb->eof_flags & XFS_EOF_FLAGS_UID && 1207 if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1207 ip->i_d.di_uid != eofb->eof_uid) 1208 !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
1208 return 0; 1209 return 0;
1209 1210
1210 if (eofb->eof_flags & XFS_EOF_FLAGS_GID && 1211 if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1211 ip->i_d.di_gid != eofb->eof_gid) 1212 !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
1212 return 0; 1213 return 0;
1213 1214
1214 if (eofb->eof_flags & XFS_EOF_FLAGS_PRID && 1215 if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1215 xfs_get_projid(ip) != eofb->eof_prid) 1216 xfs_get_projid(ip) != eofb->eof_prid)
1216 return 0; 1217 return 0;
1217 1218
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index a01afbb3909a..8a89f7d791bd 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -21,9 +21,24 @@
21struct xfs_mount; 21struct xfs_mount;
22struct xfs_perag; 22struct xfs_perag;
23 23
24struct xfs_eofblocks {
25 __u32 eof_flags;
26 kuid_t eof_uid;
27 kgid_t eof_gid;
28 prid_t eof_prid;
29 __u64 eof_min_file_size;
30};
31
24#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
25#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
26 34
35/*
36 * Flags for xfs_iget()
37 */
38#define XFS_IGET_CREATE 0x1
39#define XFS_IGET_UNTRUSTED 0x2
40#define XFS_IGET_DONTCACHE 0x4
41
27int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, 42int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
28 uint flags, uint lock_flags, xfs_inode_t **ipp); 43 uint flags, uint lock_flags, xfs_inode_t **ipp);
29 44
@@ -49,4 +64,39 @@ int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
49 int flags, void *args), 64 int flags, void *args),
50 int flags, void *args, int tag); 65 int flags, void *args, int tag);
51 66
67static inline int
68xfs_fs_eofblocks_from_user(
69 struct xfs_fs_eofblocks *src,
70 struct xfs_eofblocks *dst)
71{
72 if (src->eof_version != XFS_EOFBLOCKS_VERSION)
73 return EINVAL;
74
75 if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
76 return EINVAL;
77
78 if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
79 memchr_inv(src->pad64, 0, sizeof(src->pad64)))
80 return EINVAL;
81
82 dst->eof_flags = src->eof_flags;
83 dst->eof_prid = src->eof_prid;
84 dst->eof_min_file_size = src->eof_min_file_size;
85
86 dst->eof_uid = INVALID_UID;
87 if (src->eof_flags & XFS_EOF_FLAGS_UID) {
88 dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
89 if (!uid_valid(dst->eof_uid))
90 return EINVAL;
91 }
92
93 dst->eof_gid = INVALID_GID;
94 if (src->eof_flags & XFS_EOF_FLAGS_GID) {
95 dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
96 if (!gid_valid(dst->eof_gid))
97 return EINVAL;
98 }
99 return 0;
100}
101
52#endif 102#endif
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 7716a4e7375e..5a5a593994d4 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -20,23 +20,11 @@
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h" 23#include "xfs_trans.h"
25#include "xfs_buf_item.h"
26#include "xfs_sb.h" 24#include "xfs_sb.h"
27#include "xfs_ag.h" 25#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_mount.h" 26#include "xfs_mount.h"
30#include "xfs_trans_priv.h" 27#include "xfs_trans_priv.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_attr_sf.h"
35#include "xfs_dinode.h"
36#include "xfs_inode.h"
37#include "xfs_inode_item.h"
38#include "xfs_btree.h"
39#include "xfs_ialloc.h"
40#include "xfs_error.h" 28#include "xfs_error.h"
41#include "xfs_icreate_item.h" 29#include "xfs_icreate_item.h"
42 30
@@ -52,11 +40,14 @@ static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip)
52 * 40 *
53 * We only need one iovec for the icreate log structure. 41 * We only need one iovec for the icreate log structure.
54 */ 42 */
55STATIC uint 43STATIC void
56xfs_icreate_item_size( 44xfs_icreate_item_size(
57 struct xfs_log_item *lip) 45 struct xfs_log_item *lip,
46 int *nvecs,
47 int *nbytes)
58{ 48{
59 return 1; 49 *nvecs += 1;
50 *nbytes += sizeof(struct xfs_icreate_log);
60} 51}
61 52
62/* 53/*
diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h
index 88ba8aa0bc41..59e89f87c09b 100644
--- a/fs/xfs/xfs_icreate_item.h
+++ b/fs/xfs/xfs_icreate_item.h
@@ -18,24 +18,6 @@
18#ifndef XFS_ICREATE_ITEM_H 18#ifndef XFS_ICREATE_ITEM_H
19#define XFS_ICREATE_ITEM_H 1 19#define XFS_ICREATE_ITEM_H 1
20 20
21/*
22 * on disk log item structure
23 *
24 * Log recovery assumes the first two entries are the type and size and they fit
25 * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
26 * decoding can be done correctly.
27 */
28struct xfs_icreate_log {
29 __uint16_t icl_type; /* type of log format structure */
30 __uint16_t icl_size; /* size of log format structure */
31 __be32 icl_ag; /* ag being allocated in */
32 __be32 icl_agbno; /* start block of inode range */
33 __be32 icl_count; /* number of inodes to initialise */
34 __be32 icl_isize; /* size of inodes */
35 __be32 icl_length; /* length of extent to initialise */
36 __be32 icl_gen; /* inode generation number to use */
37};
38
39/* in memory log item structure */ 21/* in memory log item structure */
40struct xfs_icreate_item { 22struct xfs_icreate_item {
41 struct xfs_log_item ic_item; 23 struct xfs_log_item ic_item;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index bb262c25c8de..e3d75385aa76 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -19,18 +19,23 @@
19 19
20#include "xfs.h" 20#include "xfs.h"
21#include "xfs_fs.h" 21#include "xfs_fs.h"
22#include "xfs_types.h" 22#include "xfs_format.h"
23#include "xfs_log.h" 23#include "xfs_log.h"
24#include "xfs_inum.h" 24#include "xfs_inum.h"
25#include "xfs_trans.h" 25#include "xfs_trans.h"
26#include "xfs_trans_space.h"
26#include "xfs_trans_priv.h" 27#include "xfs_trans_priv.h"
27#include "xfs_sb.h" 28#include "xfs_sb.h"
28#include "xfs_ag.h" 29#include "xfs_ag.h"
29#include "xfs_mount.h" 30#include "xfs_mount.h"
31#include "xfs_da_btree.h"
32#include "xfs_dir2_format.h"
33#include "xfs_dir2.h"
30#include "xfs_bmap_btree.h" 34#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 35#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 36#include "xfs_ialloc_btree.h"
33#include "xfs_attr_sf.h" 37#include "xfs_attr_sf.h"
38#include "xfs_attr.h"
34#include "xfs_dinode.h" 39#include "xfs_dinode.h"
35#include "xfs_inode.h" 40#include "xfs_inode.h"
36#include "xfs_buf_item.h" 41#include "xfs_buf_item.h"
@@ -39,16 +44,15 @@
39#include "xfs_alloc.h" 44#include "xfs_alloc.h"
40#include "xfs_ialloc.h" 45#include "xfs_ialloc.h"
41#include "xfs_bmap.h" 46#include "xfs_bmap.h"
47#include "xfs_bmap_util.h"
42#include "xfs_error.h" 48#include "xfs_error.h"
43#include "xfs_utils.h"
44#include "xfs_quota.h" 49#include "xfs_quota.h"
45#include "xfs_filestream.h" 50#include "xfs_filestream.h"
46#include "xfs_vnodeops.h"
47#include "xfs_cksum.h" 51#include "xfs_cksum.h"
48#include "xfs_trace.h" 52#include "xfs_trace.h"
49#include "xfs_icache.h" 53#include "xfs_icache.h"
54#include "xfs_symlink.h"
50 55
51kmem_zone_t *xfs_ifork_zone;
52kmem_zone_t *xfs_inode_zone; 56kmem_zone_t *xfs_inode_zone;
53 57
54/* 58/*
@@ -58,9 +62,6 @@ kmem_zone_t *xfs_inode_zone;
58#define XFS_ITRUNC_MAX_EXTENTS 2 62#define XFS_ITRUNC_MAX_EXTENTS 2
59 63
60STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); 64STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
61STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
62STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
63STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
64 65
65/* 66/*
66 * helper function to extract extent size hint from inode 67 * helper function to extract extent size hint from inode
@@ -310,623 +311,202 @@ xfs_isilocked(
310} 311}
311#endif 312#endif
312 313
313void
314__xfs_iflock(
315 struct xfs_inode *ip)
316{
317 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
318 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
319
320 do {
321 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
322 if (xfs_isiflocked(ip))
323 io_schedule();
324 } while (!xfs_iflock_nowait(ip));
325
326 finish_wait(wq, &wait.wait);
327}
328
329#ifdef DEBUG 314#ifdef DEBUG
315int xfs_locked_n;
316int xfs_small_retries;
317int xfs_middle_retries;
318int xfs_lots_retries;
319int xfs_lock_delays;
320#endif
321
330/* 322/*
331 * Make sure that the extents in the given memory buffer 323 * Bump the subclass so xfs_lock_inodes() acquires each lock with
332 * are valid. 324 * a different value
333 */ 325 */
334STATIC void 326static inline int
335xfs_validate_extents( 327xfs_lock_inumorder(int lock_mode, int subclass)
336 xfs_ifork_t *ifp,
337 int nrecs,
338 xfs_exntfmt_t fmt)
339{ 328{
340 xfs_bmbt_irec_t irec; 329 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
341 xfs_bmbt_rec_host_t rec; 330 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
342 int i; 331 if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
332 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
343 333
344 for (i = 0; i < nrecs; i++) { 334 return lock_mode;
345 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
346 rec.l0 = get_unaligned(&ep->l0);
347 rec.l1 = get_unaligned(&ep->l1);
348 xfs_bmbt_get_all(&rec, &irec);
349 if (fmt == XFS_EXTFMT_NOSTATE)
350 ASSERT(irec.br_state == XFS_EXT_NORM);
351 }
352} 335}
353#else /* DEBUG */
354#define xfs_validate_extents(ifp, nrecs, fmt)
355#endif /* DEBUG */
356 336
357/* 337/*
358 * Check that none of the inode's in the buffer have a next 338 * The following routine will lock n inodes in exclusive mode.
359 * unlinked field of 0. 339 * We assume the caller calls us with the inodes in i_ino order.
340 *
341 * We need to detect deadlock where an inode that we lock
342 * is in the AIL and we start waiting for another inode that is locked
343 * by a thread in a long running transaction (such as truncate). This can
344 * result in deadlock since the long running trans might need to wait
345 * for the inode we just locked in order to push the tail and free space
346 * in the log.
360 */ 347 */
361#if defined(DEBUG)
362void 348void
363xfs_inobp_check( 349xfs_lock_inodes(
364 xfs_mount_t *mp, 350 xfs_inode_t **ips,
365 xfs_buf_t *bp) 351 int inodes,
352 uint lock_mode)
366{ 353{
367 int i; 354 int attempts = 0, i, j, try_lock;
368 int j; 355 xfs_log_item_t *lp;
369 xfs_dinode_t *dip;
370 356
371 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; 357 ASSERT(ips && (inodes >= 2)); /* we need at least two */
372 358
373 for (i = 0; i < j; i++) { 359 try_lock = 0;
374 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 360 i = 0;
375 i * mp->m_sb.sb_inodesize);
376 if (!dip->di_next_unlinked) {
377 xfs_alert(mp,
378 "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
379 bp);
380 ASSERT(dip->di_next_unlinked);
381 }
382 }
383}
384#endif
385 361
386static void 362again:
387xfs_inode_buf_verify( 363 for (; i < inodes; i++) {
388 struct xfs_buf *bp) 364 ASSERT(ips[i]);
389{
390 struct xfs_mount *mp = bp->b_target->bt_mount;
391 int i;
392 int ni;
393
394 /*
395 * Validate the magic number and version of every inode in the buffer
396 */
397 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
398 for (i = 0; i < ni; i++) {
399 int di_ok;
400 xfs_dinode_t *dip;
401
402 dip = (struct xfs_dinode *)xfs_buf_offset(bp,
403 (i << mp->m_sb.sb_inodelog));
404 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
405 XFS_DINODE_GOOD_VERSION(dip->di_version);
406 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
407 XFS_ERRTAG_ITOBP_INOTOBP,
408 XFS_RANDOM_ITOBP_INOTOBP))) {
409 xfs_buf_ioerror(bp, EFSCORRUPTED);
410 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
411 mp, dip);
412#ifdef DEBUG
413 xfs_emerg(mp,
414 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
415 (unsigned long long)bp->b_bn, i,
416 be16_to_cpu(dip->di_magic));
417 ASSERT(0);
418#endif
419 }
420 }
421 xfs_inobp_check(mp, bp);
422}
423
424
425static void
426xfs_inode_buf_read_verify(
427 struct xfs_buf *bp)
428{
429 xfs_inode_buf_verify(bp);
430}
431
432static void
433xfs_inode_buf_write_verify(
434 struct xfs_buf *bp)
435{
436 xfs_inode_buf_verify(bp);
437}
438
439const struct xfs_buf_ops xfs_inode_buf_ops = {
440 .verify_read = xfs_inode_buf_read_verify,
441 .verify_write = xfs_inode_buf_write_verify,
442};
443 365
366 if (i && (ips[i] == ips[i-1])) /* Already locked */
367 continue;
444 368
445/* 369 /*
446 * This routine is called to map an inode to the buffer containing the on-disk 370 * If try_lock is not set yet, make sure all locked inodes
447 * version of the inode. It returns a pointer to the buffer containing the 371 * are not in the AIL.
448 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a 372 * If any are, set try_lock to be used later.
449 * pointer to the on-disk inode within that buffer. 373 */
450 *
451 * If a non-zero error is returned, then the contents of bpp and dipp are
452 * undefined.
453 */
454int
455xfs_imap_to_bp(
456 struct xfs_mount *mp,
457 struct xfs_trans *tp,
458 struct xfs_imap *imap,
459 struct xfs_dinode **dipp,
460 struct xfs_buf **bpp,
461 uint buf_flags,
462 uint iget_flags)
463{
464 struct xfs_buf *bp;
465 int error;
466 374
467 buf_flags |= XBF_UNMAPPED; 375 if (!try_lock) {
468 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 376 for (j = (i - 1); j >= 0 && !try_lock; j--) {
469 (int)imap->im_len, buf_flags, &bp, 377 lp = (xfs_log_item_t *)ips[j]->i_itemp;
470 &xfs_inode_buf_ops); 378 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
471 if (error) { 379 try_lock++;
472 if (error == EAGAIN) { 380 }
473 ASSERT(buf_flags & XBF_TRYLOCK); 381 }
474 return error;
475 } 382 }
476 383
477 if (error == EFSCORRUPTED && 384 /*
478 (iget_flags & XFS_IGET_UNTRUSTED)) 385 * If any of the previous locks we have locked is in the AIL,
479 return XFS_ERROR(EINVAL); 386 * we must TRY to get the second and subsequent locks. If
480 387 * we can't get any, we must release all we have
481 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", 388 * and try again.
482 __func__, error); 389 */
483 return error;
484 }
485
486 *bpp = bp;
487 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
488 return 0;
489}
490
491/*
492 * Move inode type and inode format specific information from the
493 * on-disk inode to the in-core inode. For fifos, devs, and sockets
494 * this means set if_rdev to the proper value. For files, directories,
495 * and symlinks this means to bring in the in-line data or extent
496 * pointers. For a file in B-tree format, only the root is immediately
497 * brought in-core. The rest will be in-lined in if_extents when it
498 * is first referenced (see xfs_iread_extents()).
499 */
500STATIC int
501xfs_iformat(
502 xfs_inode_t *ip,
503 xfs_dinode_t *dip)
504{
505 xfs_attr_shortform_t *atp;
506 int size;
507 int error = 0;
508 xfs_fsize_t di_size;
509
510 if (unlikely(be32_to_cpu(dip->di_nextents) +
511 be16_to_cpu(dip->di_anextents) >
512 be64_to_cpu(dip->di_nblocks))) {
513 xfs_warn(ip->i_mount,
514 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
515 (unsigned long long)ip->i_ino,
516 (int)(be32_to_cpu(dip->di_nextents) +
517 be16_to_cpu(dip->di_anextents)),
518 (unsigned long long)
519 be64_to_cpu(dip->di_nblocks));
520 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
521 ip->i_mount, dip);
522 return XFS_ERROR(EFSCORRUPTED);
523 }
524
525 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
526 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
527 (unsigned long long)ip->i_ino,
528 dip->di_forkoff);
529 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
530 ip->i_mount, dip);
531 return XFS_ERROR(EFSCORRUPTED);
532 }
533
534 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
535 !ip->i_mount->m_rtdev_targp)) {
536 xfs_warn(ip->i_mount,
537 "corrupt dinode %Lu, has realtime flag set.",
538 ip->i_ino);
539 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
540 XFS_ERRLEVEL_LOW, ip->i_mount, dip);
541 return XFS_ERROR(EFSCORRUPTED);
542 }
543
544 switch (ip->i_d.di_mode & S_IFMT) {
545 case S_IFIFO:
546 case S_IFCHR:
547 case S_IFBLK:
548 case S_IFSOCK:
549 if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
550 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
551 ip->i_mount, dip);
552 return XFS_ERROR(EFSCORRUPTED);
553 }
554 ip->i_d.di_size = 0;
555 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
556 break;
557 390
558 case S_IFREG: 391 if (try_lock) {
559 case S_IFLNK: 392 /* try_lock must be 0 if i is 0. */
560 case S_IFDIR:
561 switch (dip->di_format) {
562 case XFS_DINODE_FMT_LOCAL:
563 /* 393 /*
564 * no local regular files yet 394 * try_lock means we have an inode locked
395 * that is in the AIL.
565 */ 396 */
566 if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { 397 ASSERT(i != 0);
567 xfs_warn(ip->i_mount, 398 if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
568 "corrupt inode %Lu (local format for regular file).", 399 attempts++;
569 (unsigned long long) ip->i_ino); 400
570 XFS_CORRUPTION_ERROR("xfs_iformat(4)", 401 /*
571 XFS_ERRLEVEL_LOW, 402 * Unlock all previous guys and try again.
572 ip->i_mount, dip); 403 * xfs_iunlock will try to push the tail
573 return XFS_ERROR(EFSCORRUPTED); 404 * if the inode is in the AIL.
574 } 405 */
406
407 for(j = i - 1; j >= 0; j--) {
408
409 /*
410 * Check to see if we've already
411 * unlocked this one.
412 * Not the first one going back,
413 * and the inode ptr is the same.
414 */
415 if ((j != (i - 1)) && ips[j] ==
416 ips[j+1])
417 continue;
418
419 xfs_iunlock(ips[j], lock_mode);
420 }
575 421
576 di_size = be64_to_cpu(dip->di_size); 422 if ((attempts % 5) == 0) {
577 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { 423 delay(1); /* Don't just spin the CPU */
578 xfs_warn(ip->i_mount, 424#ifdef DEBUG
579 "corrupt inode %Lu (bad size %Ld for local inode).", 425 xfs_lock_delays++;
580 (unsigned long long) ip->i_ino, 426#endif
581 (long long) di_size); 427 }
582 XFS_CORRUPTION_ERROR("xfs_iformat(5)", 428 i = 0;
583 XFS_ERRLEVEL_LOW, 429 try_lock = 0;
584 ip->i_mount, dip); 430 goto again;
585 return XFS_ERROR(EFSCORRUPTED);
586 } 431 }
587 432 } else {
588 size = (int)di_size; 433 xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
589 error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
590 break;
591 case XFS_DINODE_FMT_EXTENTS:
592 error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
593 break;
594 case XFS_DINODE_FMT_BTREE:
595 error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
596 break;
597 default:
598 XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
599 ip->i_mount);
600 return XFS_ERROR(EFSCORRUPTED);
601 } 434 }
602 break;
603
604 default:
605 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
606 return XFS_ERROR(EFSCORRUPTED);
607 }
608 if (error) {
609 return error;
610 } 435 }
611 if (!XFS_DFORK_Q(dip))
612 return 0;
613
614 ASSERT(ip->i_afp == NULL);
615 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
616
617 switch (dip->di_aformat) {
618 case XFS_DINODE_FMT_LOCAL:
619 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
620 size = be16_to_cpu(atp->hdr.totsize);
621
622 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
623 xfs_warn(ip->i_mount,
624 "corrupt inode %Lu (bad attr fork size %Ld).",
625 (unsigned long long) ip->i_ino,
626 (long long) size);
627 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
628 XFS_ERRLEVEL_LOW,
629 ip->i_mount, dip);
630 return XFS_ERROR(EFSCORRUPTED);
631 }
632 436
633 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); 437#ifdef DEBUG
634 break; 438 if (attempts) {
635 case XFS_DINODE_FMT_EXTENTS: 439 if (attempts < 5) xfs_small_retries++;
636 error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); 440 else if (attempts < 100) xfs_middle_retries++;
637 break; 441 else xfs_lots_retries++;
638 case XFS_DINODE_FMT_BTREE: 442 } else {
639 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); 443 xfs_locked_n++;
640 break;
641 default:
642 error = XFS_ERROR(EFSCORRUPTED);
643 break;
644 }
645 if (error) {
646 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
647 ip->i_afp = NULL;
648 xfs_idestroy_fork(ip, XFS_DATA_FORK);
649 } 444 }
650 return error; 445#endif
651} 446}
652 447
653/* 448/*
654 * The file is in-lined in the on-disk inode. 449 * xfs_lock_two_inodes() can only be used to lock one type of lock
655 * If it fits into if_inline_data, then copy 450 * at a time - the iolock or the ilock, but not both at once. If
656 * it there, otherwise allocate a buffer for it 451 * we lock both at once, lockdep will report false positives saying
657 * and copy the data there. Either way, set 452 * we have violated locking orders.
658 * if_data to point at the data.
659 * If we allocate a buffer for the data, make
660 * sure that its size is a multiple of 4 and
661 * record the real size in i_real_bytes.
662 */ 453 */
663STATIC int 454void
664xfs_iformat_local( 455xfs_lock_two_inodes(
665 xfs_inode_t *ip, 456 xfs_inode_t *ip0,
666 xfs_dinode_t *dip, 457 xfs_inode_t *ip1,
667 int whichfork, 458 uint lock_mode)
668 int size)
669{ 459{
670 xfs_ifork_t *ifp; 460 xfs_inode_t *temp;
671 int real_size; 461 int attempts = 0;
672 462 xfs_log_item_t *lp;
673 /*
674 * If the size is unreasonable, then something
675 * is wrong and we just bail out rather than crash in
676 * kmem_alloc() or memcpy() below.
677 */
678 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
679 xfs_warn(ip->i_mount,
680 "corrupt inode %Lu (bad size %d for local fork, size = %d).",
681 (unsigned long long) ip->i_ino, size,
682 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
683 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
684 ip->i_mount, dip);
685 return XFS_ERROR(EFSCORRUPTED);
686 }
687 ifp = XFS_IFORK_PTR(ip, whichfork);
688 real_size = 0;
689 if (size == 0)
690 ifp->if_u1.if_data = NULL;
691 else if (size <= sizeof(ifp->if_u2.if_inline_data))
692 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
693 else {
694 real_size = roundup(size, 4);
695 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
696 }
697 ifp->if_bytes = size;
698 ifp->if_real_bytes = real_size;
699 if (size)
700 memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
701 ifp->if_flags &= ~XFS_IFEXTENTS;
702 ifp->if_flags |= XFS_IFINLINE;
703 return 0;
704}
705 463
706/* 464 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
707 * The file consists of a set of extents all 465 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
708 * of which fit into the on-disk inode. 466 ASSERT(ip0->i_ino != ip1->i_ino);
709 * If there are few enough extents to fit into
710 * the if_inline_ext, then copy them there.
711 * Otherwise allocate a buffer for them and copy
712 * them into it. Either way, set if_extents
713 * to point at the extents.
714 */
715STATIC int
716xfs_iformat_extents(
717 xfs_inode_t *ip,
718 xfs_dinode_t *dip,
719 int whichfork)
720{
721 xfs_bmbt_rec_t *dp;
722 xfs_ifork_t *ifp;
723 int nex;
724 int size;
725 int i;
726
727 ifp = XFS_IFORK_PTR(ip, whichfork);
728 nex = XFS_DFORK_NEXTENTS(dip, whichfork);
729 size = nex * (uint)sizeof(xfs_bmbt_rec_t);
730
731 /*
732 * If the number of extents is unreasonable, then something
733 * is wrong and we just bail out rather than crash in
734 * kmem_alloc() or memcpy() below.
735 */
736 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
737 xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
738 (unsigned long long) ip->i_ino, nex);
739 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
740 ip->i_mount, dip);
741 return XFS_ERROR(EFSCORRUPTED);
742 }
743
744 ifp->if_real_bytes = 0;
745 if (nex == 0)
746 ifp->if_u1.if_extents = NULL;
747 else if (nex <= XFS_INLINE_EXTS)
748 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
749 else
750 xfs_iext_add(ifp, 0, nex);
751
752 ifp->if_bytes = size;
753 if (size) {
754 dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
755 xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
756 for (i = 0; i < nex; i++, dp++) {
757 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
758 ep->l0 = get_unaligned_be64(&dp->l0);
759 ep->l1 = get_unaligned_be64(&dp->l1);
760 }
761 XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
762 if (whichfork != XFS_DATA_FORK ||
763 XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
764 if (unlikely(xfs_check_nostate_extents(
765 ifp, 0, nex))) {
766 XFS_ERROR_REPORT("xfs_iformat_extents(2)",
767 XFS_ERRLEVEL_LOW,
768 ip->i_mount);
769 return XFS_ERROR(EFSCORRUPTED);
770 }
771 }
772 ifp->if_flags |= XFS_IFEXTENTS;
773 return 0;
774}
775 467
776/* 468 if (ip0->i_ino > ip1->i_ino) {
777 * The file has too many extents to fit into 469 temp = ip0;
778 * the inode, so they are in B-tree format. 470 ip0 = ip1;
779 * Allocate a buffer for the root of the B-tree 471 ip1 = temp;
780 * and copy the root into it. The i_extents 472 }
781 * field will remain NULL until all of the
782 * extents are read in (when they are needed).
783 */
784STATIC int
785xfs_iformat_btree(
786 xfs_inode_t *ip,
787 xfs_dinode_t *dip,
788 int whichfork)
789{
790 struct xfs_mount *mp = ip->i_mount;
791 xfs_bmdr_block_t *dfp;
792 xfs_ifork_t *ifp;
793 /* REFERENCED */
794 int nrecs;
795 int size;
796
797 ifp = XFS_IFORK_PTR(ip, whichfork);
798 dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
799 size = XFS_BMAP_BROOT_SPACE(mp, dfp);
800 nrecs = be16_to_cpu(dfp->bb_numrecs);
801
802 /*
803 * blow out if -- fork has less extents than can fit in
804 * fork (fork shouldn't be a btree format), root btree
805 * block has more records than can fit into the fork,
806 * or the number of extents is greater than the number of
807 * blocks.
808 */
809 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
810 XFS_IFORK_MAXEXT(ip, whichfork) ||
811 XFS_BMDR_SPACE_CALC(nrecs) >
812 XFS_DFORK_SIZE(dip, mp, whichfork) ||
813 XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
814 xfs_warn(mp, "corrupt inode %Lu (btree).",
815 (unsigned long long) ip->i_ino);
816 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
817 mp, dip);
818 return XFS_ERROR(EFSCORRUPTED);
819 }
820
821 ifp->if_broot_bytes = size;
822 ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
823 ASSERT(ifp->if_broot != NULL);
824 /*
825 * Copy and convert from the on-disk structure
826 * to the in-memory structure.
827 */
828 xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
829 ifp->if_broot, size);
830 ifp->if_flags &= ~XFS_IFEXTENTS;
831 ifp->if_flags |= XFS_IFBROOT;
832 473
833 return 0; 474 again:
834} 475 xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
835 476
836STATIC void 477 /*
837xfs_dinode_from_disk( 478 * If the first lock we have locked is in the AIL, we must TRY to get
838 xfs_icdinode_t *to, 479 * the second lock. If we can't get it, we must release the first one
839 xfs_dinode_t *from) 480 * and try again.
840{ 481 */
841 to->di_magic = be16_to_cpu(from->di_magic); 482 lp = (xfs_log_item_t *)ip0->i_itemp;
842 to->di_mode = be16_to_cpu(from->di_mode); 483 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
843 to->di_version = from ->di_version; 484 if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
844 to->di_format = from->di_format; 485 xfs_iunlock(ip0, lock_mode);
845 to->di_onlink = be16_to_cpu(from->di_onlink); 486 if ((++attempts % 5) == 0)
846 to->di_uid = be32_to_cpu(from->di_uid); 487 delay(1); /* Don't just spin the CPU */
847 to->di_gid = be32_to_cpu(from->di_gid); 488 goto again;
848 to->di_nlink = be32_to_cpu(from->di_nlink); 489 }
849 to->di_projid_lo = be16_to_cpu(from->di_projid_lo); 490 } else {
850 to->di_projid_hi = be16_to_cpu(from->di_projid_hi); 491 xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
851 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
852 to->di_flushiter = be16_to_cpu(from->di_flushiter);
853 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
854 to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
855 to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
856 to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
857 to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
858 to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
859 to->di_size = be64_to_cpu(from->di_size);
860 to->di_nblocks = be64_to_cpu(from->di_nblocks);
861 to->di_extsize = be32_to_cpu(from->di_extsize);
862 to->di_nextents = be32_to_cpu(from->di_nextents);
863 to->di_anextents = be16_to_cpu(from->di_anextents);
864 to->di_forkoff = from->di_forkoff;
865 to->di_aformat = from->di_aformat;
866 to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
867 to->di_dmstate = be16_to_cpu(from->di_dmstate);
868 to->di_flags = be16_to_cpu(from->di_flags);
869 to->di_gen = be32_to_cpu(from->di_gen);
870
871 if (to->di_version == 3) {
872 to->di_changecount = be64_to_cpu(from->di_changecount);
873 to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
874 to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
875 to->di_flags2 = be64_to_cpu(from->di_flags2);
876 to->di_ino = be64_to_cpu(from->di_ino);
877 to->di_lsn = be64_to_cpu(from->di_lsn);
878 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
879 uuid_copy(&to->di_uuid, &from->di_uuid);
880 } 492 }
881} 493}
882 494
495
883void 496void
884xfs_dinode_to_disk( 497__xfs_iflock(
885 xfs_dinode_t *to, 498 struct xfs_inode *ip)
886 xfs_icdinode_t *from)
887{ 499{
888 to->di_magic = cpu_to_be16(from->di_magic); 500 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
889 to->di_mode = cpu_to_be16(from->di_mode); 501 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
890 to->di_version = from ->di_version; 502
891 to->di_format = from->di_format; 503 do {
892 to->di_onlink = cpu_to_be16(from->di_onlink); 504 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
893 to->di_uid = cpu_to_be32(from->di_uid); 505 if (xfs_isiflocked(ip))
894 to->di_gid = cpu_to_be32(from->di_gid); 506 io_schedule();
895 to->di_nlink = cpu_to_be32(from->di_nlink); 507 } while (!xfs_iflock_nowait(ip));
896 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 508
897 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 509 finish_wait(wq, &wait.wait);
898 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
899 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
900 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
901 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
902 to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
903 to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
904 to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
905 to->di_size = cpu_to_be64(from->di_size);
906 to->di_nblocks = cpu_to_be64(from->di_nblocks);
907 to->di_extsize = cpu_to_be32(from->di_extsize);
908 to->di_nextents = cpu_to_be32(from->di_nextents);
909 to->di_anextents = cpu_to_be16(from->di_anextents);
910 to->di_forkoff = from->di_forkoff;
911 to->di_aformat = from->di_aformat;
912 to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
913 to->di_dmstate = cpu_to_be16(from->di_dmstate);
914 to->di_flags = cpu_to_be16(from->di_flags);
915 to->di_gen = cpu_to_be32(from->di_gen);
916
917 if (from->di_version == 3) {
918 to->di_changecount = cpu_to_be64(from->di_changecount);
919 to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
920 to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
921 to->di_flags2 = cpu_to_be64(from->di_flags2);
922 to->di_ino = cpu_to_be64(from->di_ino);
923 to->di_lsn = cpu_to_be64(from->di_lsn);
924 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
925 uuid_copy(&to->di_uuid, &from->di_uuid);
926 to->di_flushiter = 0;
927 } else {
928 to->di_flushiter = cpu_to_be16(from->di_flushiter);
929 }
930} 510}
931 511
932STATIC uint 512STATIC uint
@@ -987,235 +567,50 @@ xfs_dic2xflags(
987 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); 567 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
988} 568}
989 569
990static bool
991xfs_dinode_verify(
992 struct xfs_mount *mp,
993 struct xfs_inode *ip,
994 struct xfs_dinode *dip)
995{
996 if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
997 return false;
998
999 /* only version 3 or greater inodes are extensively verified here */
1000 if (dip->di_version < 3)
1001 return true;
1002
1003 if (!xfs_sb_version_hascrc(&mp->m_sb))
1004 return false;
1005 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
1006 offsetof(struct xfs_dinode, di_crc)))
1007 return false;
1008 if (be64_to_cpu(dip->di_ino) != ip->i_ino)
1009 return false;
1010 if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
1011 return false;
1012 return true;
1013}
1014
1015void
1016xfs_dinode_calc_crc(
1017 struct xfs_mount *mp,
1018 struct xfs_dinode *dip)
1019{
1020 __uint32_t crc;
1021
1022 if (dip->di_version < 3)
1023 return;
1024
1025 ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
1026 crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
1027 offsetof(struct xfs_dinode, di_crc));
1028 dip->di_crc = xfs_end_cksum(crc);
1029}
1030
1031/* 570/*
1032 * Read the disk inode attributes into the in-core inode structure. 571 * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
1033 * 572 * is allowed, otherwise it has to be an exact match. If a CI match is found,
1034 * For version 5 superblocks, if we are initialising a new inode and we are not 573 * ci_name->name will point to a the actual name (caller must free) or
1035 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new 574 * will be set to NULL if an exact match is found.
1036 * inode core with a random generation number. If we are keeping inodes around,
1037 * we need to read the inode cluster to get the existing generation number off
1038 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
1039 * format) then log recovery is dependent on the di_flushiter field being
1040 * initialised from the current on-disk value and hence we must also read the
1041 * inode off disk.
1042 */ 575 */
1043int 576int
1044xfs_iread( 577xfs_lookup(
1045 xfs_mount_t *mp, 578 xfs_inode_t *dp,
1046 xfs_trans_t *tp, 579 struct xfs_name *name,
1047 xfs_inode_t *ip, 580 xfs_inode_t **ipp,
1048 uint iget_flags) 581 struct xfs_name *ci_name)
1049{ 582{
1050 xfs_buf_t *bp; 583 xfs_ino_t inum;
1051 xfs_dinode_t *dip; 584 int error;
1052 int error; 585 uint lock_mode;
1053
1054 /*
1055 * Fill in the location information in the in-core inode.
1056 */
1057 error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
1058 if (error)
1059 return error;
1060
1061 /* shortcut IO on inode allocation if possible */
1062 if ((iget_flags & XFS_IGET_CREATE) &&
1063 xfs_sb_version_hascrc(&mp->m_sb) &&
1064 !(mp->m_flags & XFS_MOUNT_IKEEP)) {
1065 /* initialise the on-disk inode core */
1066 memset(&ip->i_d, 0, sizeof(ip->i_d));
1067 ip->i_d.di_magic = XFS_DINODE_MAGIC;
1068 ip->i_d.di_gen = prandom_u32();
1069 if (xfs_sb_version_hascrc(&mp->m_sb)) {
1070 ip->i_d.di_version = 3;
1071 ip->i_d.di_ino = ip->i_ino;
1072 uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
1073 } else
1074 ip->i_d.di_version = 2;
1075 return 0;
1076 }
1077
1078 /*
1079 * Get pointers to the on-disk inode and the buffer containing it.
1080 */
1081 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
1082 if (error)
1083 return error;
1084 586
1085 /* even unallocated inodes are verified */ 587 trace_xfs_lookup(dp, name);
1086 if (!xfs_dinode_verify(mp, ip, dip)) {
1087 xfs_alert(mp, "%s: validation failed for inode %lld failed",
1088 __func__, ip->i_ino);
1089 588
1090 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); 589 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1091 error = XFS_ERROR(EFSCORRUPTED); 590 return XFS_ERROR(EIO);
1092 goto out_brelse;
1093 }
1094 591
1095 /* 592 lock_mode = xfs_ilock_map_shared(dp);
1096 * If the on-disk inode is already linked to a directory 593 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
1097 * entry, copy all of the inode into the in-core inode. 594 xfs_iunlock_map_shared(dp, lock_mode);
1098 * xfs_iformat() handles copying in the inode format
1099 * specific information.
1100 * Otherwise, just get the truly permanent information.
1101 */
1102 if (dip->di_mode) {
1103 xfs_dinode_from_disk(&ip->i_d, dip);
1104 error = xfs_iformat(ip, dip);
1105 if (error) {
1106#ifdef DEBUG
1107 xfs_alert(mp, "%s: xfs_iformat() returned error %d",
1108 __func__, error);
1109#endif /* DEBUG */
1110 goto out_brelse;
1111 }
1112 } else {
1113 /*
1114 * Partial initialisation of the in-core inode. Just the bits
1115 * that xfs_ialloc won't overwrite or relies on being correct.
1116 */
1117 ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
1118 ip->i_d.di_version = dip->di_version;
1119 ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
1120 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
1121
1122 if (dip->di_version == 3) {
1123 ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
1124 uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
1125 }
1126 595
1127 /* 596 if (error)
1128 * Make sure to pull in the mode here as well in 597 goto out;
1129 * case the inode is released without being used.
1130 * This ensures that xfs_inactive() will see that
1131 * the inode is already free and not try to mess
1132 * with the uninitialized part of it.
1133 */
1134 ip->i_d.di_mode = 0;
1135 }
1136
1137 /*
1138 * The inode format changed when we moved the link count and
1139 * made it 32 bits long. If this is an old format inode,
1140 * convert it in memory to look like a new one. If it gets
1141 * flushed to disk we will convert back before flushing or
1142 * logging it. We zero out the new projid field and the old link
1143 * count field. We'll handle clearing the pad field (the remains
1144 * of the old uuid field) when we actually convert the inode to
1145 * the new format. We don't change the version number so that we
1146 * can distinguish this from a real new format inode.
1147 */
1148 if (ip->i_d.di_version == 1) {
1149 ip->i_d.di_nlink = ip->i_d.di_onlink;
1150 ip->i_d.di_onlink = 0;
1151 xfs_set_projid(ip, 0);
1152 }
1153 598
1154 ip->i_delayed_blks = 0; 599 error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
600 if (error)
601 goto out_free_name;
1155 602
1156 /* 603 return 0;
1157 * Mark the buffer containing the inode as something to keep
1158 * around for a while. This helps to keep recently accessed
1159 * meta-data in-core longer.
1160 */
1161 xfs_buf_set_ref(bp, XFS_INO_REF);
1162 604
1163 /* 605out_free_name:
1164 * Use xfs_trans_brelse() to release the buffer containing the on-disk 606 if (ci_name)
1165 * inode, because it was acquired with xfs_trans_read_buf() in 607 kmem_free(ci_name->name);
1166 * xfs_imap_to_bp() above. If tp is NULL, this is just a normal 608out:
1167 * brelse(). If we're within a transaction, then xfs_trans_brelse() 609 *ipp = NULL;
1168 * will only release the buffer if it is not dirty within the
1169 * transaction. It will be OK to release the buffer in this case,
1170 * because inodes on disk are never destroyed and we will be locking the
1171 * new in-core inode before putting it in the cache where other
1172 * processes can find it. Thus we don't have to worry about the inode
1173 * being changed just because we released the buffer.
1174 */
1175 out_brelse:
1176 xfs_trans_brelse(tp, bp);
1177 return error; 610 return error;
1178} 611}
1179 612
1180/* 613/*
1181 * Read in extents from a btree-format inode.
1182 * Allocate and fill in if_extents. Real work is done in xfs_bmap.c.
1183 */
1184int
1185xfs_iread_extents(
1186 xfs_trans_t *tp,
1187 xfs_inode_t *ip,
1188 int whichfork)
1189{
1190 int error;
1191 xfs_ifork_t *ifp;
1192 xfs_extnum_t nextents;
1193
1194 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
1195 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
1196 ip->i_mount);
1197 return XFS_ERROR(EFSCORRUPTED);
1198 }
1199 nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
1200 ifp = XFS_IFORK_PTR(ip, whichfork);
1201
1202 /*
1203 * We know that the size is valid (it's checked in iformat_btree)
1204 */
1205 ifp->if_bytes = ifp->if_real_bytes = 0;
1206 ifp->if_flags |= XFS_IFEXTENTS;
1207 xfs_iext_add(ifp, 0, nextents);
1208 error = xfs_bmap_read_extents(tp, ip, whichfork);
1209 if (error) {
1210 xfs_iext_destroy(ifp);
1211 ifp->if_flags &= ~XFS_IFEXTENTS;
1212 return error;
1213 }
1214 xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
1215 return 0;
1216}
1217
1218/*
1219 * Allocate an inode on disk and return a copy of its in-core version. 614 * Allocate an inode on disk and return a copy of its in-core version.
1220 * The in-core inode is locked exclusively. Set mode, nlink, and rdev 615 * The in-core inode is locked exclusively. Set mode, nlink, and rdev
1221 * appropriately within the inode. The uid and gid for the inode are 616 * appropriately within the inode. The uid and gid for the inode are
@@ -1295,8 +690,8 @@ xfs_ialloc(
1295 ip->i_d.di_onlink = 0; 690 ip->i_d.di_onlink = 0;
1296 ip->i_d.di_nlink = nlink; 691 ip->i_d.di_nlink = nlink;
1297 ASSERT(ip->i_d.di_nlink == nlink); 692 ASSERT(ip->i_d.di_nlink == nlink);
1298 ip->i_d.di_uid = current_fsuid(); 693 ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
1299 ip->i_d.di_gid = current_fsgid(); 694 ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
1300 xfs_set_projid(ip, prid); 695 xfs_set_projid(ip, prid);
1301 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); 696 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1302 697
@@ -1335,7 +730,7 @@ xfs_ialloc(
1335 */ 730 */
1336 if ((irix_sgid_inherit) && 731 if ((irix_sgid_inherit) &&
1337 (ip->i_d.di_mode & S_ISGID) && 732 (ip->i_d.di_mode & S_ISGID) &&
1338 (!in_group_p((gid_t)ip->i_d.di_gid))) { 733 (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) {
1339 ip->i_d.di_mode &= ~S_ISGID; 734 ip->i_d.di_mode &= ~S_ISGID;
1340 } 735 }
1341 736
@@ -1467,6 +862,583 @@ xfs_ialloc(
1467} 862}
1468 863
1469/* 864/*
865 * Allocates a new inode from disk and return a pointer to the
866 * incore copy. This routine will internally commit the current
867 * transaction and allocate a new one if the Space Manager needed
868 * to do an allocation to replenish the inode free-list.
869 *
870 * This routine is designed to be called from xfs_create and
871 * xfs_create_dir.
872 *
873 */
874int
875xfs_dir_ialloc(
876 xfs_trans_t **tpp, /* input: current transaction;
877 output: may be a new transaction. */
878 xfs_inode_t *dp, /* directory within whose allocate
879 the inode. */
880 umode_t mode,
881 xfs_nlink_t nlink,
882 xfs_dev_t rdev,
883 prid_t prid, /* project id */
884 int okalloc, /* ok to allocate new space */
885 xfs_inode_t **ipp, /* pointer to inode; it will be
886 locked. */
887 int *committed)
888
889{
890 xfs_trans_t *tp;
891 xfs_trans_t *ntp;
892 xfs_inode_t *ip;
893 xfs_buf_t *ialloc_context = NULL;
894 int code;
895 void *dqinfo;
896 uint tflags;
897
898 tp = *tpp;
899 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
900
901 /*
902 * xfs_ialloc will return a pointer to an incore inode if
903 * the Space Manager has an available inode on the free
904 * list. Otherwise, it will do an allocation and replenish
905 * the freelist. Since we can only do one allocation per
906 * transaction without deadlocks, we will need to commit the
907 * current transaction and start a new one. We will then
908 * need to call xfs_ialloc again to get the inode.
909 *
910 * If xfs_ialloc did an allocation to replenish the freelist,
911 * it returns the bp containing the head of the freelist as
912 * ialloc_context. We will hold a lock on it across the
913 * transaction commit so that no other process can steal
914 * the inode(s) that we've just allocated.
915 */
916 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
917 &ialloc_context, &ip);
918
919 /*
920 * Return an error if we were unable to allocate a new inode.
921 * This should only happen if we run out of space on disk or
922 * encounter a disk error.
923 */
924 if (code) {
925 *ipp = NULL;
926 return code;
927 }
928 if (!ialloc_context && !ip) {
929 *ipp = NULL;
930 return XFS_ERROR(ENOSPC);
931 }
932
933 /*
934 * If the AGI buffer is non-NULL, then we were unable to get an
935 * inode in one operation. We need to commit the current
936 * transaction and call xfs_ialloc() again. It is guaranteed
937 * to succeed the second time.
938 */
939 if (ialloc_context) {
940 struct xfs_trans_res tres;
941
942 /*
943 * Normally, xfs_trans_commit releases all the locks.
944 * We call bhold to hang on to the ialloc_context across
945 * the commit. Holding this buffer prevents any other
946 * processes from doing any allocations in this
947 * allocation group.
948 */
949 xfs_trans_bhold(tp, ialloc_context);
950 /*
951 * Save the log reservation so we can use
952 * them in the next transaction.
953 */
954 tres.tr_logres = xfs_trans_get_log_res(tp);
955 tres.tr_logcount = xfs_trans_get_log_count(tp);
956
957 /*
958 * We want the quota changes to be associated with the next
959 * transaction, NOT this one. So, detach the dqinfo from this
960 * and attach it to the next transaction.
961 */
962 dqinfo = NULL;
963 tflags = 0;
964 if (tp->t_dqinfo) {
965 dqinfo = (void *)tp->t_dqinfo;
966 tp->t_dqinfo = NULL;
967 tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
968 tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
969 }
970
971 ntp = xfs_trans_dup(tp);
972 code = xfs_trans_commit(tp, 0);
973 tp = ntp;
974 if (committed != NULL) {
975 *committed = 1;
976 }
977 /*
978 * If we get an error during the commit processing,
979 * release the buffer that is still held and return
980 * to the caller.
981 */
982 if (code) {
983 xfs_buf_relse(ialloc_context);
984 if (dqinfo) {
985 tp->t_dqinfo = dqinfo;
986 xfs_trans_free_dqinfo(tp);
987 }
988 *tpp = ntp;
989 *ipp = NULL;
990 return code;
991 }
992
993 /*
994 * transaction commit worked ok so we can drop the extra ticket
995 * reference that we gained in xfs_trans_dup()
996 */
997 xfs_log_ticket_put(tp->t_ticket);
998 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
999 code = xfs_trans_reserve(tp, &tres, 0, 0);
1000
1001 /*
1002 * Re-attach the quota info that we detached from prev trx.
1003 */
1004 if (dqinfo) {
1005 tp->t_dqinfo = dqinfo;
1006 tp->t_flags |= tflags;
1007 }
1008
1009 if (code) {
1010 xfs_buf_relse(ialloc_context);
1011 *tpp = ntp;
1012 *ipp = NULL;
1013 return code;
1014 }
1015 xfs_trans_bjoin(tp, ialloc_context);
1016
1017 /*
1018 * Call ialloc again. Since we've locked out all
1019 * other allocations in this allocation group,
1020 * this call should always succeed.
1021 */
1022 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
1023 okalloc, &ialloc_context, &ip);
1024
1025 /*
1026 * If we get an error at this point, return to the caller
1027 * so that the current transaction can be aborted.
1028 */
1029 if (code) {
1030 *tpp = tp;
1031 *ipp = NULL;
1032 return code;
1033 }
1034 ASSERT(!ialloc_context && ip);
1035
1036 } else {
1037 if (committed != NULL)
1038 *committed = 0;
1039 }
1040
1041 *ipp = ip;
1042 *tpp = tp;
1043
1044 return 0;
1045}
1046
1047/*
1048 * Decrement the link count on an inode & log the change.
1049 * If this causes the link count to go to zero, initiate the
1050 * logging activity required to truncate a file.
1051 */
1052int /* error */
1053xfs_droplink(
1054 xfs_trans_t *tp,
1055 xfs_inode_t *ip)
1056{
1057 int error;
1058
1059 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1060
1061 ASSERT (ip->i_d.di_nlink > 0);
1062 ip->i_d.di_nlink--;
1063 drop_nlink(VFS_I(ip));
1064 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1065
1066 error = 0;
1067 if (ip->i_d.di_nlink == 0) {
1068 /*
1069 * We're dropping the last link to this file.
1070 * Move the on-disk inode to the AGI unlinked list.
1071 * From xfs_inactive() we will pull the inode from
1072 * the list and free it.
1073 */
1074 error = xfs_iunlink(tp, ip);
1075 }
1076 return error;
1077}
1078
1079/*
1080 * This gets called when the inode's version needs to be changed from 1 to 2.
1081 * Currently this happens when the nlink field overflows the old 16-bit value
1082 * or when chproj is called to change the project for the first time.
1083 * As a side effect the superblock version will also get rev'd
1084 * to contain the NLINK bit.
1085 */
1086void
1087xfs_bump_ino_vers2(
1088 xfs_trans_t *tp,
1089 xfs_inode_t *ip)
1090{
1091 xfs_mount_t *mp;
1092
1093 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1094 ASSERT(ip->i_d.di_version == 1);
1095
1096 ip->i_d.di_version = 2;
1097 ip->i_d.di_onlink = 0;
1098 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1099 mp = tp->t_mountp;
1100 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
1101 spin_lock(&mp->m_sb_lock);
1102 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
1103 xfs_sb_version_addnlink(&mp->m_sb);
1104 spin_unlock(&mp->m_sb_lock);
1105 xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
1106 } else {
1107 spin_unlock(&mp->m_sb_lock);
1108 }
1109 }
1110 /* Caller must log the inode */
1111}
1112
1113/*
1114 * Increment the link count on an inode & log the change.
1115 */
1116int
1117xfs_bumplink(
1118 xfs_trans_t *tp,
1119 xfs_inode_t *ip)
1120{
1121 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1122
1123 ASSERT(ip->i_d.di_nlink > 0);
1124 ip->i_d.di_nlink++;
1125 inc_nlink(VFS_I(ip));
1126 if ((ip->i_d.di_version == 1) &&
1127 (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
1128 /*
1129 * The inode has increased its number of links beyond
1130 * what can fit in an old format inode. It now needs
1131 * to be converted to a version 2 inode with a 32 bit
1132 * link count. If this is the first inode in the file
1133 * system to do this, then we need to bump the superblock
1134 * version number as well.
1135 */
1136 xfs_bump_ino_vers2(tp, ip);
1137 }
1138
1139 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1140 return 0;
1141}
1142
/*
 * Create a new regular file, directory, or special file named 'name'
 * in directory 'dp'.  On success the new, referenced and locked inode
 * is returned in *ipp; the caller is responsible for the final IRELE.
 * Returns 0 or a positive XFS errno.
 */
int
xfs_create(
	xfs_inode_t *dp,
	struct xfs_name *name,
	umode_t mode,
	xfs_dev_t rdev,
	xfs_inode_t **ipp)
{
	int is_dir = S_ISDIR(mode);
	struct xfs_mount *mp = dp->i_mount;
	struct xfs_inode *ip = NULL;
	struct xfs_trans *tp = NULL;
	int error;
	xfs_bmap_free_t free_list;
	xfs_fsblock_t first_block;
	bool unlock_dp_on_error = false;	/* dp locked but not yet joined to tp */
	uint cancel_flags;
	int committed;
	prid_t prid;
	struct xfs_dquot *udqp = NULL;
	struct xfs_dquot *gdqp = NULL;
	struct xfs_dquot *pdqp = NULL;
	struct xfs_trans_res tres;
	uint resblks;

	trace_xfs_create(dp, name);

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/* Inherit the parent's project ID when it is marked PROJINHERIT. */
	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
		prid = xfs_get_projid(dp);
	else
		prid = XFS_PROJID_DEFAULT;

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
					xfs_kgid_to_gid(current_fsgid()), prid,
					XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
					&udqp, &gdqp, &pdqp);
	if (error)
		return error;

	/* mkdir and create use different space/log reservations. */
	if (is_dir) {
		rdev = 0;
		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
		tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres;
		tres.tr_logcount = XFS_MKDIR_LOG_COUNT;
		tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
	} else {
		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
		tres.tr_logres = M_RES(mp)->tr_create.tr_logres;
		tres.tr_logcount = XFS_CREATE_LOG_COUNT;
		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
	}

	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;

	/*
	 * Initially assume that the file does not exist and
	 * reserve the resources for that case. If that is not
	 * the case we'll drop the one we have and get a more
	 * appropriate transaction later.
	 */
	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
	error = xfs_trans_reserve(tp, &tres, resblks, 0);
	if (error == ENOSPC) {
		/* flush outstanding delalloc blocks and retry */
		xfs_flush_inodes(mp);
		error = xfs_trans_reserve(tp, &tres, resblks, 0);
	}
	if (error == ENOSPC) {
		/* No space at all so try a "no-allocation" reservation */
		resblks = 0;
		error = xfs_trans_reserve(tp, &tres, 0, 0);
	}
	if (error) {
		/* Reservation failed: no log res held, so plain cancel. */
		cancel_flags = 0;
		goto out_trans_cancel;
	}

	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
	unlock_dp_on_error = true;

	xfs_bmap_init(&free_list, &first_block);

	/*
	 * Reserve disk quota and the inode.
	 */
	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
						pdqp, resblks, 1, 0);
	if (error)
		goto out_trans_cancel;

	error = xfs_dir_canenter(tp, dp, name, resblks);
	if (error)
		goto out_trans_cancel;

	/*
	 * A newly created regular or special file just has one directory
	 * entry pointing to them, but a directory also the "." entry
	 * pointing to itself.
	 */
	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
			       prid, resblks > 0, &ip, &committed);
	if (error) {
		if (error == ENOSPC)
			goto out_trans_cancel;
		goto out_trans_abort;
	}

	/*
	 * Now we join the directory inode to the transaction. We do not do it
	 * earlier because xfs_dir_ialloc might commit the previous transaction
	 * (and release all the locks). An error from here on will result in
	 * the transaction cancel unlocking dp so don't do it explicitly in the
	 * error path.
	 */
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	unlock_dp_on_error = false;

	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
					&first_block, &free_list, resblks ?
					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
	if (error) {
		ASSERT(error != ENOSPC);
		goto out_trans_abort;
	}
	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

	/* A new directory needs "." and ".." plus a parent link bump. */
	if (is_dir) {
		error = xfs_dir_init(tp, ip, dp);
		if (error)
			goto out_bmap_cancel;

		error = xfs_bumplink(tp, dp);
		if (error)
			goto out_bmap_cancel;
	}

	/*
	 * If this is a synchronous mount, make sure that the
	 * create transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
		xfs_trans_set_sync(tp);

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

	error = xfs_bmap_finish(&tp, &free_list, &committed);
	if (error)
		goto out_bmap_cancel;

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
	if (error)
		goto out_release_inode;

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	*ipp = ip;
	return 0;

 out_bmap_cancel:
	xfs_bmap_cancel(&free_list);
 out_trans_abort:
	cancel_flags |= XFS_TRANS_ABORT;
 out_trans_cancel:
	xfs_trans_cancel(tp, cancel_flags);
 out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to
	 * release the inode. This prevents recursive transactions
	 * and deadlocks from xfs_inactive.
	 */
	if (ip)
		IRELE(ip);

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	if (unlock_dp_on_error)
		xfs_iunlock(dp, XFS_ILOCK_EXCL);
	return error;
}
1339
1340int
1341xfs_link(
1342 xfs_inode_t *tdp,
1343 xfs_inode_t *sip,
1344 struct xfs_name *target_name)
1345{
1346 xfs_mount_t *mp = tdp->i_mount;
1347 xfs_trans_t *tp;
1348 int error;
1349 xfs_bmap_free_t free_list;
1350 xfs_fsblock_t first_block;
1351 int cancel_flags;
1352 int committed;
1353 int resblks;
1354
1355 trace_xfs_link(tdp, target_name);
1356
1357 ASSERT(!S_ISDIR(sip->i_d.di_mode));
1358
1359 if (XFS_FORCED_SHUTDOWN(mp))
1360 return XFS_ERROR(EIO);
1361
1362 error = xfs_qm_dqattach(sip, 0);
1363 if (error)
1364 goto std_return;
1365
1366 error = xfs_qm_dqattach(tdp, 0);
1367 if (error)
1368 goto std_return;
1369
1370 tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
1371 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1372 resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1373 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
1374 if (error == ENOSPC) {
1375 resblks = 0;
1376 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
1377 }
1378 if (error) {
1379 cancel_flags = 0;
1380 goto error_return;
1381 }
1382
1383 xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
1384
1385 xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1386 xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
1387
1388 /*
1389 * If we are using project inheritance, we only allow hard link
1390 * creation in our tree when the project IDs are the same; else
1391 * the tree quota mechanism could be circumvented.
1392 */
1393 if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1394 (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1395 error = XFS_ERROR(EXDEV);
1396 goto error_return;
1397 }
1398
1399 error = xfs_dir_canenter(tp, tdp, target_name, resblks);
1400 if (error)
1401 goto error_return;
1402
1403 xfs_bmap_init(&free_list, &first_block);
1404
1405 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1406 &first_block, &free_list, resblks);
1407 if (error)
1408 goto abort_return;
1409 xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1410 xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1411
1412 error = xfs_bumplink(tp, sip);
1413 if (error)
1414 goto abort_return;
1415
1416 /*
1417 * If this is a synchronous mount, make sure that the
1418 * link transaction goes to disk before returning to
1419 * the user.
1420 */
1421 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1422 xfs_trans_set_sync(tp);
1423 }
1424
1425 error = xfs_bmap_finish (&tp, &free_list, &committed);
1426 if (error) {
1427 xfs_bmap_cancel(&free_list);
1428 goto abort_return;
1429 }
1430
1431 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1432
1433 abort_return:
1434 cancel_flags |= XFS_TRANS_ABORT;
1435 error_return:
1436 xfs_trans_cancel(tp, cancel_flags);
1437 std_return:
1438 return error;
1439}
1440
1441/*
1470 * Free up the underlying blocks past new_size. The new size must be smaller 1442 * Free up the underlying blocks past new_size. The new size must be smaller
1471 * than the current size. This routine can be used both for the attribute and 1443 * than the current size. This routine can be used both for the attribute and
1472 * data fork, and does not modify the inode size, which is left to the caller. 1444 * data fork, and does not modify the inode size, which is left to the caller.
@@ -1576,10 +1548,7 @@ xfs_itruncate_extents(
1576 * reference that we gained in xfs_trans_dup() 1548 * reference that we gained in xfs_trans_dup()
1577 */ 1549 */
1578 xfs_log_ticket_put(tp->t_ticket); 1550 xfs_log_ticket_put(tp->t_ticket);
1579 error = xfs_trans_reserve(tp, 0, 1551 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
1580 XFS_ITRUNCATE_LOG_RES(mp), 0,
1581 XFS_TRANS_PERM_LOG_RES,
1582 XFS_ITRUNCATE_LOG_COUNT);
1583 if (error) 1552 if (error)
1584 goto out; 1553 goto out;
1585 } 1554 }
@@ -1605,6 +1574,271 @@ out_bmap_cancel:
1605 goto out; 1574 goto out;
1606} 1575}
1607 1576
1577int
1578xfs_release(
1579 xfs_inode_t *ip)
1580{
1581 xfs_mount_t *mp = ip->i_mount;
1582 int error;
1583
1584 if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
1585 return 0;
1586
1587 /* If this is a read-only mount, don't do this (would generate I/O) */
1588 if (mp->m_flags & XFS_MOUNT_RDONLY)
1589 return 0;
1590
1591 if (!XFS_FORCED_SHUTDOWN(mp)) {
1592 int truncated;
1593
1594 /*
1595 * If we are using filestreams, and we have an unlinked
1596 * file that we are processing the last close on, then nothing
1597 * will be able to reopen and write to this file. Purge this
1598 * inode from the filestreams cache so that it doesn't delay
1599 * teardown of the inode.
1600 */
1601 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1602 xfs_filestream_deassociate(ip);
1603
1604 /*
1605 * If we previously truncated this file and removed old data
1606 * in the process, we want to initiate "early" writeout on
1607 * the last close. This is an attempt to combat the notorious
1608 * NULL files problem which is particularly noticeable from a
1609 * truncate down, buffered (re-)write (delalloc), followed by
1610 * a crash. What we are effectively doing here is
1611 * significantly reducing the time window where we'd otherwise
1612 * be exposed to that problem.
1613 */
1614 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1615 if (truncated) {
1616 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1617 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
1618 error = -filemap_flush(VFS_I(ip)->i_mapping);
1619 if (error)
1620 return error;
1621 }
1622 }
1623 }
1624
1625 if (ip->i_d.di_nlink == 0)
1626 return 0;
1627
1628 if (xfs_can_free_eofblocks(ip, false)) {
1629
1630 /*
1631 * If we can't get the iolock just skip truncating the blocks
1632 * past EOF because we could deadlock with the mmap_sem
1633 * otherwise. We'll get another chance to drop them once the
1634 * last reference to the inode is dropped, so we'll never leak
1635 * blocks permanently.
1636 *
1637 * Further, check if the inode is being opened, written and
1638 * closed frequently and we have delayed allocation blocks
1639 * outstanding (e.g. streaming writes from the NFS server),
1640 * truncating the blocks past EOF will cause fragmentation to
1641 * occur.
1642 *
1643 * In this case don't do the truncation, either, but we have to
1644 * be careful how we detect this case. Blocks beyond EOF show
1645 * up as i_delayed_blks even when the inode is clean, so we
1646 * need to truncate them away first before checking for a dirty
1647 * release. Hence on the first dirty close we will still remove
1648 * the speculative allocation, but after that we will leave it
1649 * in place.
1650 */
1651 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1652 return 0;
1653
1654 error = xfs_free_eofblocks(mp, ip, true);
1655 if (error && error != EAGAIN)
1656 return error;
1657
1658 /* delalloc blocks after truncation means it really is dirty */
1659 if (ip->i_delayed_blks)
1660 xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1661 }
1662 return 0;
1663}
1664
1665/*
1666 * xfs_inactive
1667 *
1668 * This is called when the vnode reference count for the vnode
1669 * goes to zero. If the file has been unlinked, then it must
1670 * now be truncated. Also, we clear all of the read-ahead state
1671 * kept for the inode here since the file is now closed.
1672 */
1673int
1674xfs_inactive(
1675 xfs_inode_t *ip)
1676{
1677 xfs_bmap_free_t free_list;
1678 xfs_fsblock_t first_block;
1679 int committed;
1680 struct xfs_trans *tp;
1681 struct xfs_mount *mp;
1682 struct xfs_trans_res *resp;
1683 int error;
1684 int truncate = 0;
1685
1686 /*
1687 * If the inode is already free, then there can be nothing
1688 * to clean up here.
1689 */
1690 if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
1691 ASSERT(ip->i_df.if_real_bytes == 0);
1692 ASSERT(ip->i_df.if_broot_bytes == 0);
1693 return VN_INACTIVE_CACHE;
1694 }
1695
1696 mp = ip->i_mount;
1697
1698 error = 0;
1699
1700 /* If this is a read-only mount, don't do this (would generate I/O) */
1701 if (mp->m_flags & XFS_MOUNT_RDONLY)
1702 goto out;
1703
1704 if (ip->i_d.di_nlink != 0) {
1705 /*
1706 * force is true because we are evicting an inode from the
1707 * cache. Post-eof blocks must be freed, lest we end up with
1708 * broken free space accounting.
1709 */
1710 if (xfs_can_free_eofblocks(ip, true)) {
1711 error = xfs_free_eofblocks(mp, ip, false);
1712 if (error)
1713 return VN_INACTIVE_CACHE;
1714 }
1715 goto out;
1716 }
1717
1718 if (S_ISREG(ip->i_d.di_mode) &&
1719 (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
1720 ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
1721 truncate = 1;
1722
1723 error = xfs_qm_dqattach(ip, 0);
1724 if (error)
1725 return VN_INACTIVE_CACHE;
1726
1727 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1728 resp = (truncate || S_ISLNK(ip->i_d.di_mode)) ?
1729 &M_RES(mp)->tr_itruncate : &M_RES(mp)->tr_ifree;
1730
1731 error = xfs_trans_reserve(tp, resp, 0, 0);
1732 if (error) {
1733 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1734 xfs_trans_cancel(tp, 0);
1735 return VN_INACTIVE_CACHE;
1736 }
1737
1738 xfs_ilock(ip, XFS_ILOCK_EXCL);
1739 xfs_trans_ijoin(tp, ip, 0);
1740
1741 if (S_ISLNK(ip->i_d.di_mode)) {
1742 error = xfs_inactive_symlink(ip, &tp);
1743 if (error)
1744 goto out_cancel;
1745 } else if (truncate) {
1746 ip->i_d.di_size = 0;
1747 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1748
1749 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
1750 if (error)
1751 goto out_cancel;
1752
1753 ASSERT(ip->i_d.di_nextents == 0);
1754 }
1755
1756 /*
1757 * If there are attributes associated with the file then blow them away
1758 * now. The code calls a routine that recursively deconstructs the
1759 * attribute fork. We need to just commit the current transaction
1760 * because we can't use it for xfs_attr_inactive().
1761 */
1762 if (ip->i_d.di_anextents > 0) {
1763 ASSERT(ip->i_d.di_forkoff != 0);
1764
1765 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1766 if (error)
1767 goto out_unlock;
1768
1769 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1770
1771 error = xfs_attr_inactive(ip);
1772 if (error)
1773 goto out;
1774
1775 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1776 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0);
1777 if (error) {
1778 xfs_trans_cancel(tp, 0);
1779 goto out;
1780 }
1781
1782 xfs_ilock(ip, XFS_ILOCK_EXCL);
1783 xfs_trans_ijoin(tp, ip, 0);
1784 }
1785
1786 if (ip->i_afp)
1787 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1788
1789 ASSERT(ip->i_d.di_anextents == 0);
1790
1791 /*
1792 * Free the inode.
1793 */
1794 xfs_bmap_init(&free_list, &first_block);
1795 error = xfs_ifree(tp, ip, &free_list);
1796 if (error) {
1797 /*
1798 * If we fail to free the inode, shut down. The cancel
1799 * might do that, we need to make sure. Otherwise the
1800 * inode might be lost for a long time or forever.
1801 */
1802 if (!XFS_FORCED_SHUTDOWN(mp)) {
1803 xfs_notice(mp, "%s: xfs_ifree returned error %d",
1804 __func__, error);
1805 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1806 }
1807 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1808 } else {
1809 /*
1810 * Credit the quota account(s). The inode is gone.
1811 */
1812 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1813
1814 /*
1815 * Just ignore errors at this point. There is nothing we can
1816 * do except to try to keep going. Make sure it's not a silent
1817 * error.
1818 */
1819 error = xfs_bmap_finish(&tp, &free_list, &committed);
1820 if (error)
1821 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
1822 __func__, error);
1823 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1824 if (error)
1825 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1826 __func__, error);
1827 }
1828
1829 /*
1830 * Release the dquots held by inode, if any.
1831 */
1832 xfs_qm_dqdetach(ip);
1833out_unlock:
1834 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1835out:
1836 return VN_INACTIVE_CACHE;
1837out_cancel:
1838 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1839 goto out_unlock;
1840}
1841
1608/* 1842/*
1609 * This is called when the inode's link count goes to 0. 1843 * This is called when the inode's link count goes to 0.
1610 * We place the on-disk inode on a list in the AGI. It 1844 * We place the on-disk inode on a list in the AGI. It
@@ -1861,7 +2095,7 @@ xfs_iunlink_remove(
1861} 2095}
1862 2096
1863/* 2097/*
1864 * A big issue when freeing the inode cluster is is that we _cannot_ skip any 2098 * A big issue when freeing the inode cluster is that we _cannot_ skip any
1865 * inodes that are in memory - they all must be marked stale and attached to 2099 * inodes that are in memory - they all must be marked stale and attached to
1866 * the cluster buffer. 2100 * the cluster buffer.
1867 */ 2101 */
@@ -2094,272 +2328,6 @@ xfs_ifree(
2094} 2328}
2095 2329
2096/* 2330/*
2097 * Reallocate the space for if_broot based on the number of records
2098 * being added or deleted as indicated in rec_diff. Move the records
2099 * and pointers in if_broot to fit the new size. When shrinking this
2100 * will eliminate holes between the records and pointers created by
2101 * the caller. When growing this will create holes to be filled in
2102 * by the caller.
2103 *
2104 * The caller must not request to add more records than would fit in
2105 * the on-disk inode root. If the if_broot is currently NULL, then
2106 * if we adding records one will be allocated. The caller must also
2107 * not request that the number of records go below zero, although
2108 * it can go to zero.
2109 *
2110 * ip -- the inode whose if_broot area is changing
2111 * ext_diff -- the change in the number of records, positive or negative,
2112 * requested for the if_broot array.
2113 */
2114void
2115xfs_iroot_realloc(
2116 xfs_inode_t *ip,
2117 int rec_diff,
2118 int whichfork)
2119{
2120 struct xfs_mount *mp = ip->i_mount;
2121 int cur_max;
2122 xfs_ifork_t *ifp;
2123 struct xfs_btree_block *new_broot;
2124 int new_max;
2125 size_t new_size;
2126 char *np;
2127 char *op;
2128
2129 /*
2130 * Handle the degenerate case quietly.
2131 */
2132 if (rec_diff == 0) {
2133 return;
2134 }
2135
2136 ifp = XFS_IFORK_PTR(ip, whichfork);
2137 if (rec_diff > 0) {
2138 /*
2139 * If there wasn't any memory allocated before, just
2140 * allocate it now and get out.
2141 */
2142 if (ifp->if_broot_bytes == 0) {
2143 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
2144 ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
2145 ifp->if_broot_bytes = (int)new_size;
2146 return;
2147 }
2148
2149 /*
2150 * If there is already an existing if_broot, then we need
2151 * to realloc() it and shift the pointers to their new
2152 * location. The records don't change location because
2153 * they are kept butted up against the btree block header.
2154 */
2155 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2156 new_max = cur_max + rec_diff;
2157 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
2158 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
2159 XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
2160 KM_SLEEP | KM_NOFS);
2161 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2162 ifp->if_broot_bytes);
2163 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2164 (int)new_size);
2165 ifp->if_broot_bytes = (int)new_size;
2166 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
2167 XFS_IFORK_SIZE(ip, whichfork));
2168 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
2169 return;
2170 }
2171
2172 /*
2173 * rec_diff is less than 0. In this case, we are shrinking the
2174 * if_broot buffer. It must already exist. If we go to zero
2175 * records, just get rid of the root and clear the status bit.
2176 */
2177 ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
2178 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2179 new_max = cur_max + rec_diff;
2180 ASSERT(new_max >= 0);
2181 if (new_max > 0)
2182 new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
2183 else
2184 new_size = 0;
2185 if (new_size > 0) {
2186 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
2187 /*
2188 * First copy over the btree block header.
2189 */
2190 memcpy(new_broot, ifp->if_broot,
2191 XFS_BMBT_BLOCK_LEN(ip->i_mount));
2192 } else {
2193 new_broot = NULL;
2194 ifp->if_flags &= ~XFS_IFBROOT;
2195 }
2196
2197 /*
2198 * Only copy the records and pointers if there are any.
2199 */
2200 if (new_max > 0) {
2201 /*
2202 * First copy the records.
2203 */
2204 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
2205 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
2206 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
2207
2208 /*
2209 * Then copy the pointers.
2210 */
2211 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2212 ifp->if_broot_bytes);
2213 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
2214 (int)new_size);
2215 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2216 }
2217 kmem_free(ifp->if_broot);
2218 ifp->if_broot = new_broot;
2219 ifp->if_broot_bytes = (int)new_size;
2220 if (ifp->if_broot)
2221 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
2222 XFS_IFORK_SIZE(ip, whichfork));
2223 return;
2224}
2225
2226
2227/*
2228 * This is called when the amount of space needed for if_data
2229 * is increased or decreased. The change in size is indicated by
2230 * the number of bytes that need to be added or deleted in the
2231 * byte_diff parameter.
2232 *
2233 * If the amount of space needed has decreased below the size of the
2234 * inline buffer, then switch to using the inline buffer. Otherwise,
2235 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
2236 * to what is needed.
2237 *
2238 * ip -- the inode whose if_data area is changing
2239 * byte_diff -- the change in the number of bytes, positive or negative,
2240 * requested for the if_data array.
2241 */
2242void
2243xfs_idata_realloc(
2244 xfs_inode_t *ip,
2245 int byte_diff,
2246 int whichfork)
2247{
2248 xfs_ifork_t *ifp;
2249 int new_size;
2250 int real_size;
2251
2252 if (byte_diff == 0) {
2253 return;
2254 }
2255
2256 ifp = XFS_IFORK_PTR(ip, whichfork);
2257 new_size = (int)ifp->if_bytes + byte_diff;
2258 ASSERT(new_size >= 0);
2259
2260 if (new_size == 0) {
2261 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2262 kmem_free(ifp->if_u1.if_data);
2263 }
2264 ifp->if_u1.if_data = NULL;
2265 real_size = 0;
2266 } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
2267 /*
2268 * If the valid extents/data can fit in if_inline_ext/data,
2269 * copy them from the malloc'd vector and free it.
2270 */
2271 if (ifp->if_u1.if_data == NULL) {
2272 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2273 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2274 ASSERT(ifp->if_real_bytes != 0);
2275 memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
2276 new_size);
2277 kmem_free(ifp->if_u1.if_data);
2278 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2279 }
2280 real_size = 0;
2281 } else {
2282 /*
2283 * Stuck with malloc/realloc.
2284 * For inline data, the underlying buffer must be
2285 * a multiple of 4 bytes in size so that it can be
2286 * logged and stay on word boundaries. We enforce
2287 * that here.
2288 */
2289 real_size = roundup(new_size, 4);
2290 if (ifp->if_u1.if_data == NULL) {
2291 ASSERT(ifp->if_real_bytes == 0);
2292 ifp->if_u1.if_data = kmem_alloc(real_size,
2293 KM_SLEEP | KM_NOFS);
2294 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2295 /*
2296 * Only do the realloc if the underlying size
2297 * is really changing.
2298 */
2299 if (ifp->if_real_bytes != real_size) {
2300 ifp->if_u1.if_data =
2301 kmem_realloc(ifp->if_u1.if_data,
2302 real_size,
2303 ifp->if_real_bytes,
2304 KM_SLEEP | KM_NOFS);
2305 }
2306 } else {
2307 ASSERT(ifp->if_real_bytes == 0);
2308 ifp->if_u1.if_data = kmem_alloc(real_size,
2309 KM_SLEEP | KM_NOFS);
2310 memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2311 ifp->if_bytes);
2312 }
2313 }
2314 ifp->if_real_bytes = real_size;
2315 ifp->if_bytes = new_size;
2316 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2317}
2318
2319void
2320xfs_idestroy_fork(
2321 xfs_inode_t *ip,
2322 int whichfork)
2323{
2324 xfs_ifork_t *ifp;
2325
2326 ifp = XFS_IFORK_PTR(ip, whichfork);
2327 if (ifp->if_broot != NULL) {
2328 kmem_free(ifp->if_broot);
2329 ifp->if_broot = NULL;
2330 }
2331
2332 /*
2333 * If the format is local, then we can't have an extents
2334 * array so just look for an inline data array. If we're
2335 * not local then we may or may not have an extents list,
2336 * so check and free it up if we do.
2337 */
2338 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2339 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2340 (ifp->if_u1.if_data != NULL)) {
2341 ASSERT(ifp->if_real_bytes != 0);
2342 kmem_free(ifp->if_u1.if_data);
2343 ifp->if_u1.if_data = NULL;
2344 ifp->if_real_bytes = 0;
2345 }
2346 } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2347 ((ifp->if_flags & XFS_IFEXTIREC) ||
2348 ((ifp->if_u1.if_extents != NULL) &&
2349 (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
2350 ASSERT(ifp->if_real_bytes != 0);
2351 xfs_iext_destroy(ifp);
2352 }
2353 ASSERT(ifp->if_u1.if_extents == NULL ||
2354 ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2355 ASSERT(ifp->if_real_bytes == 0);
2356 if (whichfork == XFS_ATTR_FORK) {
2357 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2358 ip->i_afp = NULL;
2359 }
2360}
2361
2362/*
2363 * This is called to unpin an inode. The caller must have the inode locked 2331 * This is called to unpin an inode. The caller must have the inode locked
2364 * in at least shared mode so that the buffer cannot be subsequently pinned 2332 * in at least shared mode so that the buffer cannot be subsequently pinned
2365 * once someone is waiting for it to be unpinned. 2333 * once someone is waiting for it to be unpinned.
@@ -2402,162 +2370,471 @@ xfs_iunpin_wait(
2402 __xfs_iunpin_wait(ip); 2370 __xfs_iunpin_wait(ip);
2403} 2371}
2404 2372
2405/*
2406 * xfs_iextents_copy()
2407 *
2408 * This is called to copy the REAL extents (as opposed to the delayed
2409 * allocation extents) from the inode into the given buffer. It
2410 * returns the number of bytes copied into the buffer.
2411 *
2412 * If there are no delayed allocation extents, then we can just
2413 * memcpy() the extents into the buffer. Otherwise, we need to
2414 * examine each extent in turn and skip those which are delayed.
2415 */
2416int 2373int
2417xfs_iextents_copy( 2374xfs_remove(
2418 xfs_inode_t *ip, 2375 xfs_inode_t *dp,
2419 xfs_bmbt_rec_t *dp, 2376 struct xfs_name *name,
2420 int whichfork) 2377 xfs_inode_t *ip)
2421{ 2378{
2422 int copied; 2379 xfs_mount_t *mp = dp->i_mount;
2423 int i; 2380 xfs_trans_t *tp = NULL;
2424 xfs_ifork_t *ifp; 2381 int is_dir = S_ISDIR(ip->i_d.di_mode);
2425 int nrecs; 2382 int error = 0;
2426 xfs_fsblock_t start_block; 2383 xfs_bmap_free_t free_list;
2384 xfs_fsblock_t first_block;
2385 int cancel_flags;
2386 int committed;
2387 int link_zero;
2388 uint resblks;
2389 uint log_count;
2427 2390
2428 ifp = XFS_IFORK_PTR(ip, whichfork); 2391 trace_xfs_remove(dp, name);
2429 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2392
2430 ASSERT(ifp->if_bytes > 0); 2393 if (XFS_FORCED_SHUTDOWN(mp))
2394 return XFS_ERROR(EIO);
2395
2396 error = xfs_qm_dqattach(dp, 0);
2397 if (error)
2398 goto std_return;
2399
2400 error = xfs_qm_dqattach(ip, 0);
2401 if (error)
2402 goto std_return;
2431 2403
2432 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); 2404 if (is_dir) {
2433 XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); 2405 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
2434 ASSERT(nrecs > 0); 2406 log_count = XFS_DEFAULT_LOG_COUNT;
2407 } else {
2408 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2409 log_count = XFS_REMOVE_LOG_COUNT;
2410 }
2411 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2435 2412
2436 /* 2413 /*
2437 * There are some delayed allocation extents in the 2414 * We try to get the real space reservation first,
2438 * inode, so copy the extents one at a time and skip 2415 * allowing for directory btree deletion(s) implying
2439 * the delayed ones. There must be at least one 2416 * possible bmap insert(s). If we can't get the space
2440 * non-delayed extent. 2417 * reservation then we use 0 instead, and avoid the bmap
2418 * btree insert(s) in the directory code by, if the bmap
2419 * insert tries to happen, instead trimming the LAST
2420 * block from the directory.
2441 */ 2421 */
2442 copied = 0; 2422 resblks = XFS_REMOVE_SPACE_RES(mp);
2443 for (i = 0; i < nrecs; i++) { 2423 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
2444 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); 2424 if (error == ENOSPC) {
2445 start_block = xfs_bmbt_get_startblock(ep); 2425 resblks = 0;
2446 if (isnullstartblock(start_block)) { 2426 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
2447 /* 2427 }
2448 * It's a delayed allocation extent, so skip it. 2428 if (error) {
2449 */ 2429 ASSERT(error != ENOSPC);
2450 continue; 2430 cancel_flags = 0;
2431 goto out_trans_cancel;
2432 }
2433
2434 xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
2435
2436 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2437 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2438
2439 /*
2440 * If we're removing a directory perform some additional validation.
2441 */
2442 if (is_dir) {
2443 ASSERT(ip->i_d.di_nlink >= 2);
2444 if (ip->i_d.di_nlink != 2) {
2445 error = XFS_ERROR(ENOTEMPTY);
2446 goto out_trans_cancel;
2451 } 2447 }
2448 if (!xfs_dir_isempty(ip)) {
2449 error = XFS_ERROR(ENOTEMPTY);
2450 goto out_trans_cancel;
2451 }
2452 }
2453
2454 xfs_bmap_init(&free_list, &first_block);
2455 error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2456 &first_block, &free_list, resblks);
2457 if (error) {
2458 ASSERT(error != ENOENT);
2459 goto out_bmap_cancel;
2460 }
2461 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2452 2462
2453 /* Translate to on disk format */ 2463 if (is_dir) {
2454 put_unaligned(cpu_to_be64(ep->l0), &dp->l0); 2464 /*
2455 put_unaligned(cpu_to_be64(ep->l1), &dp->l1); 2465 * Drop the link from ip's "..".
2456 dp++; 2466 */
2457 copied++; 2467 error = xfs_droplink(tp, dp);
2468 if (error)
2469 goto out_bmap_cancel;
2470
2471 /*
2472 * Drop the "." link from ip to self.
2473 */
2474 error = xfs_droplink(tp, ip);
2475 if (error)
2476 goto out_bmap_cancel;
2477 } else {
2478 /*
2479 * When removing a non-directory we need to log the parent
2480 * inode here. For a directory this is done implicitly
2481 * by the xfs_droplink call for the ".." entry.
2482 */
2483 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2458 } 2484 }
2459 ASSERT(copied != 0);
2460 xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
2461 2485
2462 return (copied * (uint)sizeof(xfs_bmbt_rec_t)); 2486 /*
2487 * Drop the link from dp to ip.
2488 */
2489 error = xfs_droplink(tp, ip);
2490 if (error)
2491 goto out_bmap_cancel;
2492
2493 /*
2494 * Determine if this is the last link while
2495 * we are in the transaction.
2496 */
2497 link_zero = (ip->i_d.di_nlink == 0);
2498
2499 /*
2500 * If this is a synchronous mount, make sure that the
2501 * remove transaction goes to disk before returning to
2502 * the user.
2503 */
2504 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2505 xfs_trans_set_sync(tp);
2506
2507 error = xfs_bmap_finish(&tp, &free_list, &committed);
2508 if (error)
2509 goto out_bmap_cancel;
2510
2511 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2512 if (error)
2513 goto std_return;
2514
2515 /*
2516 * If we are using filestreams, kill the stream association.
2517 * If the file is still open it may get a new one but that
2518 * will get killed on last close in xfs_close() so we don't
2519 * have to worry about that.
2520 */
2521 if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
2522 xfs_filestream_deassociate(ip);
2523
2524 return 0;
2525
2526 out_bmap_cancel:
2527 xfs_bmap_cancel(&free_list);
2528 cancel_flags |= XFS_TRANS_ABORT;
2529 out_trans_cancel:
2530 xfs_trans_cancel(tp, cancel_flags);
2531 std_return:
2532 return error;
2463} 2533}
2464 2534
2465/* 2535/*
2466 * Each of the following cases stores data into the same region 2536 * Enter all inodes for a rename transaction into a sorted array.
2467 * of the on-disk inode, so only one of them can be valid at
2468 * any given time. While it is possible to have conflicting formats
2469 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
2470 * in EXTENTS format, this can only happen when the fork has
2471 * changed formats after being modified but before being flushed.
2472 * In these cases, the format always takes precedence, because the
2473 * format indicates the current state of the fork.
2474 */ 2537 */
2475/*ARGSUSED*/
2476STATIC void 2538STATIC void
2477xfs_iflush_fork( 2539xfs_sort_for_rename(
2478 xfs_inode_t *ip, 2540 xfs_inode_t *dp1, /* in: old (source) directory inode */
2479 xfs_dinode_t *dip, 2541 xfs_inode_t *dp2, /* in: new (target) directory inode */
2480 xfs_inode_log_item_t *iip, 2542 xfs_inode_t *ip1, /* in: inode of old entry */
2481 int whichfork, 2543 xfs_inode_t *ip2, /* in: inode of new entry, if it
2482 xfs_buf_t *bp) 2544 already exists, NULL otherwise. */
2483{ 2545 xfs_inode_t **i_tab,/* out: array of inode returned, sorted */
2484 char *cp; 2546 int *num_inodes) /* out: number of inodes in array */
2485 xfs_ifork_t *ifp; 2547{
2486 xfs_mount_t *mp; 2548 xfs_inode_t *temp;
2487 static const short brootflag[2] = 2549 int i, j;
2488 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
2489 static const short dataflag[2] =
2490 { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
2491 static const short extflag[2] =
2492 { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
2493
2494 if (!iip)
2495 return;
2496 ifp = XFS_IFORK_PTR(ip, whichfork);
2497 /*
2498 * This can happen if we gave up in iformat in an error path,
2499 * for the attribute fork.
2500 */
2501 if (!ifp) {
2502 ASSERT(whichfork == XFS_ATTR_FORK);
2503 return;
2504 }
2505 cp = XFS_DFORK_PTR(dip, whichfork);
2506 mp = ip->i_mount;
2507 switch (XFS_IFORK_FORMAT(ip, whichfork)) {
2508 case XFS_DINODE_FMT_LOCAL:
2509 if ((iip->ili_fields & dataflag[whichfork]) &&
2510 (ifp->if_bytes > 0)) {
2511 ASSERT(ifp->if_u1.if_data != NULL);
2512 ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2513 memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
2514 }
2515 break;
2516 2550
2517 case XFS_DINODE_FMT_EXTENTS: 2551 /*
2518 ASSERT((ifp->if_flags & XFS_IFEXTENTS) || 2552 * i_tab contains a list of pointers to inodes. We initialize
2519 !(iip->ili_fields & extflag[whichfork])); 2553 * the table here & we'll sort it. We will then use it to
2520 if ((iip->ili_fields & extflag[whichfork]) && 2554 * order the acquisition of the inode locks.
2521 (ifp->if_bytes > 0)) { 2555 *
2522 ASSERT(xfs_iext_get_ext(ifp, 0)); 2556 * Note that the table may contain duplicates. e.g., dp1 == dp2.
2523 ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); 2557 */
2524 (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, 2558 i_tab[0] = dp1;
2525 whichfork); 2559 i_tab[1] = dp2;
2526 } 2560 i_tab[2] = ip1;
2527 break; 2561 if (ip2) {
2562 *num_inodes = 4;
2563 i_tab[3] = ip2;
2564 } else {
2565 *num_inodes = 3;
2566 i_tab[3] = NULL;
2567 }
2528 2568
2529 case XFS_DINODE_FMT_BTREE: 2569 /*
2530 if ((iip->ili_fields & brootflag[whichfork]) && 2570 * Sort the elements via bubble sort. (Remember, there are at
2531 (ifp->if_broot_bytes > 0)) { 2571 * most 4 elements to sort, so this is adequate.)
2532 ASSERT(ifp->if_broot != NULL); 2572 */
2533 ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= 2573 for (i = 0; i < *num_inodes; i++) {
2534 XFS_IFORK_SIZE(ip, whichfork)); 2574 for (j = 1; j < *num_inodes; j++) {
2535 xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, 2575 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
2536 (xfs_bmdr_block_t *)cp, 2576 temp = i_tab[j];
2537 XFS_DFORK_SIZE(dip, mp, whichfork)); 2577 i_tab[j] = i_tab[j-1];
2578 i_tab[j-1] = temp;
2579 }
2538 } 2580 }
2539 break; 2581 }
2582}
2583
2584/*
2585 * xfs_rename
2586 */
2587int
2588xfs_rename(
2589 xfs_inode_t *src_dp,
2590 struct xfs_name *src_name,
2591 xfs_inode_t *src_ip,
2592 xfs_inode_t *target_dp,
2593 struct xfs_name *target_name,
2594 xfs_inode_t *target_ip)
2595{
2596 xfs_trans_t *tp = NULL;
2597 xfs_mount_t *mp = src_dp->i_mount;
2598 int new_parent; /* moving to a new dir */
2599 int src_is_directory; /* src_name is a directory */
2600 int error;
2601 xfs_bmap_free_t free_list;
2602 xfs_fsblock_t first_block;
2603 int cancel_flags;
2604 int committed;
2605 xfs_inode_t *inodes[4];
2606 int spaceres;
2607 int num_inodes;
2608
2609 trace_xfs_rename(src_dp, target_dp, src_name, target_name);
2610
2611 new_parent = (src_dp != target_dp);
2612 src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
2613
2614 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
2615 inodes, &num_inodes);
2616
2617 xfs_bmap_init(&free_list, &first_block);
2618 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
2619 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2620 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2621 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
2622 if (error == ENOSPC) {
2623 spaceres = 0;
2624 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
2625 }
2626 if (error) {
2627 xfs_trans_cancel(tp, 0);
2628 goto std_return;
2629 }
2630
2631 /*
2632 * Attach the dquots to the inodes
2633 */
2634 error = xfs_qm_vop_rename_dqattach(inodes);
2635 if (error) {
2636 xfs_trans_cancel(tp, cancel_flags);
2637 goto std_return;
2638 }
2639
2640 /*
2641 * Lock all the participating inodes. Depending upon whether
2642 * the target_name exists in the target directory, and
2643 * whether the target directory is the same as the source
2644 * directory, we can lock from 2 to 4 inodes.
2645 */
2646 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
2647
2648 /*
2649 * Join all the inodes to the transaction. From this point on,
2650 * we can rely on either trans_commit or trans_cancel to unlock
2651 * them.
2652 */
2653 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
2654 if (new_parent)
2655 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
2656 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
2657 if (target_ip)
2658 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
2659
2660 /*
2661 * If we are using project inheritance, we only allow renames
2662 * into our tree when the project IDs are the same; else the
2663 * tree quota mechanism would be circumvented.
2664 */
2665 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2666 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
2667 error = XFS_ERROR(EXDEV);
2668 goto error_return;
2669 }
2670
2671 /*
2672 * Set up the target.
2673 */
2674 if (target_ip == NULL) {
2675 /*
2676 * If there's no space reservation, check the entry will
2677 * fit before actually inserting it.
2678 */
2679 error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
2680 if (error)
2681 goto error_return;
2682 /*
2683 * If target does not exist and the rename crosses
2684 * directories, adjust the target directory link count
2685 * to account for the ".." reference from the new entry.
2686 */
2687 error = xfs_dir_createname(tp, target_dp, target_name,
2688 src_ip->i_ino, &first_block,
2689 &free_list, spaceres);
2690 if (error == ENOSPC)
2691 goto error_return;
2692 if (error)
2693 goto abort_return;
2694
2695 xfs_trans_ichgtime(tp, target_dp,
2696 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2540 2697
2541 case XFS_DINODE_FMT_DEV: 2698 if (new_parent && src_is_directory) {
2542 if (iip->ili_fields & XFS_ILOG_DEV) { 2699 error = xfs_bumplink(tp, target_dp);
2543 ASSERT(whichfork == XFS_DATA_FORK); 2700 if (error)
2544 xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); 2701 goto abort_return;
2702 }
2703 } else { /* target_ip != NULL */
2704 /*
2705 * If target exists and it's a directory, check that both
2706 * target and source are directories and that target can be
2707 * destroyed, or that neither is a directory.
2708 */
2709 if (S_ISDIR(target_ip->i_d.di_mode)) {
2710 /*
2711 * Make sure target dir is empty.
2712 */
2713 if (!(xfs_dir_isempty(target_ip)) ||
2714 (target_ip->i_d.di_nlink > 2)) {
2715 error = XFS_ERROR(EEXIST);
2716 goto error_return;
2717 }
2545 } 2718 }
2546 break;
2547 2719
2548 case XFS_DINODE_FMT_UUID: 2720 /*
2549 if (iip->ili_fields & XFS_ILOG_UUID) { 2721 * Link the source inode under the target name.
2550 ASSERT(whichfork == XFS_DATA_FORK); 2722 * If the source inode is a directory and we are moving
2551 memcpy(XFS_DFORK_DPTR(dip), 2723 * it across directories, its ".." entry will be
2552 &ip->i_df.if_u2.if_uuid, 2724 * inconsistent until we replace that down below.
2553 sizeof(uuid_t)); 2725 *
2726 * In case there is already an entry with the same
2727 * name at the destination directory, remove it first.
2728 */
2729 error = xfs_dir_replace(tp, target_dp, target_name,
2730 src_ip->i_ino,
2731 &first_block, &free_list, spaceres);
2732 if (error)
2733 goto abort_return;
2734
2735 xfs_trans_ichgtime(tp, target_dp,
2736 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2737
2738 /*
2739 * Decrement the link count on the target since the target
2740 * dir no longer points to it.
2741 */
2742 error = xfs_droplink(tp, target_ip);
2743 if (error)
2744 goto abort_return;
2745
2746 if (src_is_directory) {
2747 /*
2748 * Drop the link from the old "." entry.
2749 */
2750 error = xfs_droplink(tp, target_ip);
2751 if (error)
2752 goto abort_return;
2554 } 2753 }
2555 break; 2754 } /* target_ip != NULL */
2556 2755
2557 default: 2756 /*
2558 ASSERT(0); 2757 * Remove the source.
2559 break; 2758 */
2759 if (new_parent && src_is_directory) {
2760 /*
2761 * Rewrite the ".." entry to point to the new
2762 * directory.
2763 */
2764 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
2765 target_dp->i_ino,
2766 &first_block, &free_list, spaceres);
2767 ASSERT(error != EEXIST);
2768 if (error)
2769 goto abort_return;
2770 }
2771
2772 /*
2773 * We always want to hit the ctime on the source inode.
2774 *
2775 * This isn't strictly required by the standards since the source
2776 * inode isn't really being changed, but old unix file systems did
2777 * it and some incremental backup programs won't work without it.
2778 */
2779 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
2780 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
2781
2782 /*
2783 * Adjust the link count on src_dp. This is necessary when
2784 * renaming a directory, either within one parent when
2785 * the target existed, or across two parent directories.
2786 */
2787 if (src_is_directory && (new_parent || target_ip != NULL)) {
2788
2789 /*
2790 * Decrement link count on src_directory since the
2791 * entry that's moved no longer points to it.
2792 */
2793 error = xfs_droplink(tp, src_dp);
2794 if (error)
2795 goto abort_return;
2796 }
2797
2798 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
2799 &first_block, &free_list, spaceres);
2800 if (error)
2801 goto abort_return;
2802
2803 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2804 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
2805 if (new_parent)
2806 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
2807
2808 /*
2809 * If this is a synchronous mount, make sure that the
2810 * rename transaction goes to disk before returning to
2811 * the user.
2812 */
2813 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2814 xfs_trans_set_sync(tp);
2560 } 2815 }
2816
2817 error = xfs_bmap_finish(&tp, &free_list, &committed);
2818 if (error) {
2819 xfs_bmap_cancel(&free_list);
2820 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
2821 XFS_TRANS_ABORT));
2822 goto std_return;
2823 }
2824
2825 /*
2826 * trans_commit will unlock src_ip, target_ip & decrement
2827 * the vnode references.
2828 */
2829 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2830
2831 abort_return:
2832 cancel_flags |= XFS_TRANS_ABORT;
2833 error_return:
2834 xfs_bmap_cancel(&free_list);
2835 xfs_trans_cancel(tp, cancel_flags);
2836 std_return:
2837 return error;
2561} 2838}
2562 2839
2563STATIC int 2840STATIC int
@@ -2816,7 +3093,6 @@ abort_out:
2816 return error; 3093 return error;
2817} 3094}
2818 3095
2819
2820STATIC int 3096STATIC int
2821xfs_iflush_int( 3097xfs_iflush_int(
2822 struct xfs_inode *ip, 3098 struct xfs_inode *ip,
@@ -3004,1072 +3280,3 @@ xfs_iflush_int(
3004corrupt_out: 3280corrupt_out:
3005 return XFS_ERROR(EFSCORRUPTED); 3281 return XFS_ERROR(EFSCORRUPTED);
3006} 3282}
3007
3008/*
3009 * Return a pointer to the extent record at file index idx.
 *
 * Handles all three in-core extent layouts: the indirection array
 * (XFS_IFEXTIREC set), a direct/inline extent list, and an empty
 * fork, for which NULL is returned.
3010 */
3011xfs_bmbt_rec_host_t *
3012xfs_iext_get_ext(
3013	xfs_ifork_t	*ifp,		/* inode fork pointer */
3014	xfs_extnum_t	idx)		/* index of target extent */
3015{
3016	ASSERT(idx >= 0);
3017	ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3018
	/* Fast path: record 0 lives in the first irec's buffer. */
3019	if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
3020		return ifp->if_u1.if_ext_irec->er_extbuf;
3021	} else if (ifp->if_flags & XFS_IFEXTIREC) {
3022		xfs_ext_irec_t	*erp;		/* irec pointer */
3023		int		erp_idx = 0;	/* irec index */
3024		xfs_extnum_t	page_idx = idx;	/* ext index in target list */
3025
		/* Translate the fork-wide index into (list, offset). */
3026		erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
3027		return &erp->er_extbuf[page_idx];
3028	} else if (ifp->if_bytes) {
3029		return &ifp->if_u1.if_extents[idx];
3030	} else {
3031		return NULL;
3032	}
3033}
3034
3035/*
3036 * Insert new item(s) into the extent records for incore inode
3037 * fork 'ifp'.  'count' new items are inserted at index 'idx'.
 *
 * Room is first made via xfs_iext_add(); the caller-supplied irec
 * records in 'new' are then converted into the freed-up slots.
 * 'state' selects the data or attribute fork via BMAP_ATTRFORK.
3038 */
3039void
3040xfs_iext_insert(
3041	xfs_inode_t	*ip,		/* incore inode pointer */
3042	xfs_extnum_t	idx,		/* starting index of new items */
3043	xfs_extnum_t	count,		/* number of inserted items */
3044	xfs_bmbt_irec_t	*new,	/* items to insert */
3045	int		state)		/* type of extent conversion */
3046{
3047	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
3048	xfs_extnum_t	i;		/* extent record index */
3049
3050	trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
3051
	/* Extents must already be read into memory before inserting. */
3052	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
	/* Make room, then fill each new slot from the irec array. */
3053	xfs_iext_add(ifp, idx, count);
3054	for (i = idx; i < idx + count; i++, new++)
3055		xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
3056}
3057
3058/*
3059 * This is called when the amount of space required for incore file
3060 * extents needs to be increased. The ext_diff parameter stores the
3061 * number of new extents being added and the idx parameter contains
3062 * the extent index where the new extents will be added. If the new
3063 * extents are being appended, then we just need to (re)allocate and
3064 * initialize the space. Otherwise, if the new extents are being
3065 * inserted into the middle of the existing entries, a bit more work
3066 * is required to make room for the new extents to be inserted. The
3067 * caller is responsible for filling in the new extent entries upon
3068 * return.
3069 */
3070void
3071xfs_iext_add(
3072	xfs_ifork_t	*ifp,		/* inode fork pointer */
3073	xfs_extnum_t	idx,		/* index to begin adding exts */
3074	int		ext_diff)	/* number of extents to add */
3075{
3076	int		byte_diff;	/* new bytes being added */
3077	int		new_size;	/* size of extents after adding */
3078	xfs_extnum_t	nextents;	/* number of extents in file */
3079
3080	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3081	ASSERT((idx >= 0) && (idx <= nextents));
3082	byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
3083	new_size = ifp->if_bytes + byte_diff;
3084	/*
3085	 * If the new number of extents (nextents + ext_diff)
3086	 * fits inside the inode, then continue to use the inline
3087	 * extent buffer.
3088	 */
3089	if (nextents + ext_diff <= XFS_INLINE_EXTS) {
		/* Shift existing records up to open a zeroed hole at idx. */
3090		if (idx < nextents) {
3091			memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
3092				&ifp->if_u2.if_inline_ext[idx],
3093				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
3094			memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
3095		}
3096		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3097		ifp->if_real_bytes = 0;
3098	}
3099	/*
3100	 * Otherwise use a linear (direct) extent list.
3101	 * If the extents are currently inside the inode,
3102	 * xfs_iext_realloc_direct will switch us from
3103	 * inline to direct extent allocation mode.
3104	 */
3105	else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
3106		xfs_iext_realloc_direct(ifp, new_size);
3107		if (idx < nextents) {
3108			memmove(&ifp->if_u1.if_extents[idx + ext_diff],
3109				&ifp->if_u1.if_extents[idx],
3110				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
3111			memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
3112		}
3113	}
3114	/* Indirection array */
3115	else {
3116		xfs_ext_irec_t	*erp;
3117		int		erp_idx = 0;
3118		int		page_idx = idx;
3119
3120		ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
		/* Initialize the indirection array on first use. */
3121		if (ifp->if_flags & XFS_IFEXTIREC) {
3122			erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
3123		} else {
3124			xfs_iext_irec_init(ifp);
3125			ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3126			erp = ifp->if_u1.if_ext_irec;
3127		}
3128		/* Extents fit in target extent page */
3129		if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
3130			if (page_idx < erp->er_extcount) {
3131				memmove(&erp->er_extbuf[page_idx + ext_diff],
3132					&erp->er_extbuf[page_idx],
3133					(erp->er_extcount - page_idx) *
3134					sizeof(xfs_bmbt_rec_t));
3135				memset(&erp->er_extbuf[page_idx], 0, byte_diff);
3136			}
3137			erp->er_extcount += ext_diff;
			/* Later lists' file offsets shift by ext_diff. */
3138			xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3139		}
3140		/* Insert a new extent page */
3141		else if (erp) {
3142			xfs_iext_add_indirect_multi(ifp,
3143				erp_idx, page_idx, ext_diff);
3144		}
3145		/*
3146		 * If extent(s) are being appended to the last page in
3147		 * the indirection array and the new extent(s) don't fit
3148		 * in the page, then erp is NULL and erp_idx is set to
3149		 * the next index needed in the indirection array.
3150		 */
3151		else {
3152			int	count = ext_diff;
3153
3154			while (count) {
3155				erp = xfs_iext_irec_new(ifp, erp_idx);
				/*
				 * NOTE(review): er_extcount is set to the full
				 * remaining count here, which can exceed
				 * XFS_LINEAR_EXTS on the non-final iteration —
				 * presumably corrected by the caller filling
				 * in records; TODO confirm intent.
				 */
3156				erp->er_extcount = count;
3157				count -= MIN(count, (int)XFS_LINEAR_EXTS);
3158				if (count) {
3159					erp_idx++;
3160				}
3161			}
3162		}
3163	}
	/* Publish the new logical size of the extent list. */
3164	ifp->if_bytes = new_size;
3165}
3166
3167/*
3168 * This is called when incore extents are being added to the indirection
3169 * array and the new extents do not fit in the target extent list. The
3170 * erp_idx parameter contains the irec index for the target extent list
3171 * in the indirection array, and the idx parameter contains the extent
3172 * index within the list. The number of extents being added is stored
3173 * in the count parameter.
3174 *
3175 *    |-------|   |-------|
3176 *    |       |   |       |    idx - number of extents before idx
3177 *    |  idx  |   | count |
3178 *    |       |   |       |    count - number of extents being inserted at idx
3179 *    |-------|   |-------|
3180 *    | count |   | nex2  |    nex2 - number of extents after idx + count
3181 *    |-------|   |-------|
3182 */
3183void
3184xfs_iext_add_indirect_multi(
3185	xfs_ifork_t	*ifp,			/* inode fork pointer */
3186	int		erp_idx,		/* target extent irec index */
3187	xfs_extnum_t	idx,			/* index within target list */
3188	int		count)			/* new extents being added */
3189{
3190	int		byte_diff;	/* new bytes being added */
3191	xfs_ext_irec_t	*erp;		/* pointer to irec entry */
3192	xfs_extnum_t	ext_diff;	/* number of extents to add */
3193	xfs_extnum_t	ext_cnt;	/* new extents still needed */
3194	xfs_extnum_t	nex2;		/* extents after idx + count */
3195	xfs_bmbt_rec_t	*nex2_ep = NULL; /* temp list for nex2 extents */
3196	int		nlists;		/* number of irec's (lists) */
3197
3198	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3199	erp = &ifp->if_u1.if_ext_irec[erp_idx];
3200	nex2 = erp->er_extcount - idx;
3201	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3202
3203	/*
3204	 * Save the second part of the target extent list (all nex2
3205	 * extents past idx) in a temporary buffer so the new extents
	 * can be added at idx; the saved records are re-inserted
	 * further below.  KM_NOFS avoids fs recursion on allocation.
	 */
3206	if (nex2) {
3207		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3208		nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
3209		memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
3210		erp->er_extcount -= nex2;
3211		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
3212		memset(&erp->er_extbuf[idx], 0, byte_diff);
3213	}
3214
3215	/*
3216	 * Add the new extents to the end of the target
3217	 * list, then allocate new irec record(s) and
3218	 * extent buffer(s) as needed to store the rest
3219	 * of the new extents.
3220	 */
3221	ext_cnt = count;
	/* First use whatever space remains in the target list. */
3222	ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
3223	if (ext_diff) {
3224		erp->er_extcount += ext_diff;
3225		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3226		ext_cnt -= ext_diff;
3227	}
	/* Then spill the remainder into freshly-allocated irec lists. */
3228	while (ext_cnt) {
3229		erp_idx++;
3230		erp = xfs_iext_irec_new(ifp, erp_idx);
3231		ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
3232		erp->er_extcount = ext_diff;
3233		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3234		ext_cnt -= ext_diff;
3235	}
3236
3237	/* Add nex2 extents back to indirection array */
3238	if (nex2) {
3239		xfs_extnum_t	ext_avail;
3240		int		i;
3241
3242		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3243		ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
3244		i = 0;
3245		/*
3246		 * If nex2 extents fit in the current page, append
3247		 * nex2_ep after the new extents.
3248		 */
3249		if (nex2 <= ext_avail) {
3250			i = erp->er_extcount;
3251		}
3252		/*
3253		 * Otherwise, check if space is available in the
3254		 * next page.
3255		 */
3256		else if ((erp_idx < nlists - 1) &&
3257			 (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
3258			  ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
3259			erp_idx++;
3260			erp++;
3261			/* Create a hole for nex2 extents */
3262			memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
3263				erp->er_extcount * sizeof(xfs_bmbt_rec_t));
3264		}
3265		/*
3266		 * Final choice, create a new extent page for
3267		 * nex2 extents.
3268		 */
3269		else {
3270			erp_idx++;
3271			erp = xfs_iext_irec_new(ifp, erp_idx);
3272		}
3273		memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
3274		kmem_free(nex2_ep);
3275		erp->er_extcount += nex2;
3276		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
3277	}
3278}
3279
3280/*
3281 * This is called when the amount of space required for incore file
3282 * extents needs to be decreased. The ext_diff parameter stores the
3283 * number of extents to be removed and the idx parameter contains
3284 * the extent index where the extents will be removed from.
3285 *
3286 * If the amount of space needed has decreased below the linear
3287 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
3288 * extent array.  Otherwise, use kmem_realloc() to adjust the
3289 * size to what is needed.
3290 */
3291void
3292xfs_iext_remove(
3293	xfs_inode_t	*ip,		/* incore inode pointer */
3294	xfs_extnum_t	idx,		/* index to begin removing exts */
3295	int		ext_diff,	/* number of extents to remove */
3296	int		state)		/* type of extent conversion */
3297{
3298	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
3299	xfs_extnum_t	nextents;	/* number of extents in file */
3300	int		new_size;	/* size of extents after removal */
3301
3302	trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
3303
3304	ASSERT(ext_diff > 0);
3305	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3306	new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
3307
	/* Dispatch on the current extent storage layout. */
3308	if (new_size == 0) {
3309		xfs_iext_destroy(ifp);
3310	} else if (ifp->if_flags & XFS_IFEXTIREC) {
3311		xfs_iext_remove_indirect(ifp, idx, ext_diff);
3312	} else if (ifp->if_real_bytes) {
3313		xfs_iext_remove_direct(ifp, idx, ext_diff);
3314	} else {
3315		xfs_iext_remove_inline(ifp, idx, ext_diff);
3316	}
	/* Publish the reduced logical size of the extent list. */
3317	ifp->if_bytes = new_size;
3318}
3319
3320/*
3321 * This removes ext_diff extents from the inline buffer, beginning
3322 * at extent index idx.
 *
 * Remaining records past the removed range are shifted down and the
 * freed tail of the inline buffer is zeroed.
3323 */
3324void
3325xfs_iext_remove_inline(
3326	xfs_ifork_t	*ifp,		/* inode fork pointer */
3327	xfs_extnum_t	idx,		/* index to begin removing exts */
3328	int		ext_diff)	/* number of extents to remove */
3329{
3330	int		nextents;	/* number of extents in file */
3331
3332	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3333	ASSERT(idx < XFS_INLINE_EXTS);
3334	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3335	ASSERT(((nextents - ext_diff) > 0) &&
3336	       (nextents - ext_diff) < XFS_INLINE_EXTS);
3337
3338	if (idx + ext_diff < nextents) {
		/* Removing from the middle: close the gap, zero the tail. */
3339		memmove(&ifp->if_u2.if_inline_ext[idx],
3340			&ifp->if_u2.if_inline_ext[idx + ext_diff],
3341			(nextents - (idx + ext_diff)) *
3342			sizeof(xfs_bmbt_rec_t));
3343		memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
3344			0, ext_diff * sizeof(xfs_bmbt_rec_t));
3345	} else {
		/* Removing from the end: just zero the removed records. */
3346		memset(&ifp->if_u2.if_inline_ext[idx], 0,
3347			ext_diff * sizeof(xfs_bmbt_rec_t));
3348	}
3349}
3350
3351/*
3352 * This removes ext_diff extents from a linear (direct) extent list,
3353 * beginning at extent index idx. If the extents are being removed
3354 * from the end of the list (ie. truncate) then we just need to re-
3355 * allocate the list to remove the extra space. Otherwise, if the
3356 * extents are being removed from the middle of the existing extent
3357 * entries, then we first need to move the extent records beginning
3358 * at idx + ext_diff up in the list to overwrite the records being
3359 * removed, then remove the extra space via kmem_realloc.
3360 */
3361void
3362xfs_iext_remove_direct(
3363	xfs_ifork_t	*ifp,		/* inode fork pointer */
3364	xfs_extnum_t	idx,		/* index to begin removing exts */
3365	int		ext_diff)	/* number of extents to remove */
3366{
3367	xfs_extnum_t	nextents;	/* number of extents in file */
3368	int		new_size;	/* size of extents after removal */
3369
3370	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3371	new_size = ifp->if_bytes -
3372		(ext_diff * sizeof(xfs_bmbt_rec_t));
3373	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3374
	/* Nothing left at all: release the list entirely. */
3375	if (new_size == 0) {
3376		xfs_iext_destroy(ifp);
3377		return;
3378	}
3379	/* Move extents up in the list (if needed) */
3380	if (idx + ext_diff < nextents) {
3381		memmove(&ifp->if_u1.if_extents[idx],
3382			&ifp->if_u1.if_extents[idx + ext_diff],
3383			(nextents - (idx + ext_diff)) *
3384			sizeof(xfs_bmbt_rec_t));
3385	}
	/* Zero the now-unused tail before shrinking the allocation. */
3386	memset(&ifp->if_u1.if_extents[nextents - ext_diff],
3387		0, ext_diff * sizeof(xfs_bmbt_rec_t));
3388	/*
3389	 * Reallocate the direct extent list. If the extents
3390	 * will fit inside the inode then xfs_iext_realloc_direct
3391	 * will switch from direct to inline extent allocation
3392	 * mode for us.
3393	 */
3394	xfs_iext_realloc_direct(ifp, new_size);
3395	ifp->if_bytes = new_size;
3396}
3397
/*
 * This is called when incore extents are being removed from the
 * indirection array and the extents being removed span multiple extent
 * buffers. The idx parameter contains the file extent index where we
 * want to begin removing extents, and the count parameter contains
 * how many extents need to be removed.
 *
 *    |-------|   |-------|
 *    | nex1  |   |       |    nex1 - number of extents before idx
 *    |-------|   | count |
 *    |       |   |       |    count - number of extents being removed at idx
 *    | count |   |-------|
 *    |       |   | nex2  |    nex2 - number of extents after idx + count
 *    |-------|   |-------|
 */
void
xfs_iext_remove_indirect(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	xfs_extnum_t	idx,		/* index to begin removing extents */
	int		count)		/* number of extents to remove */
{
	xfs_ext_irec_t	*erp;		/* indirection array pointer */
	int		erp_idx = 0;	/* indirection array index */
	xfs_extnum_t	ext_cnt;	/* extents left to remove */
	xfs_extnum_t	ext_diff;	/* extents to remove in current list */
	xfs_extnum_t	nex1;		/* number of extents before idx */
	xfs_extnum_t	nex2;		/* extents after idx + count */
	int		page_idx = idx;	/* index in target extent list */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	/* Translate the file extent index into an irec plus in-page index */
	erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
	ASSERT(erp != NULL);
	nex1 = page_idx;
	ext_cnt = count;
	while (ext_cnt) {
		/* Records in this page that survive past the removed range */
		nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
		/* Records we can remove from this page on this pass */
		ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
		/*
		 * Check for deletion of entire list;
		 * xfs_iext_irec_remove() updates extent offsets.
		 */
		if (ext_diff == erp->er_extcount) {
			xfs_iext_irec_remove(ifp, erp_idx);
			ext_cnt -= ext_diff;
			nex1 = 0;
			if (ext_cnt) {
				ASSERT(erp_idx < ifp->if_real_bytes /
					XFS_IEXT_BUFSZ);
				/*
				 * The remove shifted later irecs down, so
				 * erp_idx now names the successor page;
				 * re-fetch the pointer rather than bumping it.
				 */
				erp = &ifp->if_u1.if_ext_irec[erp_idx];
				nex1 = 0;
				continue;
			} else {
				break;
			}
		}
		/* Move extents up (if needed) */
		if (nex2) {
			memmove(&erp->er_extbuf[nex1],
				&erp->er_extbuf[nex1 + ext_diff],
				nex2 * sizeof(xfs_bmbt_rec_t));
		}
		/* Zero out rest of page */
		memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
			((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
		/* Update remaining counters */
		erp->er_extcount -= ext_diff;
		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
		ext_cnt -= ext_diff;
		nex1 = 0;
		erp_idx++;
		erp++;
	}
	/* Account for all removed records, then shrink storage if possible */
	ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
	xfs_iext_irec_compact(ifp);
}
3473
/*
 * Create, destroy, or resize a linear (direct) block of extents.
 * new_size is a byte count.  On return if_bytes is set to new_size
 * and if_real_bytes to the (possibly power-of-2 rounded) allocation,
 * except when we hand off to xfs_iext_direct_to_inline() or
 * xfs_iext_destroy(), which set their own accounting.
 */
void
xfs_iext_realloc_direct(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	int		new_size)	/* new size of extents */
{
	int		rnew_size;	/* real new size of extents */

	rnew_size = new_size;

	/*
	 * While the indirection array is in use, a direct list is only
	 * ever resized within a single extent buffer (0..XFS_IEXT_BUFSZ).
	 */
	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
		((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
		(new_size != ifp->if_real_bytes)));

	/* Free extent records */
	if (new_size == 0) {
		xfs_iext_destroy(ifp);
	}
	/* Resize direct extent list and zero any new bytes */
	else if (ifp->if_real_bytes) {
		/* Check if extents will fit inside the inode */
		if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
			xfs_iext_direct_to_inline(ifp, new_size /
				(uint)sizeof(xfs_bmbt_rec_t));
			ifp->if_bytes = new_size;
			return;
		}
		/* Allocations are kept at power-of-2 sizes to limit churn */
		if (!is_power_of_2(new_size)){
			rnew_size = roundup_pow_of_two(new_size);
		}
		if (rnew_size != ifp->if_real_bytes) {
			ifp->if_u1.if_extents =
				kmem_realloc(ifp->if_u1.if_extents,
						rnew_size,
						ifp->if_real_bytes, KM_NOFS);
		}
		/*
		 * On growth, zero the added bytes; the memset starts at the
		 * old used-bytes mark (if_bytes) and covers the grow amount.
		 */
		if (rnew_size > ifp->if_real_bytes) {
			memset(&ifp->if_u1.if_extents[ifp->if_bytes /
				(uint)sizeof(xfs_bmbt_rec_t)], 0,
				rnew_size - ifp->if_real_bytes);
		}
	}
	/*
	 * Switch from the inline extent buffer to a direct
	 * extent list. Be sure to include the inline extent
	 * bytes in new_size.
	 */
	else {
		new_size += ifp->if_bytes;
		if (!is_power_of_2(new_size)) {
			rnew_size = roundup_pow_of_two(new_size);
		}
		xfs_iext_inline_to_direct(ifp, rnew_size);
	}
	ifp->if_real_bytes = rnew_size;
	ifp->if_bytes = new_size;
}
3533
3534/*
3535 * Switch from linear (direct) extent records to inline buffer.
3536 */
3537void
3538xfs_iext_direct_to_inline(
3539 xfs_ifork_t *ifp, /* inode fork pointer */
3540 xfs_extnum_t nextents) /* number of extents in file */
3541{
3542 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3543 ASSERT(nextents <= XFS_INLINE_EXTS);
3544 /*
3545 * The inline buffer was zeroed when we switched
3546 * from inline to direct extent allocation mode,
3547 * so we don't need to clear it here.
3548 */
3549 memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
3550 nextents * sizeof(xfs_bmbt_rec_t));
3551 kmem_free(ifp->if_u1.if_extents);
3552 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3553 ifp->if_real_bytes = 0;
3554}
3555
3556/*
3557 * Switch from inline buffer to linear (direct) extent records.
3558 * new_size should already be rounded up to the next power of 2
3559 * by the caller (when appropriate), so use new_size as it is.
3560 * However, since new_size may be rounded up, we can't update
3561 * if_bytes here. It is the caller's responsibility to update
3562 * if_bytes upon return.
3563 */
3564void
3565xfs_iext_inline_to_direct(
3566 xfs_ifork_t *ifp, /* inode fork pointer */
3567 int new_size) /* number of extents in file */
3568{
3569 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
3570 memset(ifp->if_u1.if_extents, 0, new_size);
3571 if (ifp->if_bytes) {
3572 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
3573 ifp->if_bytes);
3574 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
3575 sizeof(xfs_bmbt_rec_t));
3576 }
3577 ifp->if_real_bytes = new_size;
3578}
3579
/*
 * Resize an extent indirection array to new_size bytes.
 */
STATIC void
xfs_iext_realloc_indirect(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	int		new_size)	/* new indirection array size */
{
	int		nlists;		/* number of irec's (ex lists) */
	int		size;		/* current indirection array size */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	size = nlists * sizeof(xfs_ext_irec_t);
	ASSERT(ifp->if_real_bytes);
	ASSERT((new_size >= 0) && (new_size != size));
	/*
	 * NOTE: xfs_iext_irec_remove() deliberately never calls here with
	 * new_size == 0, since xfs_iext_destroy() would recurse back into
	 * the removal path (see the comment there).
	 */
	if (new_size == 0) {
		xfs_iext_destroy(ifp);
	} else {
		ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
			kmem_realloc(ifp->if_u1.if_ext_irec,
				new_size, size, KM_NOFS);
	}
}
3604
/*
 * Switch from indirection array to linear (direct) extent allocations.
 * All records must fit in a single extent buffer; the first irec's
 * page is reused directly as the new linear list.
 */
STATIC void
xfs_iext_indirect_to_direct(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	xfs_bmbt_rec_host_t *ep;	/* extent record pointer */
	xfs_extnum_t	nextents;	/* number of extents in file */
	int		size;		/* size of file extents */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	ASSERT(nextents <= XFS_LINEAR_EXTS);
	size = nextents * sizeof(xfs_bmbt_rec_t);

	/* Squeeze every record into the first page */
	xfs_iext_irec_compact_pages(ifp);
	ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);

	/* Steal the first page's buffer as the direct list */
	ep = ifp->if_u1.if_ext_irec->er_extbuf;
	kmem_free(ifp->if_u1.if_ext_irec);
	ifp->if_flags &= ~XFS_IFEXTIREC;
	ifp->if_u1.if_extents = ep;
	ifp->if_bytes = size;
	/* Trim the full-page buffer down (may also go inline) */
	if (nextents < XFS_LINEAR_EXTS) {
		xfs_iext_realloc_direct(ifp, size);
	}
}
3633
/*
 * Free incore file extents.  Handles all three storage modes
 * (indirection array, direct list, inline buffer) and resets the
 * fork's extent accounting to empty.
 */
void
xfs_iext_destroy(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	if (ifp->if_flags & XFS_IFEXTIREC) {
		int	erp_idx;
		int	nlists;

		/* Remove irec pages back-to-front so indices stay valid */
		nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
		for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
			xfs_iext_irec_remove(ifp, erp_idx);
		}
		ifp->if_flags &= ~XFS_IFEXTIREC;
	} else if (ifp->if_real_bytes) {
		/* Direct list: one heap allocation to release */
		kmem_free(ifp->if_u1.if_extents);
	} else if (ifp->if_bytes) {
		/* Inline buffer: nothing to free, just scrub the records */
		memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
			sizeof(xfs_bmbt_rec_t));
	}
	ifp->if_u1.if_extents = NULL;
	ifp->if_real_bytes = 0;
	ifp->if_bytes = 0;
}
3660
/*
 * Return a pointer to the extent record for file system block bno.
 * Stores the file-based index of that record in *idxp.  If no extent
 * contains bno, returns the first extent past bno (or NULL if bno is
 * beyond the last extent), with *idxp set accordingly.
 */
xfs_bmbt_rec_host_t *			/* pointer to found extent record */
xfs_iext_bno_to_ext(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	xfs_fileoff_t	bno,		/* block number to search for */
	xfs_extnum_t	*idxp)		/* index of target extent */
{
	xfs_bmbt_rec_host_t *base;	/* pointer to first extent */
	xfs_filblks_t	blockcount = 0;	/* number of blocks in extent */
	xfs_bmbt_rec_host_t *ep = NULL;	/* pointer to target extent */
	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
	int		high;		/* upper boundary in search */
	xfs_extnum_t	idx = 0;	/* index of target extent */
	int		low;		/* lower boundary in search */
	xfs_extnum_t	nextents;	/* number of file extents */
	xfs_fileoff_t	startoff = 0;	/* start offset of extent */

	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	if (nextents == 0) {
		*idxp = 0;
		return NULL;
	}
	low = 0;
	if (ifp->if_flags & XFS_IFEXTIREC) {
		/* Find target extent list */
		int	erp_idx = 0;
		erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
		base = erp->er_extbuf;
		high = erp->er_extcount - 1;
	} else {
		base = ifp->if_u1.if_extents;
		high = nextents - 1;
	}
	/* Binary search extent records */
	while (low <= high) {
		idx = (low + high) >> 1;
		ep = base + idx;
		startoff = xfs_bmbt_get_startoff(ep);
		blockcount = xfs_bmbt_get_blockcount(ep);
		if (bno < startoff) {
			high = idx - 1;
		} else if (bno >= startoff + blockcount) {
			low = idx + 1;
		} else {
			/* Convert back to file-based extent index */
			if (ifp->if_flags & XFS_IFEXTIREC) {
				idx += erp->er_extoff;
			}
			*idxp = idx;
			return ep;
		}
	}
	/* Convert back to file-based extent index */
	if (ifp->if_flags & XFS_IFEXTIREC) {
		idx += erp->er_extoff;
	}
	/*
	 * No extent contains bno.  If the last-probed extent ends before
	 * bno, step forward to the following extent, or return NULL when
	 * bno lies past the end of the file's extents.
	 */
	if (bno >= startoff + blockcount) {
		if (++idx == nextents) {
			ep = NULL;
		} else {
			ep = xfs_iext_get_ext(ifp, idx);
		}
	}
	*idxp = idx;
	return ep;
}
3729
/*
 * Return a pointer to the indirection array entry containing the
 * extent record for filesystem block bno. Store the index of the
 * target irec in *erp_idxp.
 */
xfs_ext_irec_t *			/* pointer to found extent record */
xfs_iext_bno_to_irec(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	xfs_fileoff_t	bno,		/* block number to search for */
	int		*erp_idxp)	/* irec index of target ext list */
{
	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
	xfs_ext_irec_t	*erp_next;	/* next indirection array entry */
	int		erp_idx;	/* indirection array index */
	int		nlists;		/* number of extent irec's (lists) */
	int		high;		/* binary search upper limit */
	int		low;		/* binary search lower limit */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	erp_idx = 0;
	low = 0;
	high = nlists - 1;
	/*
	 * Binary search on each irec's first record: pick the irec whose
	 * first extent starts at or before bno while the next irec's
	 * first extent (if there is one) starts after bno.
	 */
	while (low <= high) {
		erp_idx = (low + high) >> 1;
		erp = &ifp->if_u1.if_ext_irec[erp_idx];
		erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
		if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
			high = erp_idx - 1;
		} else if (erp_next && bno >=
			   xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
			low = erp_idx + 1;
		} else {
			break;
		}
	}
	*erp_idxp = erp_idx;
	return erp;
}
3769
/*
 * Return a pointer to the indirection array entry containing the
 * extent record at file extent index *idxp. Store the index of the
 * target irec in *erp_idxp and store the page index of the target
 * extent record in *idxp.
 *
 * The realloc flag indicates new records are about to be added at
 * this index; it biases boundary cases toward the page with room.
 */
xfs_ext_irec_t *
xfs_iext_idx_to_irec(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	xfs_extnum_t	*idxp,		/* extent index (file -> page) */
	int		*erp_idxp,	/* pointer to target irec */
	int		realloc)	/* new bytes were just added */
{
	xfs_ext_irec_t	*prev;		/* pointer to previous irec */
	xfs_ext_irec_t	*erp = NULL;	/* pointer to current irec */
	int		erp_idx;	/* indirection array index */
	int		nlists;		/* number of irec's (ex lists) */
	int		high;		/* binary search upper limit */
	int		low;		/* binary search lower limit */
	xfs_extnum_t	page_idx = *idxp; /* extent index in target list */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	ASSERT(page_idx >= 0);
	/* An index one past the last record is only legal when realloc'ing */
	ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
	ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);

	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	erp_idx = 0;
	low = 0;
	high = nlists - 1;

	/* Binary search extent irec's */
	while (low <= high) {
		erp_idx = (low + high) >> 1;
		erp = &ifp->if_u1.if_ext_irec[erp_idx];
		prev = erp_idx > 0 ? erp - 1 : NULL;
		/*
		 * Target lies before this irec - also step left when the
		 * index sits exactly on this irec's start and the previous
		 * page still has room for the records being added.
		 */
		if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
		    realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
			high = erp_idx - 1;
		/*
		 * Target lies past this irec's records - when not
		 * realloc'ing, an index equal to the end also belongs to
		 * the next page.
		 */
		} else if (page_idx > erp->er_extoff + erp->er_extcount ||
			   (page_idx == erp->er_extoff + erp->er_extcount &&
			    !realloc)) {
			low = erp_idx + 1;
		/*
		 * Index sits at the end of a completely full page while
		 * realloc'ing: the insertion point is the start of the
		 * next page (which may not exist yet - erp can be NULL).
		 */
		} else if (page_idx == erp->er_extoff + erp->er_extcount &&
			   erp->er_extcount == XFS_LINEAR_EXTS) {
			ASSERT(realloc);
			page_idx = 0;
			erp_idx++;
			erp = erp_idx < nlists ? erp + 1 : NULL;
			break;
		} else {
			/* Within this page - make the index page-relative */
			page_idx -= erp->er_extoff;
			break;
		}
	}
	*idxp = page_idx;
	*erp_idxp = erp_idx;
	return(erp);
}
3829
/*
 * Allocate and initialize an indirection array once the space needed
 * for incore extents increases above XFS_IEXT_BUFSZ.
 */
void
xfs_iext_irec_init(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	xfs_ext_irec_t	*erp;		/* indirection array pointer */
	xfs_extnum_t	nextents;	/* number of extents in file */

	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	ASSERT(nextents <= XFS_LINEAR_EXTS);

	erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);

	/* Ensure the current records live in one full-page direct buffer */
	if (nextents == 0) {
		ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
	} else if (!ifp->if_real_bytes) {
		xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
	} else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
		xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
	}
	/* That page becomes the first (and only) irec entry */
	erp->er_extbuf = ifp->if_u1.if_extents;
	erp->er_extcount = nextents;
	erp->er_extoff = 0;

	ifp->if_flags |= XFS_IFEXTIREC;
	ifp->if_real_bytes = XFS_IEXT_BUFSZ;
	ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
	ifp->if_u1.if_ext_irec = erp;

	return;
}
3865
/*
 * Allocate and initialize a new entry in the indirection array.
 * Returns a pointer to the new irec, inserted at erp_idx.
 */
xfs_ext_irec_t *
xfs_iext_irec_new(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	int		erp_idx)	/* index for new irec */
{
	xfs_ext_irec_t	*erp;		/* indirection array pointer */
	int		i;		/* loop counter */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;

	/* Resize indirection array */
	xfs_iext_realloc_indirect(ifp, ++nlists *
				  sizeof(xfs_ext_irec_t));
	/*
	 * Move records down in the array so the
	 * new page can use erp_idx.
	 */
	erp = ifp->if_u1.if_ext_irec;
	for (i = nlists - 1; i > erp_idx; i--) {
		memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
	}
	ASSERT(i == erp_idx);

	/* Initialize new extent record */
	erp = ifp->if_u1.if_ext_irec;
	erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
	/* Account for the new page's buffer in if_real_bytes */
	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
	memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
	erp[erp_idx].er_extcount = 0;
	/* The new page starts where its predecessor's records end */
	erp[erp_idx].er_extoff = erp_idx > 0 ?
		erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
	return (&erp[erp_idx]);
}
3904
/*
 * Remove a record from the indirection array.  Frees the entry's
 * extent buffer (unless a caller already freed it and NULLed
 * er_extbuf) and closes the gap in the array.
 */
void
xfs_iext_irec_remove(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	int		erp_idx)	/* irec index to remove */
{
	xfs_ext_irec_t	*erp;		/* indirection array pointer */
	int		i;		/* loop counter */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	erp = &ifp->if_u1.if_ext_irec[erp_idx];
	if (erp->er_extbuf) {
		/* Pull later pages' offsets back by the records going away */
		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
			-erp->er_extcount);
		kmem_free(erp->er_extbuf);
	}
	/* Compact extent records */
	erp = ifp->if_u1.if_ext_irec;
	for (i = erp_idx; i < nlists - 1; i++) {
		memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
	}
	/*
	 * Manually free the last extent record from the indirection
	 * array. A call to xfs_iext_realloc_indirect() with a size
	 * of zero would result in a call to xfs_iext_destroy() which
	 * would in turn call this function again, creating a nasty
	 * infinite loop.
	 */
	if (--nlists) {
		xfs_iext_realloc_indirect(ifp,
			nlists * sizeof(xfs_ext_irec_t));
	} else {
		kmem_free(ifp->if_u1.if_ext_irec);
	}
	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
}
3945
3946/*
3947 * This is called to clean up large amounts of unused memory allocated
3948 * by the indirection array. Before compacting anything though, verify
3949 * that the indirection array is still needed and switch back to the
3950 * linear extent list (or even the inline buffer) if possible. The
3951 * compaction policy is as follows:
3952 *
3953 * Full Compaction: Extents fit into a single page (or inline buffer)
3954 * Partial Compaction: Extents occupy less than 50% of allocated space
3955 * No Compaction: Extents occupy at least 50% of allocated space
3956 */
3957void
3958xfs_iext_irec_compact(
3959 xfs_ifork_t *ifp) /* inode fork pointer */
3960{
3961 xfs_extnum_t nextents; /* number of extents in file */
3962 int nlists; /* number of irec's (ex lists) */
3963
3964 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3965 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3966 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3967
3968 if (nextents == 0) {
3969 xfs_iext_destroy(ifp);
3970 } else if (nextents <= XFS_INLINE_EXTS) {
3971 xfs_iext_indirect_to_direct(ifp);
3972 xfs_iext_direct_to_inline(ifp, nextents);
3973 } else if (nextents <= XFS_LINEAR_EXTS) {
3974 xfs_iext_indirect_to_direct(ifp);
3975 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
3976 xfs_iext_irec_compact_pages(ifp);
3977 }
3978}
3979
/*
 * Combine extents from neighboring extent pages.
 */
void
xfs_iext_irec_compact_pages(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	xfs_ext_irec_t	*erp, *erp_next;/* pointers to irec entries */
	int		erp_idx = 0;	/* indirection array index */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	while (erp_idx < nlists - 1) {
		erp = &ifp->if_u1.if_ext_irec[erp_idx];
		erp_next = erp + 1;
		/* Merge the next page in only if it fits entirely */
		if (erp_next->er_extcount <=
		    (XFS_LINEAR_EXTS - erp->er_extcount)) {
			memcpy(&erp->er_extbuf[erp->er_extcount],
				erp_next->er_extbuf, erp_next->er_extcount *
				sizeof(xfs_bmbt_rec_t));
			erp->er_extcount += erp_next->er_extcount;
			/*
			 * Free page before removing extent record
			 * so er_extoffs don't get modified in
			 * xfs_iext_irec_remove.
			 */
			kmem_free(erp_next->er_extbuf);
			erp_next->er_extbuf = NULL;
			xfs_iext_irec_remove(ifp, erp_idx + 1);
			/* Array shrank: recompute count, stay on this page */
			nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
		} else {
			erp_idx++;
		}
	}
}
4016
4017/*
4018 * This is called to update the er_extoff field in the indirection
4019 * array when extents have been added or removed from one of the
4020 * extent lists. erp_idx contains the irec index to begin updating
4021 * at and ext_diff contains the number of extents that were added
4022 * or removed.
4023 */
4024void
4025xfs_iext_irec_update_extoffs(
4026 xfs_ifork_t *ifp, /* inode fork pointer */
4027 int erp_idx, /* irec index to update */
4028 int ext_diff) /* number of new extents */
4029{
4030 int i; /* loop counter */
4031 int nlists; /* number of irec's (ex lists */
4032
4033 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
4034 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
4035 for (i = erp_idx; i < nlists; i++) {
4036 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
4037 }
4038}
4039
4040/*
4041 * Test whether it is appropriate to check an inode for and free post EOF
4042 * blocks. The 'force' parameter determines whether we should also consider
4043 * regular files that are marked preallocated or append-only.
4044 */
4045bool
4046xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
4047{
4048 /* prealloc/delalloc exists only on regular files */
4049 if (!S_ISREG(ip->i_d.di_mode))
4050 return false;
4051
4052 /*
4053 * Zero sized files with no cached pages and delalloc blocks will not
4054 * have speculative prealloc/delalloc blocks to remove.
4055 */
4056 if (VFS_I(ip)->i_size == 0 &&
4057 VN_CACHED(VFS_I(ip)) == 0 &&
4058 ip->i_delayed_blks == 0)
4059 return false;
4060
4061 /* If we haven't read in the extent list, then don't do it now. */
4062 if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
4063 return false;
4064
4065 /*
4066 * Do not free real preallocated or append-only files unless the file
4067 * has delalloc blocks and we are forced to remove them.
4068 */
4069 if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
4070 if (!force || ip->i_delayed_blks == 0)
4071 return false;
4072
4073 return true;
4074}
4075
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index b55fd347ab5b..4a91358c1470 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -18,225 +18,15 @@
18#ifndef __XFS_INODE_H__ 18#ifndef __XFS_INODE_H__
19#define __XFS_INODE_H__ 19#define __XFS_INODE_H__
20 20
21struct posix_acl; 21#include "xfs_inode_buf.h"
22struct xfs_dinode; 22#include "xfs_inode_fork.h"
23struct xfs_inode;
24
25/*
26 * Fork identifiers.
27 */
28#define XFS_DATA_FORK 0
29#define XFS_ATTR_FORK 1
30
31/*
32 * The following xfs_ext_irec_t struct introduces a second (top) level
33 * to the in-core extent allocation scheme. These structs are allocated
34 * in a contiguous block, creating an indirection array where each entry
35 * (irec) contains a pointer to a buffer of in-core extent records which
36 * it manages. Each extent buffer is 4k in size, since 4k is the system
37 * page size on Linux i386 and systems with larger page sizes don't seem
38 * to gain much, if anything, by using their native page size as the
39 * extent buffer size. Also, using 4k extent buffers everywhere provides
40 * a consistent interface for CXFS across different platforms.
41 *
42 * There is currently no limit on the number of irec's (extent lists)
43 * allowed, so heavily fragmented files may require an indirection array
44 * which spans multiple system pages of memory. The number of extents
45 * which would require this amount of contiguous memory is very large
46 * and should not cause problems in the foreseeable future. However,
47 * if the memory needed for the contiguous array ever becomes a problem,
48 * it is possible that a third level of indirection may be required.
49 */
50typedef struct xfs_ext_irec {
51 xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */
52 xfs_extnum_t er_extoff; /* extent offset in file */
53 xfs_extnum_t er_extcount; /* number of extents in page/block */
54} xfs_ext_irec_t;
55 23
56/* 24/*
57 * File incore extent information, present for each of data & attr forks. 25 * Kernel only inode definitions
58 */ 26 */
59#define XFS_IEXT_BUFSZ 4096
60#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
61#define XFS_INLINE_EXTS 2
62#define XFS_INLINE_DATA 32
63typedef struct xfs_ifork {
64 int if_bytes; /* bytes in if_u1 */
65 int if_real_bytes; /* bytes allocated in if_u1 */
66 struct xfs_btree_block *if_broot; /* file's incore btree root */
67 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */
69 union {
70 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
71 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
72 char *if_data; /* inline file data */
73 } if_u1;
74 union {
75 xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
76 /* very small file extents */
77 char if_inline_data[XFS_INLINE_DATA];
78 /* very small file data */
79 xfs_dev_t if_rdev; /* dev number if special */
80 uuid_t if_uuid; /* mount point value */
81 } if_u2;
82} xfs_ifork_t;
83
84/*
85 * Inode location information. Stored in the inode and passed to
86 * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
87 */
88struct xfs_imap {
89 xfs_daddr_t im_blkno; /* starting BB of inode chunk */
90 ushort im_len; /* length in BBs of inode chunk */
91 ushort im_boffset; /* inode offset in block in bytes */
92};
93
94/*
95 * This is the xfs in-core inode structure.
96 * Most of the on-disk inode is embedded in the i_d field.
97 *
98 * The extent pointers/inline file space, however, are managed
99 * separately. The memory for this information is pointed to by
100 * the if_u1 unions depending on the type of the data.
101 * This is used to linearize the array of extents for fast in-core
102 * access. This is used until the file's number of extents
103 * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers
104 * are accessed through the buffer cache.
105 *
106 * Other state kept in the in-core inode is used for identification,
107 * locking, transactional updating, etc of the inode.
108 *
109 * Generally, we do not want to hold the i_rlock while holding the
110 * i_ilock. Hierarchy is i_iolock followed by i_rlock.
111 *
112 * xfs_iptr_t contains all the inode fields up to and including the
113 * i_mnext and i_mprev fields, it is used as a marker in the inode
114 * chain off the mount structure by xfs_sync calls.
115 */
116
117typedef struct xfs_ictimestamp {
118 __int32_t t_sec; /* timestamp seconds */
119 __int32_t t_nsec; /* timestamp nanoseconds */
120} xfs_ictimestamp_t;
121
122/*
123 * NOTE: This structure must be kept identical to struct xfs_dinode
124 * in xfs_dinode.h except for the endianness annotations.
125 */
126typedef struct xfs_icdinode {
127 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
128 __uint16_t di_mode; /* mode and type of file */
129 __int8_t di_version; /* inode version */
130 __int8_t di_format; /* format of di_c data */
131 __uint16_t di_onlink; /* old number of links to file */
132 __uint32_t di_uid; /* owner's user id */
133 __uint32_t di_gid; /* owner's group id */
134 __uint32_t di_nlink; /* number of links to file */
135 __uint16_t di_projid_lo; /* lower part of owner's project id */
136 __uint16_t di_projid_hi; /* higher part of owner's project id */
137 __uint8_t di_pad[6]; /* unused, zeroed space */
138 __uint16_t di_flushiter; /* incremented on flush */
139 xfs_ictimestamp_t di_atime; /* time last accessed */
140 xfs_ictimestamp_t di_mtime; /* time last modified */
141 xfs_ictimestamp_t di_ctime; /* time created/inode modified */
142 xfs_fsize_t di_size; /* number of bytes in file */
143 xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */
144 xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
145 xfs_extnum_t di_nextents; /* number of extents in data fork */
146 xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
147 __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
148 __int8_t di_aformat; /* format of attr fork's data */
149 __uint32_t di_dmevmask; /* DMIG event mask */
150 __uint16_t di_dmstate; /* DMIG state info */
151 __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
152 __uint32_t di_gen; /* generation number */
153
154 /* di_next_unlinked is the only non-core field in the old dinode */
155 xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */
156
157 /* start of the extended dinode, writable fields */
158 __uint32_t di_crc; /* CRC of the inode */
159 __uint64_t di_changecount; /* number of attribute changes */
160 xfs_lsn_t di_lsn; /* flush sequence */
161 __uint64_t di_flags2; /* more random flags */
162 __uint8_t di_pad2[16]; /* more padding for future expansion */
163
164 /* fields only written to during inode creation */
165 xfs_ictimestamp_t di_crtime; /* time created */
166 xfs_ino_t di_ino; /* inode number */
167 uuid_t di_uuid; /* UUID of the filesystem */
168
169 /* structure must be padded to 64 bit alignment */
170} xfs_icdinode_t;
171
172static inline uint xfs_icdinode_size(int version)
173{
174 if (version == 3)
175 return sizeof(struct xfs_icdinode);
176 return offsetof(struct xfs_icdinode, di_next_unlinked);
177}
178
179/*
180 * Flags for xfs_ichgtime().
181 */
182#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
183#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
184#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
185
186/*
187 * Per-fork incore inode flags.
188 */
189#define XFS_IFINLINE 0x01 /* Inline data is read in */
190#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
191#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
192#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
193
194/*
195 * Fork handling.
196 */
197
198#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0)
199#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3))
200
201#define XFS_IFORK_PTR(ip,w) \
202 ((w) == XFS_DATA_FORK ? \
203 &(ip)->i_df : \
204 (ip)->i_afp)
205#define XFS_IFORK_DSIZE(ip) \
206 (XFS_IFORK_Q(ip) ? \
207 XFS_IFORK_BOFF(ip) : \
208 XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
209#define XFS_IFORK_ASIZE(ip) \
210 (XFS_IFORK_Q(ip) ? \
211 XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
212 XFS_IFORK_BOFF(ip) : \
213 0)
214#define XFS_IFORK_SIZE(ip,w) \
215 ((w) == XFS_DATA_FORK ? \
216 XFS_IFORK_DSIZE(ip) : \
217 XFS_IFORK_ASIZE(ip))
218#define XFS_IFORK_FORMAT(ip,w) \
219 ((w) == XFS_DATA_FORK ? \
220 (ip)->i_d.di_format : \
221 (ip)->i_d.di_aformat)
222#define XFS_IFORK_FMT_SET(ip,w,n) \
223 ((w) == XFS_DATA_FORK ? \
224 ((ip)->i_d.di_format = (n)) : \
225 ((ip)->i_d.di_aformat = (n)))
226#define XFS_IFORK_NEXTENTS(ip,w) \
227 ((w) == XFS_DATA_FORK ? \
228 (ip)->i_d.di_nextents : \
229 (ip)->i_d.di_anextents)
230#define XFS_IFORK_NEXT_SET(ip,w,n) \
231 ((w) == XFS_DATA_FORK ? \
232 ((ip)->i_d.di_nextents = (n)) : \
233 ((ip)->i_d.di_anextents = (n)))
234#define XFS_IFORK_MAXEXT(ip, w) \
235 (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
236
237
238#ifdef __KERNEL__
239 27
28struct xfs_dinode;
29struct xfs_inode;
240struct xfs_buf; 30struct xfs_buf;
241struct xfs_bmap_free; 31struct xfs_bmap_free;
242struct xfs_bmbt_irec; 32struct xfs_bmbt_irec;
@@ -525,9 +315,21 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
525 ((pip)->i_d.di_mode & S_ISGID)) 315 ((pip)->i_d.di_mode & S_ISGID))
526 316
527 317
528/* 318int xfs_release(struct xfs_inode *ip);
529 * xfs_inode.c prototypes. 319int xfs_inactive(struct xfs_inode *ip);
530 */ 320int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
321 struct xfs_inode **ipp, struct xfs_name *ci_name);
322int xfs_create(struct xfs_inode *dp, struct xfs_name *name,
323 umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
324int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
325 struct xfs_inode *ip);
326int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
327 struct xfs_name *target_name);
328int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
329 struct xfs_inode *src_ip, struct xfs_inode *target_dp,
330 struct xfs_name *target_name,
331 struct xfs_inode *target_ip);
332
531void xfs_ilock(xfs_inode_t *, uint); 333void xfs_ilock(xfs_inode_t *, uint);
532int xfs_ilock_nowait(xfs_inode_t *, uint); 334int xfs_ilock_nowait(xfs_inode_t *, uint);
533void xfs_iunlock(xfs_inode_t *, uint); 335void xfs_iunlock(xfs_inode_t *, uint);
@@ -548,13 +350,28 @@ int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
548int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 350int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
549 351
550void xfs_iext_realloc(xfs_inode_t *, int, int); 352void xfs_iext_realloc(xfs_inode_t *, int, int);
353
551void xfs_iunpin_wait(xfs_inode_t *); 354void xfs_iunpin_wait(xfs_inode_t *);
355#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
356
552int xfs_iflush(struct xfs_inode *, struct xfs_buf **); 357int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
553void xfs_lock_inodes(xfs_inode_t **, int, uint); 358void xfs_lock_inodes(xfs_inode_t **, int, uint);
554void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 359void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
555 360
556xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); 361xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
557 362
363int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
364 xfs_nlink_t, xfs_dev_t, prid_t, int,
365 struct xfs_inode **, int *);
366int xfs_droplink(struct xfs_trans *, struct xfs_inode *);
367int xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
368void xfs_bump_ino_vers2(struct xfs_trans *, struct xfs_inode *);
369
370/* from xfs_file.c */
371int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
372int xfs_iozero(struct xfs_inode *, loff_t, size_t);
373
374
558#define IHOLD(ip) \ 375#define IHOLD(ip) \
559do { \ 376do { \
560 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ 377 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
@@ -568,65 +385,6 @@ do { \
568 iput(VFS_I(ip)); \ 385 iput(VFS_I(ip)); \
569} while (0) 386} while (0)
570 387
571#endif /* __KERNEL__ */
572
573/*
574 * Flags for xfs_iget()
575 */
576#define XFS_IGET_CREATE 0x1
577#define XFS_IGET_UNTRUSTED 0x2
578#define XFS_IGET_DONTCACHE 0x4
579
580int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
581 struct xfs_imap *, struct xfs_dinode **,
582 struct xfs_buf **, uint, uint);
583int xfs_iread(struct xfs_mount *, struct xfs_trans *,
584 struct xfs_inode *, uint);
585void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
586void xfs_dinode_to_disk(struct xfs_dinode *,
587 struct xfs_icdinode *);
588void xfs_idestroy_fork(struct xfs_inode *, int);
589void xfs_idata_realloc(struct xfs_inode *, int, int);
590void xfs_iroot_realloc(struct xfs_inode *, int, int);
591int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
592int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
593
594xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
595void xfs_iext_insert(xfs_inode_t *, xfs_extnum_t, xfs_extnum_t,
596 xfs_bmbt_irec_t *, int);
597void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int);
598void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int);
599void xfs_iext_remove(xfs_inode_t *, xfs_extnum_t, int, int);
600void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
601void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
602void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
603void xfs_iext_realloc_direct(xfs_ifork_t *, int);
604void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
605void xfs_iext_inline_to_direct(xfs_ifork_t *, int);
606void xfs_iext_destroy(xfs_ifork_t *);
607xfs_bmbt_rec_host_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *);
608xfs_ext_irec_t *xfs_iext_bno_to_irec(xfs_ifork_t *, xfs_fileoff_t, int *);
609xfs_ext_irec_t *xfs_iext_idx_to_irec(xfs_ifork_t *, xfs_extnum_t *, int *, int);
610void xfs_iext_irec_init(xfs_ifork_t *);
611xfs_ext_irec_t *xfs_iext_irec_new(xfs_ifork_t *, int);
612void xfs_iext_irec_remove(xfs_ifork_t *, int);
613void xfs_iext_irec_compact(xfs_ifork_t *);
614void xfs_iext_irec_compact_pages(xfs_ifork_t *);
615void xfs_iext_irec_compact_full(xfs_ifork_t *);
616void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
617bool xfs_can_free_eofblocks(struct xfs_inode *, bool);
618
619#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
620
621#if defined(DEBUG)
622void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
623#else
624#define xfs_inobp_check(mp, bp)
625#endif /* DEBUG */
626
627extern struct kmem_zone *xfs_ifork_zone;
628extern struct kmem_zone *xfs_inode_zone; 388extern struct kmem_zone *xfs_inode_zone;
629extern struct kmem_zone *xfs_ili_zone;
630extern const struct xfs_buf_ops xfs_inode_buf_ops;
631 389
632#endif /* __XFS_INODE_H__ */ 390#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
new file mode 100644
index 000000000000..e011d597f12f
--- /dev/null
+++ b/fs/xfs/xfs_inode_buf.c
@@ -0,0 +1,483 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_format.h"
21#include "xfs_log.h"
22#include "xfs_trans.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h"
26#include "xfs_bmap_btree.h"
27#include "xfs_ialloc_btree.h"
28#include "xfs_dinode.h"
29#include "xfs_inode.h"
30#include "xfs_error.h"
31#include "xfs_cksum.h"
32#include "xfs_icache.h"
33#include "xfs_ialloc.h"
34
35/*
36 * Check that none of the inode's in the buffer have a next
37 * unlinked field of 0.
38 */
39#if defined(DEBUG)
/*
 * Debug-only sanity check on an inode cluster buffer: walk every inode
 * slot and assert that no inode has a zero di_next_unlinked field.
 * Zero is never valid on disk for this field (the unlinked-list
 * terminator is a sentinel value, not 0 -- NOTE(review): presumably
 * NULLAGINO; confirm against the on-disk format headers).
 */
40void
41xfs_inobp_check(
42 xfs_mount_t *mp,
43 xfs_buf_t *bp)
44{
45 int i;
46 int j;
47 xfs_dinode_t *dip;
48
	/* j = number of inodes held in one cluster buffer */
49 j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
50
51 for (i = 0; i < j; i++) {
52 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
53 i * mp->m_sb.sb_inodesize);
54 if (!dip->di_next_unlinked) {
55 xfs_alert(mp,
56 "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
57 bp);
			/* trips in DEBUG builds; the alert above covers non-DEBUG */
58 ASSERT(dip->di_next_unlinked);
59 }
60 }
61}
62#endif
63
64/*
65 * If we are doing readahead on an inode buffer, we might be in log recovery
66 * reading an inode allocation buffer that hasn't yet been replayed, and hence
67 * has not had the inode cores stamped into it. Hence for readahead, the buffer
68 * may be potentially invalid.
69 *
70 * If the readahead buffer is invalid, we don't want to mark it with an error,
71 * but we do want to clear the DONE status of the buffer so that a followup read
72 * will re-read it from disk. This will ensure that we don't get an unnecessary
73 * warnings during log recovery and we don't get unnecssary panics on debug
74 * kernels.
75 */
/*
 * Verify an inode cluster buffer: check the magic number and version of
 * every inode slot.  For readahead I/O a failure only clears XBF_DONE so
 * the next real read retries from disk (see the comment above); for
 * normal I/O a failure marks the buffer EFSCORRUPTED and reports the
 * corruption.  XFS_TEST_ERROR also allows error injection here.
 */
76static void
77xfs_inode_buf_verify(
78 struct xfs_buf *bp,
79 bool readahead)
80{
81 struct xfs_mount *mp = bp->b_target->bt_mount;
82 int i;
83 int ni;
84
85 /*
86 * Validate the magic number and version of every inode in the buffer
87 */
88 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
89 for (i = 0; i < ni; i++) {
90 int di_ok;
91 xfs_dinode_t *dip;
92
93 dip = (struct xfs_dinode *)xfs_buf_offset(bp,
94 (i << mp->m_sb.sb_inodelog));
95 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
96 XFS_DINODE_GOOD_VERSION(dip->di_version);
97 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
98 XFS_ERRTAG_ITOBP_INOTOBP,
99 XFS_RANDOM_ITOBP_INOTOBP))) {
			/* readahead: silently force a future re-read */
100 if (readahead) {
101 bp->b_flags &= ~XBF_DONE;
102 return;
103 }
104
105 xfs_buf_ioerror(bp, EFSCORRUPTED);
106 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
107 mp, dip);
108#ifdef DEBUG
109 xfs_emerg(mp,
110 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
111 (unsigned long long)bp->b_bn, i,
112 be16_to_cpu(dip->di_magic));
113 ASSERT(0);
114#endif
115 }
116 }
	/* extra DEBUG-only next_unlinked sanity check (no-op otherwise) */
117 xfs_inobp_check(mp, bp);
118}
119
120
/* ->verify_read callback for normal (non-readahead) inode buffer reads. */
121static void
122xfs_inode_buf_read_verify(
123 struct xfs_buf *bp)
124{
125 xfs_inode_buf_verify(bp, false);
126}
127
/* ->verify_read callback for readahead: failures only clear XBF_DONE. */
128static void
129xfs_inode_buf_readahead_verify(
130 struct xfs_buf *bp)
131{
132 xfs_inode_buf_verify(bp, true);
133}
134
/* ->verify_write callback: same full verification as a normal read. */
135static void
136xfs_inode_buf_write_verify(
137 struct xfs_buf *bp)
138{
139 xfs_inode_buf_verify(bp, false);
140}
141
/* Buffer ops for normal inode cluster buffer I/O. */
142const struct xfs_buf_ops xfs_inode_buf_ops = {
143 .verify_read = xfs_inode_buf_read_verify,
144 .verify_write = xfs_inode_buf_write_verify,
145};
146
/* Buffer ops for inode readahead: tolerant read verifier, strict write. */
147const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
148 .verify_read = xfs_inode_buf_readahead_verify,
149 .verify_write = xfs_inode_buf_write_verify,
150};
151
152
153/*
154 * This routine is called to map an inode to the buffer containing the on-disk
155 * version of the inode. It returns a pointer to the buffer containing the
156 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
157 * pointer to the on-disk inode within that buffer.
158 *
159 * If a non-zero error is returned, then the contents of bpp and dipp are
160 * undefined.
161 */
162int
163xfs_imap_to_bp(
164 struct xfs_mount *mp,
165 struct xfs_trans *tp,
166 struct xfs_imap *imap,
167 struct xfs_dinode **dipp,
168 struct xfs_buf **bpp,
169 uint buf_flags,
170 uint iget_flags)
171{
172 struct xfs_buf *bp;
173 int error;
174
	/* inode buffers are always read unmapped (accessed via xfs_buf_offset) */
175 buf_flags |= XBF_UNMAPPED;
176 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
177 (int)imap->im_len, buf_flags, &bp,
178 &xfs_inode_buf_ops);
179 if (error) {
		/* EAGAIN is expected (and quiet) for trylock attempts */
180 if (error == EAGAIN) {
181 ASSERT(buf_flags & XBF_TRYLOCK);
182 return error;
183 }
184
		/*
		 * An untrusted inode number may simply be garbage; report
		 * EINVAL rather than shouting about on-disk corruption.
		 */
185 if (error == EFSCORRUPTED &&
186 (iget_flags & XFS_IGET_UNTRUSTED))
187 return XFS_ERROR(EINVAL);
188
189 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
190 __func__, error);
191 return error;
192 }
193
	/* success: hand back the buffer and the dinode within it */
194 *bpp = bp;
195 *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
196 return 0;
197}
198
/*
 * Copy the on-disk inode core (big-endian) into the in-core, host-endian
 * xfs_icdinode.  Field-for-field; the v3 (CRC-enabled) fields are only
 * copied when the on-disk inode says it is version 3.
 */
199STATIC void
200xfs_dinode_from_disk(
201 xfs_icdinode_t *to,
202 xfs_dinode_t *from)
203{
204 to->di_magic = be16_to_cpu(from->di_magic);
205 to->di_mode = be16_to_cpu(from->di_mode);
206 to->di_version = from ->di_version;
207 to->di_format = from->di_format;
208 to->di_onlink = be16_to_cpu(from->di_onlink);
209 to->di_uid = be32_to_cpu(from->di_uid);
210 to->di_gid = be32_to_cpu(from->di_gid);
211 to->di_nlink = be32_to_cpu(from->di_nlink);
212 to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
213 to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
214 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
215 to->di_flushiter = be16_to_cpu(from->di_flushiter);
216 to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
217 to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
218 to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
219 to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
220 to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
221 to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
222 to->di_size = be64_to_cpu(from->di_size);
223 to->di_nblocks = be64_to_cpu(from->di_nblocks);
224 to->di_extsize = be32_to_cpu(from->di_extsize);
225 to->di_nextents = be32_to_cpu(from->di_nextents);
226 to->di_anextents = be16_to_cpu(from->di_anextents);
227 to->di_forkoff = from->di_forkoff;
228 to->di_aformat = from->di_aformat;
229 to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
230 to->di_dmstate = be16_to_cpu(from->di_dmstate);
231 to->di_flags = be16_to_cpu(from->di_flags);
232 to->di_gen = be32_to_cpu(from->di_gen);
233
	/* v3-only fields: change count, create time, flags2, ino, LSN, UUID */
234 if (to->di_version == 3) {
235 to->di_changecount = be64_to_cpu(from->di_changecount);
236 to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
237 to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
238 to->di_flags2 = be64_to_cpu(from->di_flags2);
239 to->di_ino = be64_to_cpu(from->di_ino);
240 to->di_lsn = be64_to_cpu(from->di_lsn);
241 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
242 uuid_copy(&to->di_uuid, &from->di_uuid);
243 }
244}
245
/*
 * Copy the in-core xfs_icdinode back out to the big-endian on-disk inode
 * core.  Inverse of xfs_dinode_from_disk().  Note that for v3 inodes
 * di_flushiter is written as zero -- the field is not used in the v3
 * format (NOTE(review): presumably superseded by the per-inode LSN;
 * confirm against the on-disk format docs).
 */
246void
247xfs_dinode_to_disk(
248 xfs_dinode_t *to,
249 xfs_icdinode_t *from)
250{
251 to->di_magic = cpu_to_be16(from->di_magic);
252 to->di_mode = cpu_to_be16(from->di_mode);
253 to->di_version = from ->di_version;
254 to->di_format = from->di_format;
255 to->di_onlink = cpu_to_be16(from->di_onlink);
256 to->di_uid = cpu_to_be32(from->di_uid);
257 to->di_gid = cpu_to_be32(from->di_gid);
258 to->di_nlink = cpu_to_be32(from->di_nlink);
259 to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
260 to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
261 memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
262 to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
263 to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
264 to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
265 to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
266 to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
267 to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
268 to->di_size = cpu_to_be64(from->di_size);
269 to->di_nblocks = cpu_to_be64(from->di_nblocks);
270 to->di_extsize = cpu_to_be32(from->di_extsize);
271 to->di_nextents = cpu_to_be32(from->di_nextents);
272 to->di_anextents = cpu_to_be16(from->di_anextents);
273 to->di_forkoff = from->di_forkoff;
274 to->di_aformat = from->di_aformat;
275 to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
276 to->di_dmstate = cpu_to_be16(from->di_dmstate);
277 to->di_flags = cpu_to_be16(from->di_flags);
278 to->di_gen = cpu_to_be32(from->di_gen);
279
280 if (from->di_version == 3) {
281 to->di_changecount = cpu_to_be64(from->di_changecount);
282 to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
283 to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
284 to->di_flags2 = cpu_to_be64(from->di_flags2);
285 to->di_ino = cpu_to_be64(from->di_ino);
286 to->di_lsn = cpu_to_be64(from->di_lsn);
287 memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
288 uuid_copy(&to->di_uuid, &from->di_uuid);
		/* v3 inodes do not use the flush iteration counter */
289 to->di_flushiter = 0;
290 } else {
291 to->di_flushiter = cpu_to_be16(from->di_flushiter);
292 }
293}
294
/*
 * Verify a single on-disk inode: magic number for all versions; for v3
 * inodes additionally the CRC, the self-describing inode number, and the
 * filesystem UUID.  Returns true if the dinode looks valid.
 */
295static bool
296xfs_dinode_verify(
297 struct xfs_mount *mp,
298 struct xfs_inode *ip,
299 struct xfs_dinode *dip)
300{
301 if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
302 return false;
303
304 /* only version 3 or greater inodes are extensively verified here */
305 if (dip->di_version < 3)
306 return true;
307
	/* a v3 inode on a non-CRC filesystem is itself corruption */
308 if (!xfs_sb_version_hascrc(&mp->m_sb))
309 return false;
310 if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
311 offsetof(struct xfs_dinode, di_crc)))
312 return false;
313 if (be64_to_cpu(dip->di_ino) != ip->i_ino)
314 return false;
315 if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
316 return false;
317 return true;
318}
319
/*
 * Compute and stamp the CRC for a v3 on-disk inode.  No-op for v1/v2
 * inodes, which carry no CRC field.
 */
320void
321xfs_dinode_calc_crc(
322 struct xfs_mount *mp,
323 struct xfs_dinode *dip)
324{
325 __uint32_t crc;
326
327 if (dip->di_version < 3)
328 return;
329
330 ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
	/* CRC covers the whole inode with the di_crc field itself zeroed */
331 crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
332 offsetof(struct xfs_dinode, di_crc));
333 dip->di_crc = xfs_end_cksum(crc);
334}
335
336/*
337 * Read the disk inode attributes into the in-core inode structure.
338 *
339 * For version 5 superblocks, if we are initialising a new inode and we are not
340 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simply build the new
341 * inode core with a random generation number. If we are keeping inodes around,
342 * we need to read the inode cluster to get the existing generation number off
343 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
344 * format) then log recovery is dependent on the di_flushiter field being
345 * initialised from the current on-disk value and hence we must also read the
346 * inode off disk.
347 */
348int
349xfs_iread(
350 xfs_mount_t *mp,
351 xfs_trans_t *tp,
352 xfs_inode_t *ip,
353 uint iget_flags)
354{
355 xfs_buf_t *bp;
356 xfs_dinode_t *dip;
357 int error;
358
359 /*
360 * Fill in the location information in the in-core inode.
361 */
362 error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
363 if (error)
364 return error;
365
366 /* shortcut IO on inode allocation if possible */
367 if ((iget_flags & XFS_IGET_CREATE) &&
368 xfs_sb_version_hascrc(&mp->m_sb) &&
369 !(mp->m_flags & XFS_MOUNT_IKEEP)) {
370 /* initialise the on-disk inode core */
371 memset(&ip->i_d, 0, sizeof(ip->i_d));
372 ip->i_d.di_magic = XFS_DINODE_MAGIC;
373 ip->i_d.di_gen = prandom_u32();
374 if (xfs_sb_version_hascrc(&mp->m_sb)) {
375 ip->i_d.di_version = 3;
376 ip->i_d.di_ino = ip->i_ino;
377 uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
378 } else
379 ip->i_d.di_version = 2;
380 return 0;
381 }
382
383 /*
384 * Get pointers to the on-disk inode and the buffer containing it.
385 */
386 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
387 if (error)
388 return error;
389
390 /* even unallocated inodes are verified */
391 if (!xfs_dinode_verify(mp, ip, dip)) {
392 xfs_alert(mp, "%s: validation failed for inode %lld failed",
393 __func__, ip->i_ino);
394
395 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
396 error = XFS_ERROR(EFSCORRUPTED);
397 goto out_brelse;
398 }
399
400 /*
401 * If the on-disk inode is already linked to a directory
402 * entry, copy all of the inode into the in-core inode.
403 * xfs_iformat_fork() handles copying in the inode format
404 * specific information.
405 * Otherwise, just get the truly permanent information.
406 */
407 if (dip->di_mode) {
408 xfs_dinode_from_disk(&ip->i_d, dip);
409 error = xfs_iformat_fork(ip, dip);
410 if (error) {
411#ifdef DEBUG
412 xfs_alert(mp, "%s: xfs_iformat() returned error %d",
413 __func__, error);
414#endif /* DEBUG */
415 goto out_brelse;
416 }
417 } else {
418 /*
419 * Partial initialisation of the in-core inode. Just the bits
420 * that xfs_ialloc won't overwrite or relies on being correct.
421 */
422 ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
423 ip->i_d.di_version = dip->di_version;
424 ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
425 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
426
427 if (dip->di_version == 3) {
428 ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
429 uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
430 }
431
432 /*
433 * Make sure to pull in the mode here as well in
434 * case the inode is released without being used.
435 * This ensures that xfs_inactive() will see that
436 * the inode is already free and not try to mess
437 * with the uninitialized part of it.
438 */
439 ip->i_d.di_mode = 0;
440 }
441
442 /*
443 * The inode format changed when we moved the link count and
444 * made it 32 bits long. If this is an old format inode,
445 * convert it in memory to look like a new one. If it gets
446 * flushed to disk we will convert back before flushing or
447 * logging it. We zero out the new projid field and the old link
448 * count field. We'll handle clearing the pad field (the remains
449 * of the old uuid field) when we actually convert the inode to
450 * the new format. We don't change the version number so that we
451 * can distinguish this from a real new format inode.
452 */
453 if (ip->i_d.di_version == 1) {
454 ip->i_d.di_nlink = ip->i_d.di_onlink;
455 ip->i_d.di_onlink = 0;
456 xfs_set_projid(ip, 0);
457 }
458
459 ip->i_delayed_blks = 0;
460
461 /*
462 * Mark the buffer containing the inode as something to keep
463 * around for a while. This helps to keep recently accessed
464 * meta-data in-core longer.
465 */
466 xfs_buf_set_ref(bp, XFS_INO_REF);
467
468 /*
469 * Use xfs_trans_brelse() to release the buffer containing the on-disk
470 * inode, because it was acquired with xfs_trans_read_buf() in
471 * xfs_imap_to_bp() above. If tp is NULL, this is just a normal
472 * brelse(). If we're within a transaction, then xfs_trans_brelse()
473 * will only release the buffer if it is not dirty within the
474 * transaction. It will be OK to release the buffer in this case,
475 * because inodes on disk are never destroyed and we will be locking the
476 * new in-core inode before putting it in the cache where other
477 * processes can find it. Thus we don't have to worry about the inode
478 * being changed just because we released the buffer.
479 */
480 out_brelse:
481 xfs_trans_brelse(tp, bp);
482 return error;
483}
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h
new file mode 100644
index 000000000000..599e6c0ca2a9
--- /dev/null
+++ b/fs/xfs/xfs_inode_buf.h
@@ -0,0 +1,53 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_INODE_BUF_H__
19#define __XFS_INODE_BUF_H__
20
21struct xfs_inode;
22struct xfs_dinode;
23struct xfs_icdinode;
24
25/*
26 * Inode location information. Stored in the inode and passed to
27 * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
28 */
29struct xfs_imap {
30 xfs_daddr_t im_blkno; /* starting BB of inode chunk */
31 ushort im_len; /* length in BBs of inode chunk */
32 ushort im_boffset; /* inode offset in block in bytes */
33};
34
35int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
36 struct xfs_imap *, struct xfs_dinode **,
37 struct xfs_buf **, uint, uint);
38int xfs_iread(struct xfs_mount *, struct xfs_trans *,
39 struct xfs_inode *, uint);
40void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
41void xfs_dinode_to_disk(struct xfs_dinode *,
42 struct xfs_icdinode *);
43
44#if defined(DEBUG)
45void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
46#else
47#define xfs_inobp_check(mp, bp)
48#endif /* DEBUG */
49
50extern const struct xfs_buf_ops xfs_inode_buf_ops;
51extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
52
53#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
new file mode 100644
index 000000000000..02f1083955bb
--- /dev/null
+++ b/fs/xfs/xfs_inode_fork.c
@@ -0,0 +1,1920 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include <linux/log2.h>
19
20#include "xfs.h"
21#include "xfs_fs.h"
22#include "xfs_format.h"
23#include "xfs_log.h"
24#include "xfs_inum.h"
25#include "xfs_trans.h"
26#include "xfs_trans_priv.h"
27#include "xfs_sb.h"
28#include "xfs_ag.h"
29#include "xfs_mount.h"
30#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_attr_sf.h"
34#include "xfs_dinode.h"
35#include "xfs_inode.h"
36#include "xfs_buf_item.h"
37#include "xfs_inode_item.h"
38#include "xfs_btree.h"
39#include "xfs_alloc.h"
40#include "xfs_ialloc.h"
41#include "xfs_bmap.h"
42#include "xfs_error.h"
43#include "xfs_quota.h"
44#include "xfs_filestream.h"
45#include "xfs_cksum.h"
46#include "xfs_trace.h"
47#include "xfs_icache.h"
48
49kmem_zone_t *xfs_ifork_zone;
50
51STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
52STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
53STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
54
55#ifdef DEBUG
56/*
57 * Make sure that the extents in the given memory buffer
58 * are valid.
59 */
60void
61xfs_validate_extents(
62 xfs_ifork_t *ifp,
63 int nrecs,
64 xfs_exntfmt_t fmt)
65{
66 xfs_bmbt_irec_t irec;
67 xfs_bmbt_rec_host_t rec;
68 int i;
69
70 for (i = 0; i < nrecs; i++) {
		/* get_unaligned: extent records may not be naturally aligned */
71 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
72 rec.l0 = get_unaligned(&ep->l0);
73 rec.l1 = get_unaligned(&ep->l1);
74 xfs_bmbt_get_all(&rec, &irec);
		/* NOSTATE format must not contain unwritten (non-NORM) extents */
75 if (fmt == XFS_EXTFMT_NOSTATE)
76 ASSERT(irec.br_state == XFS_EXT_NORM);
77 }
78}
79#else /* DEBUG */
80#define xfs_validate_extents(ifp, nrecs, fmt)
81#endif /* DEBUG */
82
83
84/*
85 * Move inode type and inode format specific information from the
86 * on-disk inode to the in-core inode. For fifos, devs, and sockets
87 * this means set if_rdev to the proper value. For files, directories,
88 * and symlinks this means to bring in the in-line data or extent
89 * pointers. For a file in B-tree format, only the root is immediately
90 * brought in-core. The rest will be in-lined in if_extents when it
91 * is first referenced (see xfs_iread_extents()).
92 */
93int
94xfs_iformat_fork(
95 xfs_inode_t *ip,
96 xfs_dinode_t *dip)
97{
98 xfs_attr_shortform_t *atp;
99 int size;
100 int error = 0;
101 xfs_fsize_t di_size;
102
	/*
	 * Sanity: total extent count across both forks can never exceed the
	 * number of blocks owned by the inode (also guards the sum against
	 * wrapping -- see the "check for underflow" fix in this series).
	 */
103 if (unlikely(be32_to_cpu(dip->di_nextents) +
104 be16_to_cpu(dip->di_anextents) >
105 be64_to_cpu(dip->di_nblocks))) {
106 xfs_warn(ip->i_mount,
107 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
108 (unsigned long long)ip->i_ino,
109 (int)(be32_to_cpu(dip->di_nextents) +
110 be16_to_cpu(dip->di_anextents)),
111 (unsigned long long)
112 be64_to_cpu(dip->di_nblocks));
113 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
114 ip->i_mount, dip);
115 return XFS_ERROR(EFSCORRUPTED);
116 }
117
	/* the attr fork offset must lie inside the inode */
118 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
119 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
120 (unsigned long long)ip->i_ino,
121 dip->di_forkoff);
122 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
123 ip->i_mount, dip);
124 return XFS_ERROR(EFSCORRUPTED);
125 }
126
	/* a realtime inode on a filesystem with no realtime device is bogus */
127 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
128 !ip->i_mount->m_rtdev_targp)) {
129 xfs_warn(ip->i_mount,
130 "corrupt dinode %Lu, has realtime flag set.",
131 ip->i_ino);
132 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
133 XFS_ERRLEVEL_LOW, ip->i_mount, dip);
134 return XFS_ERROR(EFSCORRUPTED);
135 }
136
	/* data fork: dispatch on file type, then on-disk fork format */
137 switch (ip->i_d.di_mode & S_IFMT) {
138 case S_IFIFO:
139 case S_IFCHR:
140 case S_IFBLK:
141 case S_IFSOCK:
		/* special files carry only a device number, FMT_DEV */
142 if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
143 XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
144 ip->i_mount, dip);
145 return XFS_ERROR(EFSCORRUPTED);
146 }
147 ip->i_d.di_size = 0;
148 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
149 break;
150
151 case S_IFREG:
152 case S_IFLNK:
153 case S_IFDIR:
154 switch (dip->di_format) {
155 case XFS_DINODE_FMT_LOCAL:
156 /*
157 * no local regular files yet
158 */
159 if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
160 xfs_warn(ip->i_mount,
161 "corrupt inode %Lu (local format for regular file).",
162 (unsigned long long) ip->i_ino);
163 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
164 XFS_ERRLEVEL_LOW,
165 ip->i_mount, dip);
166 return XFS_ERROR(EFSCORRUPTED);
167 }
168
169 di_size = be64_to_cpu(dip->di_size);
170 if (unlikely(di_size < 0 ||
171 di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
172 xfs_warn(ip->i_mount,
173 "corrupt inode %Lu (bad size %Ld for local inode).",
174 (unsigned long long) ip->i_ino,
175 (long long) di_size);
176 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
177 XFS_ERRLEVEL_LOW,
178 ip->i_mount, dip);
179 return XFS_ERROR(EFSCORRUPTED);
180 }
181
182 size = (int)di_size;
183 error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
184 break;
185 case XFS_DINODE_FMT_EXTENTS:
186 error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
187 break;
188 case XFS_DINODE_FMT_BTREE:
189 error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
190 break;
191 default:
192 XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
193 ip->i_mount);
194 return XFS_ERROR(EFSCORRUPTED);
195 }
196 break;
197
198 default:
199 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
200 return XFS_ERROR(EFSCORRUPTED);
201 }
202 if (error) {
203 return error;
204 }
	/* no attribute fork? then we are done */
205 if (!XFS_DFORK_Q(dip))
206 return 0;
207
	/* attr fork: allocate the incore fork, then dispatch on its format */
208 ASSERT(ip->i_afp == NULL);
209 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
210
211 switch (dip->di_aformat) {
212 case XFS_DINODE_FMT_LOCAL:
213 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
214 size = be16_to_cpu(atp->hdr.totsize);
215
		/* underflow guard: totsize must at least cover the sf header */
216 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
217 xfs_warn(ip->i_mount,
218 "corrupt inode %Lu (bad attr fork size %Ld).",
219 (unsigned long long) ip->i_ino,
220 (long long) size);
221 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
222 XFS_ERRLEVEL_LOW,
223 ip->i_mount, dip);
224 return XFS_ERROR(EFSCORRUPTED);
225 }
226
227 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
228 break;
229 case XFS_DINODE_FMT_EXTENTS:
230 error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
231 break;
232 case XFS_DINODE_FMT_BTREE:
233 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
234 break;
235 default:
236 error = XFS_ERROR(EFSCORRUPTED);
237 break;
238 }
	/* on attr fork failure, unwind both the attr fork and the data fork */
239 if (error) {
240 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
241 ip->i_afp = NULL;
242 xfs_idestroy_fork(ip, XFS_DATA_FORK);
243 }
244 return error;
245}
246
247/*
248 * The file is in-lined in the on-disk inode.
249 * If it fits into if_inline_data, then copy
250 * it there, otherwise allocate a buffer for it
251 * and copy the data there. Either way, set
252 * if_data to point at the data.
253 * If we allocate a buffer for the data, make
254 * sure that its size is a multiple of 4 and
255 * record the real size in i_real_bytes.
256 */
257STATIC int
258xfs_iformat_local(
259 xfs_inode_t *ip,
260 xfs_dinode_t *dip,
261 int whichfork,
262 int size)
263{
264 xfs_ifork_t *ifp;
265 int real_size;
266
267 /*
268 * If the size is unreasonable, then something
269 * is wrong and we just bail out rather than crash in
270 * kmem_alloc() or memcpy() below.
271 */
272 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
273 xfs_warn(ip->i_mount,
274 "corrupt inode %Lu (bad size %d for local fork, size = %d).",
275 (unsigned long long) ip->i_ino, size,
276 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
277 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
278 ip->i_mount, dip);
279 return XFS_ERROR(EFSCORRUPTED);
280 }
281 ifp = XFS_IFORK_PTR(ip, whichfork);
282 real_size = 0;
	/*
	 * Three cases: empty fork, data small enough for the inline buffer,
	 * or a separate allocation rounded up to a multiple of 4 bytes.
	 */
283 if (size == 0)
284 ifp->if_u1.if_data = NULL;
285 else if (size <= sizeof(ifp->if_u2.if_inline_data))
286 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
287 else {
288 real_size = roundup(size, 4);
289 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
290 }
291 ifp->if_bytes = size;
292 ifp->if_real_bytes = real_size;
293 if (size)
294 memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
	/* mark the fork inline; it cannot simultaneously hold extents */
295 ifp->if_flags &= ~XFS_IFEXTENTS;
296 ifp->if_flags |= XFS_IFINLINE;
297 return 0;
298}
299
/*
 * The file consists of a set of extents all
 * of which fit into the on-disk inode.
 * If there are few enough extents to fit into
 * the if_inline_ext, then copy them there.
 * Otherwise allocate a buffer for them and copy
 * them into it.  Either way, set if_extents
 * to point at the extents.
 *
 * Returns 0 on success or EFSCORRUPTED when the on-disk extent
 * count/size fails validation.
 */
STATIC int
xfs_iformat_extents(
	xfs_inode_t	*ip,		/* in-core inode */
	xfs_dinode_t	*dip,		/* on-disk inode being read in */
	int		whichfork)	/* XFS_DATA_FORK or XFS_ATTR_FORK */
{
	xfs_bmbt_rec_t	*dp;		/* on-disk extent record cursor */
	xfs_ifork_t	*ifp;
	int		nex;		/* number of extents on disk */
	int		size;		/* bytes needed for in-core copy */
	int		i;

	ifp = XFS_IFORK_PTR(ip, whichfork);
	nex = XFS_DFORK_NEXTENTS(dip, whichfork);
	size = nex * (uint)sizeof(xfs_bmbt_rec_t);

	/*
	 * If the number of extents is unreasonable, then something
	 * is wrong and we just bail out rather than crash in
	 * kmem_alloc() or memcpy() below.
	 */
	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
		xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
			(unsigned long long) ip->i_ino, nex);
		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	ifp->if_real_bytes = 0;
	if (nex == 0)
		ifp->if_u1.if_extents = NULL;
	else if (nex <= XFS_INLINE_EXTS)
		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
	else
		xfs_iext_add(ifp, 0, nex);	/* allocates the extent list */

	ifp->if_bytes = size;
	if (size) {
		dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
		xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
		/* byte-swap each record from on-disk big-endian format */
		for (i = 0; i < nex; i++, dp++) {
			xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
			ep->l0 = get_unaligned_be64(&dp->l0);
			ep->l1 = get_unaligned_be64(&dp->l1);
		}
		XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
		/* attr forks and NOSTATE inodes may not carry state bits */
		if (whichfork != XFS_DATA_FORK ||
			XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
				if (unlikely(xfs_check_nostate_extents(
				    ifp, 0, nex))) {
					XFS_ERROR_REPORT("xfs_iformat_extents(2)",
							 XFS_ERRLEVEL_LOW,
							 ip->i_mount);
					return XFS_ERROR(EFSCORRUPTED);
				}
	}
	ifp->if_flags |= XFS_IFEXTENTS;
	return 0;
}
369
/*
 * The file has too many extents to fit into
 * the inode, so they are in B-tree format.
 * Allocate a buffer for the root of the B-tree
 * and copy the root into it.  The i_extents
 * field will remain NULL until all of the
 * extents are read in (when they are needed).
 *
 * Returns 0 on success or EFSCORRUPTED if the root block fails
 * the sanity checks below.
 */
STATIC int
xfs_iformat_btree(
	xfs_inode_t		*ip,	/* in-core inode */
	xfs_dinode_t		*dip,	/* on-disk inode being read in */
	int			whichfork)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_bmdr_block_t	*dfp;	/* on-disk btree root */
	xfs_ifork_t		*ifp;
	/* REFERENCED */
	int			nrecs;
	int			size;	/* in-core root size */

	ifp = XFS_IFORK_PTR(ip, whichfork);
	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
	size = XFS_BMAP_BROOT_SPACE(mp, dfp);
	nrecs = be16_to_cpu(dfp->bb_numrecs);

	/*
	 * blow out if -- fork has less extents than can fit in
	 * fork (fork shouldn't be a btree format), root btree
	 * block has more records than can fit into the fork,
	 * or the number of extents is greater than the number of
	 * blocks.
	 */
	if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
		     XFS_IFORK_MAXEXT(ip, whichfork) ||
		     XFS_BMDR_SPACE_CALC(nrecs) >
		     XFS_DFORK_SIZE(dip, mp, whichfork) ||
		     XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
		xfs_warn(mp, "corrupt inode %Lu (btree).",
			(unsigned long long) ip->i_ino);
		XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
				     mp, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	ifp->if_broot_bytes = size;
	ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
	ASSERT(ifp->if_broot != NULL);
	/*
	 * Copy and convert from the on-disk structure
	 * to the in-memory structure.
	 */
	xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
			 ifp->if_broot, size);
	/* leaf extents not in core yet; xfs_iread_extents() loads them */
	ifp->if_flags &= ~XFS_IFEXTENTS;
	ifp->if_flags |= XFS_IFBROOT;

	return 0;
}
429
430/*
431 * Read in extents from a btree-format inode.
432 * Allocate and fill in if_extents. Real work is done in xfs_bmap.c.
433 */
434int
435xfs_iread_extents(
436 xfs_trans_t *tp,
437 xfs_inode_t *ip,
438 int whichfork)
439{
440 int error;
441 xfs_ifork_t *ifp;
442 xfs_extnum_t nextents;
443
444 if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
445 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
446 ip->i_mount);
447 return XFS_ERROR(EFSCORRUPTED);
448 }
449 nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
450 ifp = XFS_IFORK_PTR(ip, whichfork);
451
452 /*
453 * We know that the size is valid (it's checked in iformat_btree)
454 */
455 ifp->if_bytes = ifp->if_real_bytes = 0;
456 ifp->if_flags |= XFS_IFEXTENTS;
457 xfs_iext_add(ifp, 0, nextents);
458 error = xfs_bmap_read_extents(tp, ip, whichfork);
459 if (error) {
460 xfs_iext_destroy(ifp);
461 ifp->if_flags &= ~XFS_IFEXTENTS;
462 return error;
463 }
464 xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
465 return 0;
466}
/*
 * Reallocate the space for if_broot based on the number of records
 * being added or deleted as indicated in rec_diff.  Move the records
 * and pointers in if_broot to fit the new size.  When shrinking this
 * will eliminate holes between the records and pointers created by
 * the caller.  When growing this will create holes to be filled in
 * by the caller.
 *
 * The caller must not request to add more records than would fit in
 * the on-disk inode root.  If the if_broot is currently NULL, then
 * if we are adding records, one will be allocated.  The caller must also
 * not request that the number of records go below zero, although
 * it can go to zero.
 *
 * ip -- the inode whose if_broot area is changing
 * rec_diff -- the change in the number of records, positive or negative,
 *	 requested for the if_broot array.
 */
void
xfs_iroot_realloc(
	xfs_inode_t		*ip,
	int			rec_diff,
	int			whichfork)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			cur_max;	/* current record capacity */
	xfs_ifork_t		*ifp;
	struct xfs_btree_block	*new_broot;
	int			new_max;	/* new record capacity */
	size_t			new_size;	/* new root size in bytes */
	char			*np;		/* new pointer-area address */
	char			*op;		/* old pointer-area address */

	/*
	 * Handle the degenerate case quietly.
	 */
	if (rec_diff == 0) {
		return;
	}

	ifp = XFS_IFORK_PTR(ip, whichfork);
	if (rec_diff > 0) {
		/*
		 * If there wasn't any memory allocated before, just
		 * allocate it now and get out.
		 */
		if (ifp->if_broot_bytes == 0) {
			new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
			ifp->if_broot_bytes = (int)new_size;
			return;
		}

		/*
		 * If there is already an existing if_broot, then we need
		 * to realloc() it and shift the pointers to their new
		 * location.  The records don't change location because
		 * they are kept butted up against the btree block header.
		 */
		cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
		new_max = cur_max + rec_diff;
		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
				XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max),
				KM_SLEEP | KM_NOFS);
		/*
		 * Compute op from the old size (if_broot_bytes is still
		 * the old value here) and np from the new size, *before*
		 * updating if_broot_bytes -- the ordering matters.
		 */
		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
						     ifp->if_broot_bytes);
		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
						     (int)new_size);
		ifp->if_broot_bytes = (int)new_size;
		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
			XFS_IFORK_SIZE(ip, whichfork));
		/* slide the block pointers up to their new location */
		memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
		return;
	}

	/*
	 * rec_diff is less than 0.  In this case, we are shrinking the
	 * if_broot buffer.  It must already exist.  If we go to zero
	 * records, just get rid of the root and clear the status bit.
	 */
	ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
	cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
	new_max = cur_max + rec_diff;
	ASSERT(new_max >= 0);
	if (new_max > 0)
		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
	else
		new_size = 0;
	if (new_size > 0) {
		new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
		/*
		 * First copy over the btree block header.
		 */
		memcpy(new_broot, ifp->if_broot,
			XFS_BMBT_BLOCK_LEN(ip->i_mount));
	} else {
		new_broot = NULL;
		ifp->if_flags &= ~XFS_IFBROOT;
	}

	/*
	 * Only copy the records and pointers if there are any.
	 */
	if (new_max > 0) {
		/*
		 * First copy the records.
		 */
		op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
		np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));

		/*
		 * Then copy the pointers.
		 */
		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
						     ifp->if_broot_bytes);
		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
						     (int)new_size);
		memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
	}
	/* swap the shrunken copy in and release the old root */
	kmem_free(ifp->if_broot);
	ifp->if_broot = new_broot;
	ifp->if_broot_bytes = (int)new_size;
	if (ifp->if_broot)
		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
			XFS_IFORK_SIZE(ip, whichfork));
	return;
}
596
597
/*
 * This is called when the amount of space needed for if_data
 * is increased or decreased.  The change in size is indicated by
 * the number of bytes that need to be added or deleted in the
 * byte_diff parameter.
 *
 * If the amount of space needed has decreased below the size of the
 * inline buffer, then switch to using the inline buffer.  Otherwise,
 * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
 * to what is needed.
 *
 * ip -- the inode whose if_data area is changing
 * byte_diff -- the change in the number of bytes, positive or negative,
 *	 requested for the if_data array.
 */
void
xfs_idata_realloc(
	xfs_inode_t	*ip,
	int		byte_diff,
	int		whichfork)
{
	xfs_ifork_t	*ifp;
	int		new_size;	/* new logical size of if_data */
	int		real_size;	/* new allocated size (0 = inline) */

	if (byte_diff == 0) {
		return;
	}

	ifp = XFS_IFORK_PTR(ip, whichfork);
	new_size = (int)ifp->if_bytes + byte_diff;
	ASSERT(new_size >= 0);

	if (new_size == 0) {
		/* shrinking to nothing: free any malloc'd buffer */
		if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
			kmem_free(ifp->if_u1.if_data);
		}
		ifp->if_u1.if_data = NULL;
		real_size = 0;
	} else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
		/*
		 * If the valid extents/data can fit in if_inline_ext/data,
		 * copy them from the malloc'd vector and free it.
		 */
		if (ifp->if_u1.if_data == NULL) {
			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
			ASSERT(ifp->if_real_bytes != 0);
			memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
				new_size);
			kmem_free(ifp->if_u1.if_data);
			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
		}
		real_size = 0;
	} else {
		/*
		 * Stuck with malloc/realloc.
		 * For inline data, the underlying buffer must be
		 * a multiple of 4 bytes in size so that it can be
		 * logged and stay on word boundaries.  We enforce
		 * that here.
		 */
		real_size = roundup(new_size, 4);
		if (ifp->if_u1.if_data == NULL) {
			ASSERT(ifp->if_real_bytes == 0);
			ifp->if_u1.if_data = kmem_alloc(real_size,
							KM_SLEEP | KM_NOFS);
		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
			/*
			 * Only do the realloc if the underlying size
			 * is really changing.
			 */
			if (ifp->if_real_bytes != real_size) {
				ifp->if_u1.if_data =
					kmem_realloc(ifp->if_u1.if_data,
							real_size,
							ifp->if_real_bytes,
							KM_SLEEP | KM_NOFS);
			}
		} else {
			/* currently inline: switch to a malloc'd buffer */
			ASSERT(ifp->if_real_bytes == 0);
			ifp->if_u1.if_data = kmem_alloc(real_size,
							KM_SLEEP | KM_NOFS);
			memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
				ifp->if_bytes);
		}
	}
	ifp->if_real_bytes = real_size;
	ifp->if_bytes = new_size;
	ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
}
689
/*
 * Tear down the in-core representation of one fork of an inode:
 * free the btree root buffer (if any), any malloc'd local-format
 * data, and the in-core extent list.  For the attribute fork the
 * ifork structure itself (ip->i_afp) is returned to xfs_ifork_zone.
 */
void
xfs_idestroy_fork(
	xfs_inode_t	*ip,
	int		whichfork)	/* XFS_DATA_FORK or XFS_ATTR_FORK */
{
	xfs_ifork_t	*ifp;

	ifp = XFS_IFORK_PTR(ip, whichfork);
	if (ifp->if_broot != NULL) {
		kmem_free(ifp->if_broot);
		ifp->if_broot = NULL;
	}

	/*
	 * If the format is local, then we can't have an extents
	 * array so just look for an inline data array.  If we're
	 * not local then we may or may not have an extents list,
	 * so check and free it up if we do.
	 */
	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
		if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
		    (ifp->if_u1.if_data != NULL)) {
			ASSERT(ifp->if_real_bytes != 0);
			kmem_free(ifp->if_u1.if_data);
			ifp->if_u1.if_data = NULL;
			ifp->if_real_bytes = 0;
		}
	} else if ((ifp->if_flags & XFS_IFEXTENTS) &&
		   ((ifp->if_flags & XFS_IFEXTIREC) ||
		    ((ifp->if_u1.if_extents != NULL) &&
		     (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
		ASSERT(ifp->if_real_bytes != 0);
		xfs_iext_destroy(ifp);
	}
	/* all heap memory for this fork must be released by now */
	ASSERT(ifp->if_u1.if_extents == NULL ||
	       ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
	ASSERT(ifp->if_real_bytes == 0);
	if (whichfork == XFS_ATTR_FORK) {
		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
		ip->i_afp = NULL;
	}
}
732
733/*
734 * xfs_iextents_copy()
735 *
736 * This is called to copy the REAL extents (as opposed to the delayed
737 * allocation extents) from the inode into the given buffer. It
738 * returns the number of bytes copied into the buffer.
739 *
740 * If there are no delayed allocation extents, then we can just
741 * memcpy() the extents into the buffer. Otherwise, we need to
742 * examine each extent in turn and skip those which are delayed.
743 */
744int
745xfs_iextents_copy(
746 xfs_inode_t *ip,
747 xfs_bmbt_rec_t *dp,
748 int whichfork)
749{
750 int copied;
751 int i;
752 xfs_ifork_t *ifp;
753 int nrecs;
754 xfs_fsblock_t start_block;
755
756 ifp = XFS_IFORK_PTR(ip, whichfork);
757 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
758 ASSERT(ifp->if_bytes > 0);
759
760 nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
761 XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
762 ASSERT(nrecs > 0);
763
764 /*
765 * There are some delayed allocation extents in the
766 * inode, so copy the extents one at a time and skip
767 * the delayed ones. There must be at least one
768 * non-delayed extent.
769 */
770 copied = 0;
771 for (i = 0; i < nrecs; i++) {
772 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
773 start_block = xfs_bmbt_get_startblock(ep);
774 if (isnullstartblock(start_block)) {
775 /*
776 * It's a delayed allocation extent, so skip it.
777 */
778 continue;
779 }
780
781 /* Translate to on disk format */
782 put_unaligned_be64(ep->l0, &dp->l0);
783 put_unaligned_be64(ep->l1, &dp->l1);
784 dp++;
785 copied++;
786 }
787 ASSERT(copied != 0);
788 xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
789
790 return (copied * (uint)sizeof(xfs_bmbt_rec_t));
791}
792
/*
 * Each of the following cases stores data into the same region
 * of the on-disk inode, so only one of them can be valid at
 * any given time.  While it is possible to have conflicting formats
 * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
 * in EXTENTS format, this can only happen when the fork has
 * changed formats after being modified but before being flushed.
 * In these cases, the format always takes precedence, because the
 * format indicates the current state of the fork.
 *
 * Writes the in-core fork state for 'whichfork' into the on-disk
 * inode buffer 'dip', but only for the parts the log item says
 * are dirty (iip->ili_fields).
 */
void
xfs_iflush_fork(
	xfs_inode_t		*ip,
	xfs_dinode_t		*dip,	/* destination on-disk inode */
	xfs_inode_log_item_t	*iip,	/* log item carrying dirty flags */
	int			whichfork,
	xfs_buf_t		*bp)
{
	char			*cp;	/* on-disk fork data area */
	xfs_ifork_t		*ifp;
	xfs_mount_t		*mp;
	/* per-fork dirty flags, indexed by whichfork */
	static const short	brootflag[2] =
		{ XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
	static const short	dataflag[2] =
		{ XFS_ILOG_DDATA, XFS_ILOG_ADATA };
	static const short	extflag[2] =
		{ XFS_ILOG_DEXT, XFS_ILOG_AEXT };

	if (!iip)
		return;
	ifp = XFS_IFORK_PTR(ip, whichfork);
	/*
	 * This can happen if we gave up in iformat in an error path,
	 * for the attribute fork.
	 */
	if (!ifp) {
		ASSERT(whichfork == XFS_ATTR_FORK);
		return;
	}
	cp = XFS_DFORK_PTR(dip, whichfork);
	mp = ip->i_mount;
	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
	case XFS_DINODE_FMT_LOCAL:
		/* inline data: straight copy into the fork area */
		if ((iip->ili_fields & dataflag[whichfork]) &&
		    (ifp->if_bytes > 0)) {
			ASSERT(ifp->if_u1.if_data != NULL);
			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
			memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
		}
		break;

	case XFS_DINODE_FMT_EXTENTS:
		ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
		       !(iip->ili_fields & extflag[whichfork]));
		if ((iip->ili_fields & extflag[whichfork]) &&
		    (ifp->if_bytes > 0)) {
			ASSERT(xfs_iext_get_ext(ifp, 0));
			ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
			/* copies only the real (non-delalloc) extents */
			(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
				whichfork);
		}
		break;

	case XFS_DINODE_FMT_BTREE:
		if ((iip->ili_fields & brootflag[whichfork]) &&
		    (ifp->if_broot_bytes > 0)) {
			ASSERT(ifp->if_broot != NULL);
			ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
			        XFS_IFORK_SIZE(ip, whichfork));
			xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
				(xfs_bmdr_block_t *)cp,
				XFS_DFORK_SIZE(dip, mp, whichfork));
		}
		break;

	case XFS_DINODE_FMT_DEV:
		if (iip->ili_fields & XFS_ILOG_DEV) {
			ASSERT(whichfork == XFS_DATA_FORK);
			xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
		}
		break;

	case XFS_DINODE_FMT_UUID:
		if (iip->ili_fields & XFS_ILOG_UUID) {
			ASSERT(whichfork == XFS_DATA_FORK);
			memcpy(XFS_DFORK_DPTR(dip),
			       &ip->i_df.if_u2.if_uuid,
			       sizeof(uuid_t));
		}
		break;

	default:
		ASSERT(0);
		break;
	}
}
889
890/*
891 * Return a pointer to the extent record at file index idx.
892 */
893xfs_bmbt_rec_host_t *
894xfs_iext_get_ext(
895 xfs_ifork_t *ifp, /* inode fork pointer */
896 xfs_extnum_t idx) /* index of target extent */
897{
898 ASSERT(idx >= 0);
899 ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
900
901 if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
902 return ifp->if_u1.if_ext_irec->er_extbuf;
903 } else if (ifp->if_flags & XFS_IFEXTIREC) {
904 xfs_ext_irec_t *erp; /* irec pointer */
905 int erp_idx = 0; /* irec index */
906 xfs_extnum_t page_idx = idx; /* ext index in target list */
907
908 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
909 return &erp->er_extbuf[page_idx];
910 } else if (ifp->if_bytes) {
911 return &ifp->if_u1.if_extents[idx];
912 } else {
913 return NULL;
914 }
915}
916
917/*
918 * Insert new item(s) into the extent records for incore inode
919 * fork 'ifp'. 'count' new items are inserted at index 'idx'.
920 */
921void
922xfs_iext_insert(
923 xfs_inode_t *ip, /* incore inode pointer */
924 xfs_extnum_t idx, /* starting index of new items */
925 xfs_extnum_t count, /* number of inserted items */
926 xfs_bmbt_irec_t *new, /* items to insert */
927 int state) /* type of extent conversion */
928{
929 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
930 xfs_extnum_t i; /* extent record index */
931
932 trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
933
934 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
935 xfs_iext_add(ifp, idx, count);
936 for (i = idx; i < idx + count; i++, new++)
937 xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
938}
939
/*
 * This is called when the amount of space required for incore file
 * extents needs to be increased.  The ext_diff parameter stores the
 * number of new extents being added and the idx parameter contains
 * the extent index where the new extents will be added.  If the new
 * extents are being appended, then we just need to (re)allocate and
 * initialize the space.  Otherwise, if the new extents are being
 * inserted into the middle of the existing entries, a bit more work
 * is required to make room for the new extents to be inserted.  The
 * caller is responsible for filling in the new extent entries upon
 * return.
 */
void
xfs_iext_add(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	xfs_extnum_t	idx,		/* index to begin adding exts */
	int		ext_diff)	/* number of extents to add */
{
	int		byte_diff;	/* new bytes being added */
	int		new_size;	/* size of extents after adding */
	xfs_extnum_t	nextents;	/* number of extents in file */

	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	ASSERT((idx >= 0) && (idx <= nextents));
	byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
	new_size = ifp->if_bytes + byte_diff;
	/*
	 * If the new number of extents (nextents + ext_diff)
	 * fits inside the inode, then continue to use the inline
	 * extent buffer.
	 */
	if (nextents + ext_diff <= XFS_INLINE_EXTS) {
		if (idx < nextents) {
			/* open a zeroed gap at idx for the new records */
			memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
				&ifp->if_u2.if_inline_ext[idx],
				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
			memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
		}
		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
		ifp->if_real_bytes = 0;
	}
	/*
	 * Otherwise use a linear (direct) extent list.
	 * If the extents are currently inside the inode,
	 * xfs_iext_realloc_direct will switch us from
	 * inline to direct extent allocation mode.
	 */
	else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
		xfs_iext_realloc_direct(ifp, new_size);
		if (idx < nextents) {
			/* open a zeroed gap at idx for the new records */
			memmove(&ifp->if_u1.if_extents[idx + ext_diff],
				&ifp->if_u1.if_extents[idx],
				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
			memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
		}
	}
	/* Indirection array */
	else {
		xfs_ext_irec_t	*erp;		/* target extent list */
		int		erp_idx = 0;	/* index of target list */
		int		page_idx = idx;	/* index within target list */

		ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
		if (ifp->if_flags & XFS_IFEXTIREC) {
			erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
		} else {
			/* first time over the linear limit: build the irec */
			xfs_iext_irec_init(ifp);
			ASSERT(ifp->if_flags & XFS_IFEXTIREC);
			erp = ifp->if_u1.if_ext_irec;
		}
		/* Extents fit in target extent page */
		if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
			if (page_idx < erp->er_extcount) {
				memmove(&erp->er_extbuf[page_idx + ext_diff],
					&erp->er_extbuf[page_idx],
					(erp->er_extcount - page_idx) *
					sizeof(xfs_bmbt_rec_t));
				memset(&erp->er_extbuf[page_idx], 0, byte_diff);
			}
			erp->er_extcount += ext_diff;
			xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
		}
		/* Insert a new extent page */
		else if (erp) {
			xfs_iext_add_indirect_multi(ifp,
				erp_idx, page_idx, ext_diff);
		}
		/*
		 * If extent(s) are being appended to the last page in
		 * the indirection array and the new extent(s) don't fit
		 * in the page, then erp is NULL and erp_idx is set to
		 * the next index needed in the indirection array.
		 */
		else {
			int	count = ext_diff;

			while (count) {
				erp = xfs_iext_irec_new(ifp, erp_idx);
				erp->er_extcount = count;
				count -= MIN(count, (int)XFS_LINEAR_EXTS);
				if (count) {
					erp_idx++;
				}
			}
		}
	}
	ifp->if_bytes = new_size;
}
1048
/*
 * This is called when incore extents are being added to the indirection
 * array and the new extents do not fit in the target extent list.  The
 * erp_idx parameter contains the irec index for the target extent list
 * in the indirection array, and the idx parameter contains the extent
 * index within the list.  The number of extents being added is stored
 * in the count parameter.
 *
 *    |-------|   |-------|
 *    |       |   |       |    idx - number of extents before idx
 *    |  idx  |   | count |
 *    |       |   |       |    count - number of extents being inserted at idx
 *    |-------|   |-------|
 *    | count |   | nex2  |    nex2 - number of extents after idx + count
 *    |-------|   |-------|
 */
void
xfs_iext_add_indirect_multi(
	xfs_ifork_t	*ifp,			/* inode fork pointer */
	int		erp_idx,		/* target extent irec index */
	xfs_extnum_t	idx,			/* index within target list */
	int		count)			/* new extents being added */
{
	int		byte_diff;		/* new bytes being added */
	xfs_ext_irec_t	*erp;			/* pointer to irec entry */
	xfs_extnum_t	ext_diff;		/* number of extents to add */
	xfs_extnum_t	ext_cnt;		/* new extents still needed */
	xfs_extnum_t	nex2;			/* extents after idx + count */
	xfs_bmbt_rec_t	*nex2_ep = NULL;	/* temp list for nex2 extents */
	int		nlists;			/* number of irec's (lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	erp = &ifp->if_u1.if_ext_irec[erp_idx];
	nex2 = erp->er_extcount - idx;
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;

	/*
	 * Save second part of target extent list
	 * (all extents at and after idx) in a temporary buffer.
	 */
	if (nex2) {
		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
		nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
		memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
		erp->er_extcount -= nex2;
		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
		memset(&erp->er_extbuf[idx], 0, byte_diff);
	}

	/*
	 * Add the new extents to the end of the target
	 * list, then allocate new irec record(s) and
	 * extent buffer(s) as needed to store the rest
	 * of the new extents.
	 */
	ext_cnt = count;
	ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
	if (ext_diff) {
		erp->er_extcount += ext_diff;
		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
		ext_cnt -= ext_diff;
	}
	while (ext_cnt) {
		erp_idx++;
		erp = xfs_iext_irec_new(ifp, erp_idx);
		ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
		erp->er_extcount = ext_diff;
		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
		ext_cnt -= ext_diff;
	}

	/* Add nex2 extents back to indirection array */
	if (nex2) {
		xfs_extnum_t	ext_avail;
		int		i;

		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
		ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
		i = 0;
		/*
		 * If nex2 extents fit in the current page, append
		 * nex2_ep after the new extents.
		 */
		if (nex2 <= ext_avail) {
			i = erp->er_extcount;
		}
		/*
		 * Otherwise, check if space is available in the
		 * next page.
		 */
		else if ((erp_idx < nlists - 1) &&
			 (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
			  ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
			erp_idx++;
			erp++;
			/* Create a hole for nex2 extents */
			memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
				erp->er_extcount * sizeof(xfs_bmbt_rec_t));
		}
		/*
		 * Final choice, create a new extent page for
		 * nex2 extents.
		 */
		else {
			erp_idx++;
			erp = xfs_iext_irec_new(ifp, erp_idx);
		}
		memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
		kmem_free(nex2_ep);
		erp->er_extcount += nex2;
		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
	}
}
1161
1162/*
1163 * This is called when the amount of space required for incore file
1164 * extents needs to be decreased. The ext_diff parameter stores the
1165 * number of extents to be removed and the idx parameter contains
1166 * the extent index where the extents will be removed from.
1167 *
1168 * If the amount of space needed has decreased below the linear
1169 * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
1170 * extent array. Otherwise, use kmem_realloc() to adjust the
1171 * size to what is needed.
1172 */
1173void
1174xfs_iext_remove(
1175 xfs_inode_t *ip, /* incore inode pointer */
1176 xfs_extnum_t idx, /* index to begin removing exts */
1177 int ext_diff, /* number of extents to remove */
1178 int state) /* type of extent conversion */
1179{
1180 xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
1181 xfs_extnum_t nextents; /* number of extents in file */
1182 int new_size; /* size of extents after removal */
1183
1184 trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
1185
1186 ASSERT(ext_diff > 0);
1187 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1188 new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
1189
1190 if (new_size == 0) {
1191 xfs_iext_destroy(ifp);
1192 } else if (ifp->if_flags & XFS_IFEXTIREC) {
1193 xfs_iext_remove_indirect(ifp, idx, ext_diff);
1194 } else if (ifp->if_real_bytes) {
1195 xfs_iext_remove_direct(ifp, idx, ext_diff);
1196 } else {
1197 xfs_iext_remove_inline(ifp, idx, ext_diff);
1198 }
1199 ifp->if_bytes = new_size;
1200}
1201
1202/*
1203 * This removes ext_diff extents from the inline buffer, beginning
1204 * at extent index idx.
1205 */
1206void
1207xfs_iext_remove_inline(
1208 xfs_ifork_t *ifp, /* inode fork pointer */
1209 xfs_extnum_t idx, /* index to begin removing exts */
1210 int ext_diff) /* number of extents to remove */
1211{
1212 int nextents; /* number of extents in file */
1213
1214 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
1215 ASSERT(idx < XFS_INLINE_EXTS);
1216 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1217 ASSERT(((nextents - ext_diff) > 0) &&
1218 (nextents - ext_diff) < XFS_INLINE_EXTS);
1219
1220 if (idx + ext_diff < nextents) {
1221 memmove(&ifp->if_u2.if_inline_ext[idx],
1222 &ifp->if_u2.if_inline_ext[idx + ext_diff],
1223 (nextents - (idx + ext_diff)) *
1224 sizeof(xfs_bmbt_rec_t));
1225 memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
1226 0, ext_diff * sizeof(xfs_bmbt_rec_t));
1227 } else {
1228 memset(&ifp->if_u2.if_inline_ext[idx], 0,
1229 ext_diff * sizeof(xfs_bmbt_rec_t));
1230 }
1231}
1232
1233/*
1234 * This removes ext_diff extents from a linear (direct) extent list,
1235 * beginning at extent index idx. If the extents are being removed
1236 * from the end of the list (ie. truncate) then we just need to re-
1237 * allocate the list to remove the extra space. Otherwise, if the
1238 * extents are being removed from the middle of the existing extent
1239 * entries, then we first need to move the extent records beginning
1240 * at idx + ext_diff up in the list to overwrite the records being
1241 * removed, then remove the extra space via kmem_realloc.
1242 */
void
xfs_iext_remove_direct(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	xfs_extnum_t	idx,		/* index to begin removing exts */
	int		ext_diff)	/* number of extents to remove */
{
	xfs_extnum_t	nextents;	/* number of extents in file */
	int		new_size;	/* size of extents after removal */

	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
	new_size = ifp->if_bytes -
		(ext_diff * sizeof(xfs_bmbt_rec_t));
	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);

	/* Removing every extent?  Free the whole list and we are done. */
	if (new_size == 0) {
		xfs_iext_destroy(ifp);
		return;
	}
	/* Move extents up in the list (if needed) */
	if (idx + ext_diff < nextents) {
		memmove(&ifp->if_u1.if_extents[idx],
			&ifp->if_u1.if_extents[idx + ext_diff],
			(nextents - (idx + ext_diff)) *
			sizeof(xfs_bmbt_rec_t));
	}
	/* Zero the ext_diff now-unused trailing slots before shrinking. */
	memset(&ifp->if_u1.if_extents[nextents - ext_diff],
		0, ext_diff * sizeof(xfs_bmbt_rec_t));
	/*
	 * Reallocate the direct extent list. If the extents
	 * will fit inside the inode then xfs_iext_realloc_direct
	 * will switch from direct to inline extent allocation
	 * mode for us.
	 */
	xfs_iext_realloc_direct(ifp, new_size);
	ifp->if_bytes = new_size;
}
1279
1280/*
1281 * This is called when incore extents are being removed from the
1282 * indirection array and the extents being removed span multiple extent
1283 * buffers. The idx parameter contains the file extent index where we
1284 * want to begin removing extents, and the count parameter contains
1285 * how many extents need to be removed.
1286 *
1287 * |-------| |-------|
1288 * | nex1 | | | nex1 - number of extents before idx
1289 * |-------| | count |
1290 * | | | | count - number of extents being removed at idx
1291 * | count | |-------|
1292 * | | | nex2 | nex2 - number of extents after idx + count
1293 * |-------| |-------|
1294 */
void
xfs_iext_remove_indirect(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	xfs_extnum_t	idx,		/* index to begin removing extents */
	int		count)		/* number of extents to remove */
{
	xfs_ext_irec_t	*erp;		/* indirection array pointer */
	int		erp_idx = 0;	/* indirection array index */
	xfs_extnum_t	ext_cnt;	/* extents left to remove */
	xfs_extnum_t	ext_diff;	/* extents to remove in current list */
	xfs_extnum_t	nex1;		/* number of extents before idx */
	xfs_extnum_t	nex2;		/* extents after idx + count */
	int		page_idx = idx;	/* index in target extent list */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	/* Translate the file extent index into an irec + in-page index. */
	erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
	ASSERT(erp != NULL);
	nex1 = page_idx;
	ext_cnt = count;
	while (ext_cnt) {
		/* Extents in this page surviving past the removal range. */
		nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
		ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
		/*
		 * Check for deletion of entire list;
		 * xfs_iext_irec_remove() updates extent offsets.
		 */
		if (ext_diff == erp->er_extcount) {
			xfs_iext_irec_remove(ifp, erp_idx);
			ext_cnt -= ext_diff;
			nex1 = 0;
			if (ext_cnt) {
				/*
				 * The remove shifted later irecs down one
				 * slot, so erp_idx already names the next
				 * page to work on.
				 */
				ASSERT(erp_idx < ifp->if_real_bytes /
					XFS_IEXT_BUFSZ);
				erp = &ifp->if_u1.if_ext_irec[erp_idx];
				nex1 = 0;
				continue;
			} else {
				break;
			}
		}
		/* Move extents up (if needed) */
		if (nex2) {
			memmove(&erp->er_extbuf[nex1],
				&erp->er_extbuf[nex1 + ext_diff],
				nex2 * sizeof(xfs_bmbt_rec_t));
		}
		/* Zero out rest of page */
		memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
			((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
		/* Update remaining counters */
		erp->er_extcount -= ext_diff;
		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
		ext_cnt -= ext_diff;
		nex1 = 0;
		erp_idx++;
		erp++;
	}
	ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
	/* Try to collapse back to fewer pages / direct / inline mode. */
	xfs_iext_irec_compact(ifp);
}
1355
1356/*
1357 * Create, destroy, or resize a linear (direct) block of extents.
1358 */
1359void
1360xfs_iext_realloc_direct(
1361 xfs_ifork_t *ifp, /* inode fork pointer */
1362 int new_size) /* new size of extents */
1363{
1364 int rnew_size; /* real new size of extents */
1365
1366 rnew_size = new_size;
1367
1368 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
1369 ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
1370 (new_size != ifp->if_real_bytes)));
1371
1372 /* Free extent records */
1373 if (new_size == 0) {
1374 xfs_iext_destroy(ifp);
1375 }
1376 /* Resize direct extent list and zero any new bytes */
1377 else if (ifp->if_real_bytes) {
1378 /* Check if extents will fit inside the inode */
1379 if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
1380 xfs_iext_direct_to_inline(ifp, new_size /
1381 (uint)sizeof(xfs_bmbt_rec_t));
1382 ifp->if_bytes = new_size;
1383 return;
1384 }
1385 if (!is_power_of_2(new_size)){
1386 rnew_size = roundup_pow_of_two(new_size);
1387 }
1388 if (rnew_size != ifp->if_real_bytes) {
1389 ifp->if_u1.if_extents =
1390 kmem_realloc(ifp->if_u1.if_extents,
1391 rnew_size,
1392 ifp->if_real_bytes, KM_NOFS);
1393 }
1394 if (rnew_size > ifp->if_real_bytes) {
1395 memset(&ifp->if_u1.if_extents[ifp->if_bytes /
1396 (uint)sizeof(xfs_bmbt_rec_t)], 0,
1397 rnew_size - ifp->if_real_bytes);
1398 }
1399 }
1400 /*
1401 * Switch from the inline extent buffer to a direct
1402 * extent list. Be sure to include the inline extent
1403 * bytes in new_size.
1404 */
1405 else {
1406 new_size += ifp->if_bytes;
1407 if (!is_power_of_2(new_size)) {
1408 rnew_size = roundup_pow_of_two(new_size);
1409 }
1410 xfs_iext_inline_to_direct(ifp, rnew_size);
1411 }
1412 ifp->if_real_bytes = rnew_size;
1413 ifp->if_bytes = new_size;
1414}
1415
1416/*
1417 * Switch from linear (direct) extent records to inline buffer.
1418 */
1419void
1420xfs_iext_direct_to_inline(
1421 xfs_ifork_t *ifp, /* inode fork pointer */
1422 xfs_extnum_t nextents) /* number of extents in file */
1423{
1424 ASSERT(ifp->if_flags & XFS_IFEXTENTS);
1425 ASSERT(nextents <= XFS_INLINE_EXTS);
1426 /*
1427 * The inline buffer was zeroed when we switched
1428 * from inline to direct extent allocation mode,
1429 * so we don't need to clear it here.
1430 */
1431 memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
1432 nextents * sizeof(xfs_bmbt_rec_t));
1433 kmem_free(ifp->if_u1.if_extents);
1434 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
1435 ifp->if_real_bytes = 0;
1436}
1437
1438/*
1439 * Switch from inline buffer to linear (direct) extent records.
1440 * new_size should already be rounded up to the next power of 2
1441 * by the caller (when appropriate), so use new_size as it is.
1442 * However, since new_size may be rounded up, we can't update
1443 * if_bytes here. It is the caller's responsibility to update
1444 * if_bytes upon return.
1445 */
1446void
1447xfs_iext_inline_to_direct(
1448 xfs_ifork_t *ifp, /* inode fork pointer */
1449 int new_size) /* number of extents in file */
1450{
1451 ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
1452 memset(ifp->if_u1.if_extents, 0, new_size);
1453 if (ifp->if_bytes) {
1454 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
1455 ifp->if_bytes);
1456 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
1457 sizeof(xfs_bmbt_rec_t));
1458 }
1459 ifp->if_real_bytes = new_size;
1460}
1461
1462/*
1463 * Resize an extent indirection array to new_size bytes.
1464 */
1465STATIC void
1466xfs_iext_realloc_indirect(
1467 xfs_ifork_t *ifp, /* inode fork pointer */
1468 int new_size) /* new indirection array size */
1469{
1470 int nlists; /* number of irec's (ex lists) */
1471 int size; /* current indirection array size */
1472
1473 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1474 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1475 size = nlists * sizeof(xfs_ext_irec_t);
1476 ASSERT(ifp->if_real_bytes);
1477 ASSERT((new_size >= 0) && (new_size != size));
1478 if (new_size == 0) {
1479 xfs_iext_destroy(ifp);
1480 } else {
1481 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
1482 kmem_realloc(ifp->if_u1.if_ext_irec,
1483 new_size, size, KM_NOFS);
1484 }
1485}
1486
1487/*
1488 * Switch from indirection array to linear (direct) extent allocations.
1489 */
STATIC void
xfs_iext_indirect_to_direct(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	xfs_bmbt_rec_host_t *ep;	/* extent record pointer */
	xfs_extnum_t	nextents;	/* number of extents in file */
	int		size;		/* size of file extents */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	ASSERT(nextents <= XFS_LINEAR_EXTS);
	size = nextents * sizeof(xfs_bmbt_rec_t);

	/* Squeeze all extents into the first (single) irec page... */
	xfs_iext_irec_compact_pages(ifp);
	ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);

	/*
	 * ...then reuse that page directly as the linear extent list.
	 * The extent buffer must be captured before the indirection
	 * array is freed.
	 */
	ep = ifp->if_u1.if_ext_irec->er_extbuf;
	kmem_free(ifp->if_u1.if_ext_irec);
	ifp->if_flags &= ~XFS_IFEXTIREC;
	ifp->if_u1.if_extents = ep;
	ifp->if_bytes = size;
	/* Trim the full-size page down if it is only partially used. */
	if (nextents < XFS_LINEAR_EXTS) {
		xfs_iext_realloc_direct(ifp, size);
	}
}
1515
1516/*
1517 * Free incore file extents.
1518 */
1519void
1520xfs_iext_destroy(
1521 xfs_ifork_t *ifp) /* inode fork pointer */
1522{
1523 if (ifp->if_flags & XFS_IFEXTIREC) {
1524 int erp_idx;
1525 int nlists;
1526
1527 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1528 for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
1529 xfs_iext_irec_remove(ifp, erp_idx);
1530 }
1531 ifp->if_flags &= ~XFS_IFEXTIREC;
1532 } else if (ifp->if_real_bytes) {
1533 kmem_free(ifp->if_u1.if_extents);
1534 } else if (ifp->if_bytes) {
1535 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
1536 sizeof(xfs_bmbt_rec_t));
1537 }
1538 ifp->if_u1.if_extents = NULL;
1539 ifp->if_real_bytes = 0;
1540 ifp->if_bytes = 0;
1541}
1542
1543/*
1544 * Return a pointer to the extent record for file system block bno.
1545 */
xfs_bmbt_rec_host_t *			/* pointer to found extent record */
xfs_iext_bno_to_ext(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	xfs_fileoff_t	bno,		/* block number to search for */
	xfs_extnum_t	*idxp)		/* index of target extent */
{
	xfs_bmbt_rec_host_t *base;	/* pointer to first extent */
	xfs_filblks_t	blockcount = 0;	/* number of blocks in extent */
	xfs_bmbt_rec_host_t *ep = NULL;	/* pointer to target extent */
	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
	int		high;		/* upper boundary in search */
	xfs_extnum_t	idx = 0;	/* index of target extent */
	int		low;		/* lower boundary in search */
	xfs_extnum_t	nextents;	/* number of file extents */
	xfs_fileoff_t	startoff = 0;	/* start offset of extent */

	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
	/* Empty fork: no extent can contain bno. */
	if (nextents == 0) {
		*idxp = 0;
		return NULL;
	}
	low = 0;
	if (ifp->if_flags & XFS_IFEXTIREC) {
		/* Find target extent list */
		int	erp_idx = 0;
		erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
		base = erp->er_extbuf;
		high = erp->er_extcount - 1;
	} else {
		base = ifp->if_u1.if_extents;
		high = nextents - 1;
	}
	/* Binary search extent records */
	while (low <= high) {
		idx = (low + high) >> 1;
		ep = base + idx;
		startoff = xfs_bmbt_get_startoff(ep);
		blockcount = xfs_bmbt_get_blockcount(ep);
		if (bno < startoff) {
			high = idx - 1;
		} else if (bno >= startoff + blockcount) {
			low = idx + 1;
		} else {
			/* Convert back to file-based extent index */
			if (ifp->if_flags & XFS_IFEXTIREC) {
				idx += erp->er_extoff;
			}
			*idxp = idx;
			return ep;
		}
	}
	/*
	 * Search fell through: bno sits in a hole.  idx/startoff/
	 * blockcount describe the last extent the search examined.
	 */
	/* Convert back to file-based extent index */
	if (ifp->if_flags & XFS_IFEXTIREC) {
		idx += erp->er_extoff;
	}
	/*
	 * If bno lies past that extent, return the following extent
	 * instead, or NULL when bno is beyond the last extent on file.
	 */
	if (bno >= startoff + blockcount) {
		if (++idx == nextents) {
			ep = NULL;
		} else {
			ep = xfs_iext_get_ext(ifp, idx);
		}
	}
	*idxp = idx;
	return ep;
}
1611
1612/*
1613 * Return a pointer to the indirection array entry containing the
1614 * extent record for filesystem block bno. Store the index of the
1615 * target irec in *erp_idxp.
1616 */
1617xfs_ext_irec_t * /* pointer to found extent record */
1618xfs_iext_bno_to_irec(
1619 xfs_ifork_t *ifp, /* inode fork pointer */
1620 xfs_fileoff_t bno, /* block number to search for */
1621 int *erp_idxp) /* irec index of target ext list */
1622{
1623 xfs_ext_irec_t *erp = NULL; /* indirection array pointer */
1624 xfs_ext_irec_t *erp_next; /* next indirection array entry */
1625 int erp_idx; /* indirection array index */
1626 int nlists; /* number of extent irec's (lists) */
1627 int high; /* binary search upper limit */
1628 int low; /* binary search lower limit */
1629
1630 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1631 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1632 erp_idx = 0;
1633 low = 0;
1634 high = nlists - 1;
1635 while (low <= high) {
1636 erp_idx = (low + high) >> 1;
1637 erp = &ifp->if_u1.if_ext_irec[erp_idx];
1638 erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
1639 if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
1640 high = erp_idx - 1;
1641 } else if (erp_next && bno >=
1642 xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
1643 low = erp_idx + 1;
1644 } else {
1645 break;
1646 }
1647 }
1648 *erp_idxp = erp_idx;
1649 return erp;
1650}
1651
1652/*
1653 * Return a pointer to the indirection array entry containing the
1654 * extent record at file extent index *idxp. Store the index of the
1655 * target irec in *erp_idxp and store the page index of the target
1656 * extent record in *idxp.
1657 */
xfs_ext_irec_t *
xfs_iext_idx_to_irec(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	xfs_extnum_t	*idxp,		/* extent index (file -> page) */
	int		*erp_idxp,	/* pointer to target irec */
	int		realloc)	/* new bytes were just added */
{
	xfs_ext_irec_t	*prev;		/* pointer to previous irec */
	xfs_ext_irec_t	*erp = NULL;	/* pointer to current irec */
	int		erp_idx;	/* indirection array index */
	int		nlists;		/* number of irec's (ex lists) */
	int		high;		/* binary search upper limit */
	int		low;		/* binary search lower limit */
	xfs_extnum_t	page_idx = *idxp; /* extent index in target list */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	ASSERT(page_idx >= 0);
	/* An index one past the end is only legal when inserting. */
	ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
	ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);

	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	erp_idx = 0;
	low = 0;
	high = nlists - 1;

	/* Binary search extent irec's */
	while (low <= high) {
		erp_idx = (low + high) >> 1;
		erp = &ifp->if_u1.if_ext_irec[erp_idx];
		prev = erp_idx > 0 ? erp - 1 : NULL;
		/*
		 * On a page boundary during an insert (realloc), prefer
		 * the earlier page when it still has room for a record.
		 */
		if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
		    realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
			high = erp_idx - 1;
		} else if (page_idx > erp->er_extoff + erp->er_extcount ||
			   (page_idx == erp->er_extoff + erp->er_extcount &&
			    !realloc)) {
			low = erp_idx + 1;
		} else if (page_idx == erp->er_extoff + erp->er_extcount &&
			   erp->er_extcount == XFS_LINEAR_EXTS) {
			/*
			 * Insert lands just past a full page: target the
			 * start of the next page.  erp may become NULL
			 * here when erp_idx runs one past the array end;
			 * callers handle that (e.g. by adding a page).
			 */
			ASSERT(realloc);
			page_idx = 0;
			erp_idx++;
			erp = erp_idx < nlists ? erp + 1 : NULL;
			break;
		} else {
			/* Found it; make page_idx relative to this page. */
			page_idx -= erp->er_extoff;
			break;
		}
	}
	*idxp = page_idx;
	*erp_idxp = erp_idx;
	return(erp);
}
1711
1712/*
1713 * Allocate and initialize an indirection array once the space needed
1714 * for incore extents increases above XFS_IEXT_BUFSZ.
1715 */
1716void
1717xfs_iext_irec_init(
1718 xfs_ifork_t *ifp) /* inode fork pointer */
1719{
1720 xfs_ext_irec_t *erp; /* indirection array pointer */
1721 xfs_extnum_t nextents; /* number of extents in file */
1722
1723 ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
1724 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1725 ASSERT(nextents <= XFS_LINEAR_EXTS);
1726
1727 erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
1728
1729 if (nextents == 0) {
1730 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
1731 } else if (!ifp->if_real_bytes) {
1732 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
1733 } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
1734 xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
1735 }
1736 erp->er_extbuf = ifp->if_u1.if_extents;
1737 erp->er_extcount = nextents;
1738 erp->er_extoff = 0;
1739
1740 ifp->if_flags |= XFS_IFEXTIREC;
1741 ifp->if_real_bytes = XFS_IEXT_BUFSZ;
1742 ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
1743 ifp->if_u1.if_ext_irec = erp;
1744
1745 return;
1746}
1747
1748/*
1749 * Allocate and initialize a new entry in the indirection array.
1750 */
xfs_ext_irec_t *
xfs_iext_irec_new(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	int		erp_idx)	/* index for new irec */
{
	xfs_ext_irec_t	*erp;		/* indirection array pointer */
	int		i;		/* loop counter */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;

	/* Resize indirection array */
	xfs_iext_realloc_indirect(ifp, ++nlists *
				  sizeof(xfs_ext_irec_t));
	/*
	 * Move records down in the array so the
	 * new page can use erp_idx.
	 */
	erp = ifp->if_u1.if_ext_irec;
	for (i = nlists - 1; i > erp_idx; i--) {
		memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
	}
	ASSERT(i == erp_idx);

	/* Initialize new extent record */
	erp = ifp->if_u1.if_ext_irec;
	erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
	/* if_real_bytes counts whole extent pages in irec mode. */
	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
	memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
	erp[erp_idx].er_extcount = 0;
	/* The new page starts where the previous page ends (or at 0). */
	erp[erp_idx].er_extoff = erp_idx > 0 ?
		erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
	return (&erp[erp_idx]);
}
1786
1787/*
1788 * Remove a record from the indirection array.
1789 */
void
xfs_iext_irec_remove(
	xfs_ifork_t	*ifp,		/* inode fork pointer */
	int		erp_idx)	/* irec index to remove */
{
	xfs_ext_irec_t	*erp;		/* indirection array pointer */
	int		i;		/* loop counter */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	erp = &ifp->if_u1.if_ext_irec[erp_idx];
	/*
	 * er_extbuf is NULL when the caller already freed the page
	 * (see xfs_iext_irec_compact_pages); only free it and fix up
	 * the later extent offsets when the page is still live.
	 */
	if (erp->er_extbuf) {
		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
			-erp->er_extcount);
		kmem_free(erp->er_extbuf);
	}
	/* Compact extent records */
	erp = ifp->if_u1.if_ext_irec;
	for (i = erp_idx; i < nlists - 1; i++) {
		memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
	}
	/*
	 * Manually free the last extent record from the indirection
	 * array. A call to xfs_iext_realloc_indirect() with a size
	 * of zero would result in a call to xfs_iext_destroy() which
	 * would in turn call this function again, creating a nasty
	 * infinite loop.
	 */
	if (--nlists) {
		xfs_iext_realloc_indirect(ifp,
			nlists * sizeof(xfs_ext_irec_t));
	} else {
		kmem_free(ifp->if_u1.if_ext_irec);
	}
	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
}
1827
1828/*
1829 * This is called to clean up large amounts of unused memory allocated
1830 * by the indirection array. Before compacting anything though, verify
1831 * that the indirection array is still needed and switch back to the
1832 * linear extent list (or even the inline buffer) if possible. The
1833 * compaction policy is as follows:
1834 *
1835 * Full Compaction: Extents fit into a single page (or inline buffer)
1836 * Partial Compaction: Extents occupy less than 50% of allocated space
1837 * No Compaction: Extents occupy at least 50% of allocated space
1838 */
1839void
1840xfs_iext_irec_compact(
1841 xfs_ifork_t *ifp) /* inode fork pointer */
1842{
1843 xfs_extnum_t nextents; /* number of extents in file */
1844 int nlists; /* number of irec's (ex lists) */
1845
1846 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1847 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1848 nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
1849
1850 if (nextents == 0) {
1851 xfs_iext_destroy(ifp);
1852 } else if (nextents <= XFS_INLINE_EXTS) {
1853 xfs_iext_indirect_to_direct(ifp);
1854 xfs_iext_direct_to_inline(ifp, nextents);
1855 } else if (nextents <= XFS_LINEAR_EXTS) {
1856 xfs_iext_indirect_to_direct(ifp);
1857 } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
1858 xfs_iext_irec_compact_pages(ifp);
1859 }
1860}
1861
1862/*
1863 * Combine extents from neighboring extent pages.
1864 */
void
xfs_iext_irec_compact_pages(
	xfs_ifork_t	*ifp)		/* inode fork pointer */
{
	xfs_ext_irec_t	*erp, *erp_next;/* pointers to irec entries */
	int		erp_idx = 0;	/* indirection array index */
	int		nlists;		/* number of irec's (ex lists) */

	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
	while (erp_idx < nlists - 1) {
		erp = &ifp->if_u1.if_ext_irec[erp_idx];
		erp_next = erp + 1;
		/* Do the next page's records fit entirely in this one? */
		if (erp_next->er_extcount <=
		    (XFS_LINEAR_EXTS - erp->er_extcount)) {
			memcpy(&erp->er_extbuf[erp->er_extcount],
				erp_next->er_extbuf, erp_next->er_extcount *
				sizeof(xfs_bmbt_rec_t));
			erp->er_extcount += erp_next->er_extcount;
			/*
			 * Free page before removing extent record
			 * so er_extoffs don't get modified in
			 * xfs_iext_irec_remove.
			 */
			kmem_free(erp_next->er_extbuf);
			erp_next->er_extbuf = NULL;
			xfs_iext_irec_remove(ifp, erp_idx + 1);
			/*
			 * The remove shrank the array; recount and retry
			 * the same slot against its new neighbour.
			 */
			nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
		} else {
			/* Can't merge this pair; try the next one. */
			erp_idx++;
		}
	}
}
1898
1899/*
1900 * This is called to update the er_extoff field in the indirection
1901 * array when extents have been added or removed from one of the
1902 * extent lists. erp_idx contains the irec index to begin updating
1903 * at and ext_diff contains the number of extents that were added
1904 * or removed.
1905 */
1906void
1907xfs_iext_irec_update_extoffs(
1908 xfs_ifork_t *ifp, /* inode fork pointer */
1909 int erp_idx, /* irec index to update */
1910 int ext_diff) /* number of new extents */
1911{
1912 int i; /* loop counter */
1913 int nlists; /* number of irec's (ex lists */
1914
1915 ASSERT(ifp->if_flags & XFS_IFEXTIREC);
1916 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
1917 for (i = erp_idx; i < nlists; i++) {
1918 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
1919 }
1920}
diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h
new file mode 100644
index 000000000000..28661a0d9058
--- /dev/null
+++ b/fs/xfs/xfs_inode_fork.h
@@ -0,0 +1,171 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_INODE_FORK_H__
19#define __XFS_INODE_FORK_H__
20
21struct xfs_inode_log_item;
22
23/*
24 * The following xfs_ext_irec_t struct introduces a second (top) level
25 * to the in-core extent allocation scheme. These structs are allocated
26 * in a contiguous block, creating an indirection array where each entry
27 * (irec) contains a pointer to a buffer of in-core extent records which
28 * it manages. Each extent buffer is 4k in size, since 4k is the system
29 * page size on Linux i386 and systems with larger page sizes don't seem
30 * to gain much, if anything, by using their native page size as the
31 * extent buffer size. Also, using 4k extent buffers everywhere provides
32 * a consistent interface for CXFS across different platforms.
33 *
34 * There is currently no limit on the number of irec's (extent lists)
35 * allowed, so heavily fragmented files may require an indirection array
36 * which spans multiple system pages of memory. The number of extents
37 * which would require this amount of contiguous memory is very large
38 * and should not cause problems in the foreseeable future. However,
39 * if the memory needed for the contiguous array ever becomes a problem,
40 * it is possible that a third level of indirection may be required.
41 */
typedef struct xfs_ext_irec {
	xfs_bmbt_rec_host_t *er_extbuf;	/* block of extent records */
	xfs_extnum_t	er_extoff;	/* file extent index of the first
					   record in er_extbuf */
	xfs_extnum_t	er_extcount;	/* number of extents in page/block;
					   at most XFS_LINEAR_EXTS */
} xfs_ext_irec_t;
47
48/*
49 * File incore extent information, present for each of data & attr forks.
50 */
51#define XFS_IEXT_BUFSZ 4096
52#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t))
53#define XFS_INLINE_EXTS 2
54#define XFS_INLINE_DATA 32
typedef struct xfs_ifork {
	int			if_bytes;	/* bytes in if_u1 */
	/* if_real_bytes == 0 means the records live inline in if_u2 */
	int			if_real_bytes;	/* bytes allocated in if_u1 */
	struct xfs_btree_block	*if_broot;	/* file's incore btree root */
	short			if_broot_bytes;	/* bytes allocated for root */
	unsigned char		if_flags;	/* per-fork flags (XFS_IF*) */
	union {
		xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
		xfs_ext_irec_t	*if_ext_irec;	/* irec map file exts */
		char		*if_data;	/* inline file data */
	} if_u1;
	union {
		xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS];
					/* very small file extents */
		char		if_inline_data[XFS_INLINE_DATA];
					/* very small file data */
		xfs_dev_t	if_rdev;	/* dev number if special */
		uuid_t		if_uuid;	/* mount point value */
	} if_u2;
} xfs_ifork_t;
75
76/*
77 * Per-fork incore inode flags.
78 */
79#define XFS_IFINLINE 0x01 /* Inline data is read in */
80#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
81#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
82#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
83
84/*
85 * Fork handling.
86 */
87
88#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0)
89#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3))
90
91#define XFS_IFORK_PTR(ip,w) \
92 ((w) == XFS_DATA_FORK ? \
93 &(ip)->i_df : \
94 (ip)->i_afp)
95#define XFS_IFORK_DSIZE(ip) \
96 (XFS_IFORK_Q(ip) ? \
97 XFS_IFORK_BOFF(ip) : \
98 XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
99#define XFS_IFORK_ASIZE(ip) \
100 (XFS_IFORK_Q(ip) ? \
101 XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
102 XFS_IFORK_BOFF(ip) : \
103 0)
104#define XFS_IFORK_SIZE(ip,w) \
105 ((w) == XFS_DATA_FORK ? \
106 XFS_IFORK_DSIZE(ip) : \
107 XFS_IFORK_ASIZE(ip))
108#define XFS_IFORK_FORMAT(ip,w) \
109 ((w) == XFS_DATA_FORK ? \
110 (ip)->i_d.di_format : \
111 (ip)->i_d.di_aformat)
112#define XFS_IFORK_FMT_SET(ip,w,n) \
113 ((w) == XFS_DATA_FORK ? \
114 ((ip)->i_d.di_format = (n)) : \
115 ((ip)->i_d.di_aformat = (n)))
116#define XFS_IFORK_NEXTENTS(ip,w) \
117 ((w) == XFS_DATA_FORK ? \
118 (ip)->i_d.di_nextents : \
119 (ip)->i_d.di_anextents)
120#define XFS_IFORK_NEXT_SET(ip,w,n) \
121 ((w) == XFS_DATA_FORK ? \
122 ((ip)->i_d.di_nextents = (n)) : \
123 ((ip)->i_d.di_anextents = (n)))
124#define XFS_IFORK_MAXEXT(ip, w) \
125 (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
126
127int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
128void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
129 struct xfs_inode_log_item *, int,
130 struct xfs_buf *);
131void xfs_idestroy_fork(struct xfs_inode *, int);
132void xfs_idata_realloc(struct xfs_inode *, int, int);
133void xfs_iroot_realloc(struct xfs_inode *, int, int);
134int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
135int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *,
136 int);
137
138struct xfs_bmbt_rec_host *
139 xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t);
140void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t,
141 struct xfs_bmbt_irec *, int);
142void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int);
143void xfs_iext_add_indirect_multi(struct xfs_ifork *, int,
144 xfs_extnum_t, int);
145void xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int);
146void xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int);
147void xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int);
148void xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int);
149void xfs_iext_realloc_direct(struct xfs_ifork *, int);
150void xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t);
151void xfs_iext_inline_to_direct(struct xfs_ifork *, int);
152void xfs_iext_destroy(struct xfs_ifork *);
153struct xfs_bmbt_rec_host *
154 xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *);
155struct xfs_ext_irec *
156 xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *);
157struct xfs_ext_irec *
158 xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *,
159 int);
160void xfs_iext_irec_init(struct xfs_ifork *);
161struct xfs_ext_irec *
162 xfs_iext_irec_new(struct xfs_ifork *, int);
163void xfs_iext_irec_remove(struct xfs_ifork *, int);
164void xfs_iext_irec_compact(struct xfs_ifork *);
165void xfs_iext_irec_compact_pages(struct xfs_ifork *);
166void xfs_iext_irec_compact_full(struct xfs_ifork *);
167void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
168
169extern struct kmem_zone *xfs_ifork_zone;
170
171#endif /* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index f76ff52e43c0..378081109844 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -47,32 +47,44 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
47 * inode core, and possibly one for the inode data/extents/b-tree root 47 * inode core, and possibly one for the inode data/extents/b-tree root
48 * and one for the inode attribute data/extents/b-tree root. 48 * and one for the inode attribute data/extents/b-tree root.
49 */ 49 */
50STATIC uint 50STATIC void
51xfs_inode_item_size( 51xfs_inode_item_size(
52 struct xfs_log_item *lip) 52 struct xfs_log_item *lip,
53 int *nvecs,
54 int *nbytes)
53{ 55{
54 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 56 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
55 struct xfs_inode *ip = iip->ili_inode; 57 struct xfs_inode *ip = iip->ili_inode;
56 uint nvecs = 2; 58
59 *nvecs += 2;
60 *nbytes += sizeof(struct xfs_inode_log_format) +
61 xfs_icdinode_size(ip->i_d.di_version);
57 62
58 switch (ip->i_d.di_format) { 63 switch (ip->i_d.di_format) {
59 case XFS_DINODE_FMT_EXTENTS: 64 case XFS_DINODE_FMT_EXTENTS:
60 if ((iip->ili_fields & XFS_ILOG_DEXT) && 65 if ((iip->ili_fields & XFS_ILOG_DEXT) &&
61 ip->i_d.di_nextents > 0 && 66 ip->i_d.di_nextents > 0 &&
62 ip->i_df.if_bytes > 0) 67 ip->i_df.if_bytes > 0) {
63 nvecs++; 68 /* worst case, doesn't subtract delalloc extents */
69 *nbytes += XFS_IFORK_DSIZE(ip);
70 *nvecs += 1;
71 }
64 break; 72 break;
65 73
66 case XFS_DINODE_FMT_BTREE: 74 case XFS_DINODE_FMT_BTREE:
67 if ((iip->ili_fields & XFS_ILOG_DBROOT) && 75 if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
68 ip->i_df.if_broot_bytes > 0) 76 ip->i_df.if_broot_bytes > 0) {
69 nvecs++; 77 *nbytes += ip->i_df.if_broot_bytes;
78 *nvecs += 1;
79 }
70 break; 80 break;
71 81
72 case XFS_DINODE_FMT_LOCAL: 82 case XFS_DINODE_FMT_LOCAL:
73 if ((iip->ili_fields & XFS_ILOG_DDATA) && 83 if ((iip->ili_fields & XFS_ILOG_DDATA) &&
74 ip->i_df.if_bytes > 0) 84 ip->i_df.if_bytes > 0) {
75 nvecs++; 85 *nbytes += roundup(ip->i_df.if_bytes, 4);
86 *nvecs += 1;
87 }
76 break; 88 break;
77 89
78 case XFS_DINODE_FMT_DEV: 90 case XFS_DINODE_FMT_DEV:
@@ -85,7 +97,7 @@ xfs_inode_item_size(
85 } 97 }
86 98
87 if (!XFS_IFORK_Q(ip)) 99 if (!XFS_IFORK_Q(ip))
88 return nvecs; 100 return;
89 101
90 102
91 /* 103 /*
@@ -95,28 +107,33 @@ xfs_inode_item_size(
95 case XFS_DINODE_FMT_EXTENTS: 107 case XFS_DINODE_FMT_EXTENTS:
96 if ((iip->ili_fields & XFS_ILOG_AEXT) && 108 if ((iip->ili_fields & XFS_ILOG_AEXT) &&
97 ip->i_d.di_anextents > 0 && 109 ip->i_d.di_anextents > 0 &&
98 ip->i_afp->if_bytes > 0) 110 ip->i_afp->if_bytes > 0) {
99 nvecs++; 111 /* worst case, doesn't subtract unused space */
112 *nbytes += XFS_IFORK_ASIZE(ip);
113 *nvecs += 1;
114 }
100 break; 115 break;
101 116
102 case XFS_DINODE_FMT_BTREE: 117 case XFS_DINODE_FMT_BTREE:
103 if ((iip->ili_fields & XFS_ILOG_ABROOT) && 118 if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
104 ip->i_afp->if_broot_bytes > 0) 119 ip->i_afp->if_broot_bytes > 0) {
105 nvecs++; 120 *nbytes += ip->i_afp->if_broot_bytes;
121 *nvecs += 1;
122 }
106 break; 123 break;
107 124
108 case XFS_DINODE_FMT_LOCAL: 125 case XFS_DINODE_FMT_LOCAL:
109 if ((iip->ili_fields & XFS_ILOG_ADATA) && 126 if ((iip->ili_fields & XFS_ILOG_ADATA) &&
110 ip->i_afp->if_bytes > 0) 127 ip->i_afp->if_bytes > 0) {
111 nvecs++; 128 *nbytes += roundup(ip->i_afp->if_bytes, 4);
129 *nvecs += 1;
130 }
112 break; 131 break;
113 132
114 default: 133 default:
115 ASSERT(0); 134 ASSERT(0);
116 break; 135 break;
117 } 136 }
118
119 return nvecs;
120} 137}
121 138
122/* 139/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 779812fb3d80..dce4d656768c 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -18,123 +18,13 @@
18#ifndef __XFS_INODE_ITEM_H__ 18#ifndef __XFS_INODE_ITEM_H__
19#define __XFS_INODE_ITEM_H__ 19#define __XFS_INODE_ITEM_H__
20 20
21/* 21/* kernel only definitions */
22 * This is the structure used to lay out an inode log item in the
23 * log. The size of the inline data/extents/b-tree root to be logged
24 * (if any) is indicated in the ilf_dsize field. Changes to this structure
25 * must be added on to the end.
26 */
27typedef struct xfs_inode_log_format {
28 __uint16_t ilf_type; /* inode log item type */
29 __uint16_t ilf_size; /* size of this item */
30 __uint32_t ilf_fields; /* flags for fields logged */
31 __uint16_t ilf_asize; /* size of attr d/ext/root */
32 __uint16_t ilf_dsize; /* size of data/ext/root */
33 __uint64_t ilf_ino; /* inode number */
34 union {
35 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
36 uuid_t ilfu_uuid; /* mount point value */
37 } ilf_u;
38 __int64_t ilf_blkno; /* blkno of inode buffer */
39 __int32_t ilf_len; /* len of inode buffer */
40 __int32_t ilf_boffset; /* off of inode in buffer */
41} xfs_inode_log_format_t;
42
43typedef struct xfs_inode_log_format_32 {
44 __uint16_t ilf_type; /* inode log item type */
45 __uint16_t ilf_size; /* size of this item */
46 __uint32_t ilf_fields; /* flags for fields logged */
47 __uint16_t ilf_asize; /* size of attr d/ext/root */
48 __uint16_t ilf_dsize; /* size of data/ext/root */
49 __uint64_t ilf_ino; /* inode number */
50 union {
51 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
52 uuid_t ilfu_uuid; /* mount point value */
53 } ilf_u;
54 __int64_t ilf_blkno; /* blkno of inode buffer */
55 __int32_t ilf_len; /* len of inode buffer */
56 __int32_t ilf_boffset; /* off of inode in buffer */
57} __attribute__((packed)) xfs_inode_log_format_32_t;
58
59typedef struct xfs_inode_log_format_64 {
60 __uint16_t ilf_type; /* inode log item type */
61 __uint16_t ilf_size; /* size of this item */
62 __uint32_t ilf_fields; /* flags for fields logged */
63 __uint16_t ilf_asize; /* size of attr d/ext/root */
64 __uint16_t ilf_dsize; /* size of data/ext/root */
65 __uint32_t ilf_pad; /* pad for 64 bit boundary */
66 __uint64_t ilf_ino; /* inode number */
67 union {
68 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
69 uuid_t ilfu_uuid; /* mount point value */
70 } ilf_u;
71 __int64_t ilf_blkno; /* blkno of inode buffer */
72 __int32_t ilf_len; /* len of inode buffer */
73 __int32_t ilf_boffset; /* off of inode in buffer */
74} xfs_inode_log_format_64_t;
75
76/*
77 * Flags for xfs_trans_log_inode flags field.
78 */
79#define XFS_ILOG_CORE 0x001 /* log standard inode fields */
80#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */
81#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */
82#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */
83#define XFS_ILOG_DEV 0x010 /* log the dev field */
84#define XFS_ILOG_UUID 0x020 /* log the uuid field */
85#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
86#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
87#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
88
89
90/*
91 * The timestamps are dirty, but not necessarily anything else in the inode
92 * core. Unlike the other fields above this one must never make it to disk
93 * in the ilf_fields of the inode_log_format, but is purely store in-memory in
94 * ili_fields in the inode_log_item.
95 */
96#define XFS_ILOG_TIMESTAMP 0x4000
97
98#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
99 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
100 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
101 XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
102
103#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
104 XFS_ILOG_DBROOT)
105
106#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
107 XFS_ILOG_ABROOT)
108
109#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
110 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
111 XFS_ILOG_DEV | XFS_ILOG_UUID | \
112 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
113 XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
114
115static inline int xfs_ilog_fbroot(int w)
116{
117 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
118}
119
120static inline int xfs_ilog_fext(int w)
121{
122 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
123}
124
125static inline int xfs_ilog_fdata(int w)
126{
127 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
128}
129
130#ifdef __KERNEL__
131 22
132struct xfs_buf; 23struct xfs_buf;
133struct xfs_bmbt_rec; 24struct xfs_bmbt_rec;
134struct xfs_inode; 25struct xfs_inode;
135struct xfs_mount; 26struct xfs_mount;
136 27
137
138typedef struct xfs_inode_log_item { 28typedef struct xfs_inode_log_item {
139 xfs_log_item_t ili_item; /* common portion */ 29 xfs_log_item_t ili_item; /* common portion */
140 struct xfs_inode *ili_inode; /* inode ptr */ 30 struct xfs_inode *ili_inode; /* inode ptr */
@@ -151,7 +41,6 @@ typedef struct xfs_inode_log_item {
151 xfs_inode_log_format_t ili_format; /* logged structure */ 41 xfs_inode_log_format_t ili_format; /* logged structure */
152} xfs_inode_log_item_t; 42} xfs_inode_log_item_t;
153 43
154
155static inline int xfs_inode_clean(xfs_inode_t *ip) 44static inline int xfs_inode_clean(xfs_inode_t *ip)
156{ 45{
157 return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL); 46 return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
@@ -165,6 +54,6 @@ extern void xfs_iflush_abort(struct xfs_inode *, bool);
165extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, 54extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
166 xfs_inode_log_format_t *); 55 xfs_inode_log_format_t *);
167 56
168#endif /* __KERNEL__ */ 57extern struct kmem_zone *xfs_ili_zone;
169 58
170#endif /* __XFS_INODE_ITEM_H__ */ 59#endif /* __XFS_INODE_ITEM_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 6e2bca5d44d6..bdebc21078d7 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_trans.h" 22#include "xfs_trans.h"
22#include "xfs_sb.h" 23#include "xfs_sb.h"
@@ -32,17 +33,16 @@
32#include "xfs_error.h" 33#include "xfs_error.h"
33#include "xfs_attr.h" 34#include "xfs_attr.h"
34#include "xfs_bmap.h" 35#include "xfs_bmap.h"
36#include "xfs_bmap_util.h"
35#include "xfs_buf_item.h" 37#include "xfs_buf_item.h"
36#include "xfs_utils.h"
37#include "xfs_dfrag.h"
38#include "xfs_fsops.h" 38#include "xfs_fsops.h"
39#include "xfs_vnodeops.h"
40#include "xfs_discard.h" 39#include "xfs_discard.h"
41#include "xfs_quota.h" 40#include "xfs_quota.h"
42#include "xfs_inode_item.h" 41#include "xfs_inode_item.h"
43#include "xfs_export.h" 42#include "xfs_export.h"
44#include "xfs_trace.h" 43#include "xfs_trace.h"
45#include "xfs_icache.h" 44#include "xfs_icache.h"
45#include "xfs_symlink.h"
46 46
47#include <linux/capability.h> 47#include <linux/capability.h>
48#include <linux/dcache.h> 48#include <linux/dcache.h>
@@ -350,6 +350,40 @@ xfs_readlink_by_handle(
350 return error; 350 return error;
351} 351}
352 352
353int
354xfs_set_dmattrs(
355 xfs_inode_t *ip,
356 u_int evmask,
357 u_int16_t state)
358{
359 xfs_mount_t *mp = ip->i_mount;
360 xfs_trans_t *tp;
361 int error;
362
363 if (!capable(CAP_SYS_ADMIN))
364 return XFS_ERROR(EPERM);
365
366 if (XFS_FORCED_SHUTDOWN(mp))
367 return XFS_ERROR(EIO);
368
369 tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
370 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
371 if (error) {
372 xfs_trans_cancel(tp, 0);
373 return error;
374 }
375 xfs_ilock(ip, XFS_ILOCK_EXCL);
376 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
377
378 ip->i_d.di_dmevmask = evmask;
379 ip->i_d.di_dmstate = state;
380
381 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
382 error = xfs_trans_commit(tp, 0);
383
384 return error;
385}
386
353STATIC int 387STATIC int
354xfs_fssetdm_by_handle( 388xfs_fssetdm_by_handle(
355 struct file *parfilp, 389 struct file *parfilp,
@@ -967,7 +1001,7 @@ xfs_ioctl_setattr(
967 * first do an error checking pass. 1001 * first do an error checking pass.
968 */ 1002 */
969 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); 1003 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
970 code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); 1004 code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
971 if (code) 1005 if (code)
972 goto error_return; 1006 goto error_return;
973 1007
@@ -981,15 +1015,22 @@ xfs_ioctl_setattr(
981 * to the file owner ID, except in cases where the 1015 * to the file owner ID, except in cases where the
982 * CAP_FSETID capability is applicable. 1016 * CAP_FSETID capability is applicable.
983 */ 1017 */
984 if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) { 1018 if (!inode_owner_or_capable(VFS_I(ip))) {
985 code = XFS_ERROR(EPERM); 1019 code = XFS_ERROR(EPERM);
986 goto error_return; 1020 goto error_return;
987 } 1021 }
988 1022
989 /* 1023 /*
990 * Do a quota reservation only if projid is actually going to change. 1024 * Do a quota reservation only if projid is actually going to change.
1025 * Only allow changing of projid from init_user_ns since it is a
1026 * non user namespace aware identifier.
991 */ 1027 */
992 if (mask & FSX_PROJID) { 1028 if (mask & FSX_PROJID) {
1029 if (current_user_ns() != &init_user_ns) {
1030 code = XFS_ERROR(EINVAL);
1031 goto error_return;
1032 }
1033
993 if (XFS_IS_QUOTA_RUNNING(mp) && 1034 if (XFS_IS_QUOTA_RUNNING(mp) &&
994 XFS_IS_PQUOTA_ON(mp) && 1035 XFS_IS_PQUOTA_ON(mp) &&
995 xfs_get_projid(ip) != fa->fsx_projid) { 1036 xfs_get_projid(ip) != fa->fsx_projid) {
@@ -1103,7 +1144,7 @@ xfs_ioctl_setattr(
1103 * cleared upon successful return from chown() 1144 * cleared upon successful return from chown()
1104 */ 1145 */
1105 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && 1146 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
1106 !capable(CAP_FSETID)) 1147 !inode_capable(VFS_I(ip), CAP_FSETID))
1107 ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); 1148 ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
1108 1149
1109 /* 1150 /*
@@ -1328,6 +1369,75 @@ xfs_ioc_getbmapx(
1328 return 0; 1369 return 0;
1329} 1370}
1330 1371
1372int
1373xfs_ioc_swapext(
1374 xfs_swapext_t *sxp)
1375{
1376 xfs_inode_t *ip, *tip;
1377 struct fd f, tmp;
1378 int error = 0;
1379
1380 /* Pull information for the target fd */
1381 f = fdget((int)sxp->sx_fdtarget);
1382 if (!f.file) {
1383 error = XFS_ERROR(EINVAL);
1384 goto out;
1385 }
1386
1387 if (!(f.file->f_mode & FMODE_WRITE) ||
1388 !(f.file->f_mode & FMODE_READ) ||
1389 (f.file->f_flags & O_APPEND)) {
1390 error = XFS_ERROR(EBADF);
1391 goto out_put_file;
1392 }
1393
1394 tmp = fdget((int)sxp->sx_fdtmp);
1395 if (!tmp.file) {
1396 error = XFS_ERROR(EINVAL);
1397 goto out_put_file;
1398 }
1399
1400 if (!(tmp.file->f_mode & FMODE_WRITE) ||
1401 !(tmp.file->f_mode & FMODE_READ) ||
1402 (tmp.file->f_flags & O_APPEND)) {
1403 error = XFS_ERROR(EBADF);
1404 goto out_put_tmp_file;
1405 }
1406
1407 if (IS_SWAPFILE(file_inode(f.file)) ||
1408 IS_SWAPFILE(file_inode(tmp.file))) {
1409 error = XFS_ERROR(EINVAL);
1410 goto out_put_tmp_file;
1411 }
1412
1413 ip = XFS_I(file_inode(f.file));
1414 tip = XFS_I(file_inode(tmp.file));
1415
1416 if (ip->i_mount != tip->i_mount) {
1417 error = XFS_ERROR(EINVAL);
1418 goto out_put_tmp_file;
1419 }
1420
1421 if (ip->i_ino == tip->i_ino) {
1422 error = XFS_ERROR(EINVAL);
1423 goto out_put_tmp_file;
1424 }
1425
1426 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1427 error = XFS_ERROR(EIO);
1428 goto out_put_tmp_file;
1429 }
1430
1431 error = xfs_swap_extents(ip, tip, sxp);
1432
1433 out_put_tmp_file:
1434 fdput(tmp);
1435 out_put_file:
1436 fdput(f);
1437 out:
1438 return error;
1439}
1440
1331/* 1441/*
1332 * Note: some of the ioctl's return positive numbers as a 1442 * Note: some of the ioctl's return positive numbers as a
1333 * byte count indicating success, such as readlink_by_handle. 1443 * byte count indicating success, such as readlink_by_handle.
@@ -1472,7 +1582,7 @@ xfs_file_ioctl(
1472 error = mnt_want_write_file(filp); 1582 error = mnt_want_write_file(filp);
1473 if (error) 1583 if (error)
1474 return error; 1584 return error;
1475 error = xfs_swapext(&sxp); 1585 error = xfs_ioc_swapext(&sxp);
1476 mnt_drop_write_file(filp); 1586 mnt_drop_write_file(filp);
1477 return -error; 1587 return -error;
1478 } 1588 }
@@ -1610,23 +1720,23 @@ xfs_file_ioctl(
1610 return -error; 1720 return -error;
1611 1721
1612 case XFS_IOC_FREE_EOFBLOCKS: { 1722 case XFS_IOC_FREE_EOFBLOCKS: {
1613 struct xfs_eofblocks eofb; 1723 struct xfs_fs_eofblocks eofb;
1724 struct xfs_eofblocks keofb;
1614 1725
1615 if (copy_from_user(&eofb, arg, sizeof(eofb))) 1726 if (!capable(CAP_SYS_ADMIN))
1616 return -XFS_ERROR(EFAULT); 1727 return -EPERM;
1617 1728
1618 if (eofb.eof_version != XFS_EOFBLOCKS_VERSION) 1729 if (mp->m_flags & XFS_MOUNT_RDONLY)
1619 return -XFS_ERROR(EINVAL); 1730 return -XFS_ERROR(EROFS);
1620 1731
1621 if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID) 1732 if (copy_from_user(&eofb, arg, sizeof(eofb)))
1622 return -XFS_ERROR(EINVAL); 1733 return -XFS_ERROR(EFAULT);
1623 1734
1624 if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) || 1735 error = xfs_fs_eofblocks_from_user(&eofb, &keofb);
1625 memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64))) 1736 if (error)
1626 return -XFS_ERROR(EINVAL); 1737 return -error;
1627 1738
1628 error = xfs_icache_free_eofblocks(mp, &eofb); 1739 return -xfs_icache_free_eofblocks(mp, &keofb);
1629 return -error;
1630 } 1740 }
1631 1741
1632 default: 1742 default:
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index d56173b34a2a..77c02c7900b6 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -27,6 +27,10 @@ xfs_ioc_space(
27 unsigned int cmd, 27 unsigned int cmd,
28 xfs_flock64_t *bf); 28 xfs_flock64_t *bf);
29 29
30int
31xfs_ioc_swapext(
32 xfs_swapext_t *sxp);
33
30extern int 34extern int
31xfs_find_handle( 35xfs_find_handle(
32 unsigned int cmd, 36 unsigned int cmd,
@@ -82,4 +86,10 @@ xfs_file_compat_ioctl(
82 unsigned int cmd, 86 unsigned int cmd,
83 unsigned long arg); 87 unsigned long arg);
84 88
89extern int
90xfs_set_dmattrs(
91 struct xfs_inode *ip,
92 u_int evmask,
93 u_int16_t state);
94
85#endif 95#endif
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index c0c66259cc91..d3ab9534307f 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -33,8 +33,6 @@
33#include "xfs_inode.h" 33#include "xfs_inode.h"
34#include "xfs_itable.h" 34#include "xfs_itable.h"
35#include "xfs_error.h" 35#include "xfs_error.h"
36#include "xfs_dfrag.h"
37#include "xfs_vnodeops.h"
38#include "xfs_fsops.h" 36#include "xfs_fsops.h"
39#include "xfs_alloc.h" 37#include "xfs_alloc.h"
40#include "xfs_rtalloc.h" 38#include "xfs_rtalloc.h"
@@ -644,7 +642,7 @@ xfs_file_compat_ioctl(
644 error = mnt_want_write_file(filp); 642 error = mnt_want_write_file(filp);
645 if (error) 643 if (error)
646 return error; 644 return error;
647 error = xfs_swapext(&sxp); 645 error = xfs_ioc_swapext(&sxp);
648 mnt_drop_write_file(filp); 646 mnt_drop_write_file(filp);
649 return -error; 647 return -error;
650 } 648 }
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 6a7096422295..8d4d49b6fbf3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_trans.h" 22#include "xfs_trans.h"
22#include "xfs_sb.h" 23#include "xfs_sb.h"
@@ -32,13 +33,13 @@
32#include "xfs_inode_item.h" 33#include "xfs_inode_item.h"
33#include "xfs_btree.h" 34#include "xfs_btree.h"
34#include "xfs_bmap.h" 35#include "xfs_bmap.h"
36#include "xfs_bmap_util.h"
35#include "xfs_rtalloc.h" 37#include "xfs_rtalloc.h"
36#include "xfs_error.h" 38#include "xfs_error.h"
37#include "xfs_itable.h" 39#include "xfs_itable.h"
38#include "xfs_attr.h" 40#include "xfs_attr.h"
39#include "xfs_buf_item.h" 41#include "xfs_buf_item.h"
40#include "xfs_trans_space.h" 42#include "xfs_trans_space.h"
41#include "xfs_utils.h"
42#include "xfs_iomap.h" 43#include "xfs_iomap.h"
43#include "xfs_trace.h" 44#include "xfs_trace.h"
44#include "xfs_icache.h" 45#include "xfs_icache.h"
@@ -187,10 +188,8 @@ xfs_iomap_write_direct(
187 * Allocate and setup the transaction 188 * Allocate and setup the transaction
188 */ 189 */
189 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); 190 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
190 error = xfs_trans_reserve(tp, resblks, 191 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
191 XFS_WRITE_LOG_RES(mp), resrtextents, 192 resblks, resrtextents);
192 XFS_TRANS_PERM_LOG_RES,
193 XFS_WRITE_LOG_COUNT);
194 /* 193 /*
195 * Check for running out of space, note: need lock to return 194 * Check for running out of space, note: need lock to return
196 */ 195 */
@@ -698,10 +697,8 @@ xfs_iomap_write_allocate(
698 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); 697 tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
699 tp->t_flags |= XFS_TRANS_RESERVE; 698 tp->t_flags |= XFS_TRANS_RESERVE;
700 nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); 699 nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
701 error = xfs_trans_reserve(tp, nres, 700 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
702 XFS_WRITE_LOG_RES(mp), 701 nres, 0);
703 0, XFS_TRANS_PERM_LOG_RES,
704 XFS_WRITE_LOG_COUNT);
705 if (error) { 702 if (error) {
706 xfs_trans_cancel(tp, 0); 703 xfs_trans_cancel(tp, 0);
707 return XFS_ERROR(error); 704 return XFS_ERROR(error);
@@ -864,10 +861,8 @@ xfs_iomap_write_unwritten(
864 sb_start_intwrite(mp->m_super); 861 sb_start_intwrite(mp->m_super);
865 tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); 862 tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
866 tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; 863 tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
867 error = xfs_trans_reserve(tp, resblks, 864 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
868 XFS_WRITE_LOG_RES(mp), 0, 865 resblks, 0);
869 XFS_TRANS_PERM_LOG_RES,
870 XFS_WRITE_LOG_COUNT);
871 if (error) { 866 if (error) {
872 xfs_trans_cancel(tp, 0); 867 xfs_trans_cancel(tp, 0);
873 return XFS_ERROR(error); 868 return XFS_ERROR(error);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 96dda62d497b..2b8952d9199b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_acl.h" 21#include "xfs_acl.h"
21#include "xfs_log.h" 22#include "xfs_log.h"
22#include "xfs_trans.h" 23#include "xfs_trans.h"
@@ -29,16 +30,19 @@
29#include "xfs_dinode.h" 30#include "xfs_dinode.h"
30#include "xfs_inode.h" 31#include "xfs_inode.h"
31#include "xfs_bmap.h" 32#include "xfs_bmap.h"
33#include "xfs_bmap_util.h"
32#include "xfs_rtalloc.h" 34#include "xfs_rtalloc.h"
33#include "xfs_error.h" 35#include "xfs_error.h"
34#include "xfs_itable.h" 36#include "xfs_itable.h"
35#include "xfs_attr.h" 37#include "xfs_attr.h"
36#include "xfs_buf_item.h" 38#include "xfs_buf_item.h"
37#include "xfs_utils.h"
38#include "xfs_vnodeops.h"
39#include "xfs_inode_item.h" 39#include "xfs_inode_item.h"
40#include "xfs_trace.h" 40#include "xfs_trace.h"
41#include "xfs_icache.h" 41#include "xfs_icache.h"
42#include "xfs_symlink.h"
43#include "xfs_da_btree.h"
44#include "xfs_dir2_format.h"
45#include "xfs_dir2_priv.h"
42 46
43#include <linux/capability.h> 47#include <linux/capability.h>
44#include <linux/xattr.h> 48#include <linux/xattr.h>
@@ -87,10 +91,12 @@ xfs_init_security(
87static void 91static void
88xfs_dentry_to_name( 92xfs_dentry_to_name(
89 struct xfs_name *namep, 93 struct xfs_name *namep,
90 struct dentry *dentry) 94 struct dentry *dentry,
95 int mode)
91{ 96{
92 namep->name = dentry->d_name.name; 97 namep->name = dentry->d_name.name;
93 namep->len = dentry->d_name.len; 98 namep->len = dentry->d_name.len;
99 namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT];
94} 100}
95 101
96STATIC void 102STATIC void
@@ -106,7 +112,7 @@ xfs_cleanup_inode(
106 * xfs_init_security we must back out. 112 * xfs_init_security we must back out.
107 * ENOSPC can hit here, among other things. 113 * ENOSPC can hit here, among other things.
108 */ 114 */
109 xfs_dentry_to_name(&teardown, dentry); 115 xfs_dentry_to_name(&teardown, dentry, 0);
110 116
111 xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); 117 xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
112 iput(inode); 118 iput(inode);
@@ -146,7 +152,7 @@ xfs_vn_mknod(
146 mode &= ~current_umask(); 152 mode &= ~current_umask();
147 } 153 }
148 154
149 xfs_dentry_to_name(&name, dentry); 155 xfs_dentry_to_name(&name, dentry, mode);
150 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); 156 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
151 if (unlikely(error)) 157 if (unlikely(error))
152 goto out_free_acl; 158 goto out_free_acl;
@@ -207,7 +213,7 @@ xfs_vn_lookup(
207 if (dentry->d_name.len >= MAXNAMELEN) 213 if (dentry->d_name.len >= MAXNAMELEN)
208 return ERR_PTR(-ENAMETOOLONG); 214 return ERR_PTR(-ENAMETOOLONG);
209 215
210 xfs_dentry_to_name(&name, dentry); 216 xfs_dentry_to_name(&name, dentry, 0);
211 error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); 217 error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
212 if (unlikely(error)) { 218 if (unlikely(error)) {
213 if (unlikely(error != ENOENT)) 219 if (unlikely(error != ENOENT))
@@ -234,7 +240,7 @@ xfs_vn_ci_lookup(
234 if (dentry->d_name.len >= MAXNAMELEN) 240 if (dentry->d_name.len >= MAXNAMELEN)
235 return ERR_PTR(-ENAMETOOLONG); 241 return ERR_PTR(-ENAMETOOLONG);
236 242
237 xfs_dentry_to_name(&xname, dentry); 243 xfs_dentry_to_name(&xname, dentry, 0);
238 error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); 244 error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
239 if (unlikely(error)) { 245 if (unlikely(error)) {
240 if (unlikely(error != ENOENT)) 246 if (unlikely(error != ENOENT))
@@ -269,7 +275,7 @@ xfs_vn_link(
269 struct xfs_name name; 275 struct xfs_name name;
270 int error; 276 int error;
271 277
272 xfs_dentry_to_name(&name, dentry); 278 xfs_dentry_to_name(&name, dentry, inode->i_mode);
273 279
274 error = xfs_link(XFS_I(dir), XFS_I(inode), &name); 280 error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
275 if (unlikely(error)) 281 if (unlikely(error))
@@ -288,7 +294,7 @@ xfs_vn_unlink(
288 struct xfs_name name; 294 struct xfs_name name;
289 int error; 295 int error;
290 296
291 xfs_dentry_to_name(&name, dentry); 297 xfs_dentry_to_name(&name, dentry, 0);
292 298
293 error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); 299 error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
294 if (error) 300 if (error)
@@ -318,7 +324,7 @@ xfs_vn_symlink(
318 324
319 mode = S_IFLNK | 325 mode = S_IFLNK |
320 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); 326 (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
321 xfs_dentry_to_name(&name, dentry); 327 xfs_dentry_to_name(&name, dentry, mode);
322 328
323 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); 329 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
324 if (unlikely(error)) 330 if (unlikely(error))
@@ -350,12 +356,12 @@ xfs_vn_rename(
350 struct xfs_name oname; 356 struct xfs_name oname;
351 struct xfs_name nname; 357 struct xfs_name nname;
352 358
353 xfs_dentry_to_name(&oname, odentry); 359 xfs_dentry_to_name(&oname, odentry, 0);
354 xfs_dentry_to_name(&nname, ndentry); 360 xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);
355 361
356 return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), 362 return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
357 XFS_I(ndir), &nname, new_inode ? 363 XFS_I(ndir), &nname, new_inode ?
358 XFS_I(new_inode) : NULL); 364 XFS_I(new_inode) : NULL);
359} 365}
360 366
361/* 367/*
@@ -420,8 +426,8 @@ xfs_vn_getattr(
420 stat->dev = inode->i_sb->s_dev; 426 stat->dev = inode->i_sb->s_dev;
421 stat->mode = ip->i_d.di_mode; 427 stat->mode = ip->i_d.di_mode;
422 stat->nlink = ip->i_d.di_nlink; 428 stat->nlink = ip->i_d.di_nlink;
423 stat->uid = ip->i_d.di_uid; 429 stat->uid = inode->i_uid;
424 stat->gid = ip->i_d.di_gid; 430 stat->gid = inode->i_gid;
425 stat->ino = ip->i_ino; 431 stat->ino = ip->i_ino;
426 stat->atime = inode->i_atime; 432 stat->atime = inode->i_atime;
427 stat->mtime = inode->i_mtime; 433 stat->mtime = inode->i_mtime;
@@ -485,8 +491,8 @@ xfs_setattr_nonsize(
485 int mask = iattr->ia_valid; 491 int mask = iattr->ia_valid;
486 xfs_trans_t *tp; 492 xfs_trans_t *tp;
487 int error; 493 int error;
488 uid_t uid = 0, iuid = 0; 494 kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
489 gid_t gid = 0, igid = 0; 495 kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
490 struct xfs_dquot *udqp = NULL, *gdqp = NULL; 496 struct xfs_dquot *udqp = NULL, *gdqp = NULL;
491 struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; 497 struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL;
492 498
@@ -522,13 +528,13 @@ xfs_setattr_nonsize(
522 uid = iattr->ia_uid; 528 uid = iattr->ia_uid;
523 qflags |= XFS_QMOPT_UQUOTA; 529 qflags |= XFS_QMOPT_UQUOTA;
524 } else { 530 } else {
525 uid = ip->i_d.di_uid; 531 uid = inode->i_uid;
526 } 532 }
527 if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { 533 if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
528 gid = iattr->ia_gid; 534 gid = iattr->ia_gid;
529 qflags |= XFS_QMOPT_GQUOTA; 535 qflags |= XFS_QMOPT_GQUOTA;
530 } else { 536 } else {
531 gid = ip->i_d.di_gid; 537 gid = inode->i_gid;
532 } 538 }
533 539
534 /* 540 /*
@@ -538,14 +544,16 @@ xfs_setattr_nonsize(
538 */ 544 */
539 ASSERT(udqp == NULL); 545 ASSERT(udqp == NULL);
540 ASSERT(gdqp == NULL); 546 ASSERT(gdqp == NULL);
541 error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip), 547 error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid),
542 qflags, &udqp, &gdqp, NULL); 548 xfs_kgid_to_gid(gid),
549 xfs_get_projid(ip),
550 qflags, &udqp, &gdqp, NULL);
543 if (error) 551 if (error)
544 return error; 552 return error;
545 } 553 }
546 554
547 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); 555 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
548 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); 556 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
549 if (error) 557 if (error)
550 goto out_dqrele; 558 goto out_dqrele;
551 559
@@ -561,8 +569,8 @@ xfs_setattr_nonsize(
561 * while we didn't have the inode locked, inode's dquot(s) 569 * while we didn't have the inode locked, inode's dquot(s)
562 * would have changed also. 570 * would have changed also.
563 */ 571 */
564 iuid = ip->i_d.di_uid; 572 iuid = inode->i_uid;
565 igid = ip->i_d.di_gid; 573 igid = inode->i_gid;
566 gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; 574 gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
567 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; 575 uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
568 576
@@ -571,8 +579,8 @@ xfs_setattr_nonsize(
571 * going to change. 579 * going to change.
572 */ 580 */
573 if (XFS_IS_QUOTA_RUNNING(mp) && 581 if (XFS_IS_QUOTA_RUNNING(mp) &&
574 ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || 582 ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) ||
575 (XFS_IS_GQUOTA_ON(mp) && igid != gid))) { 583 (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) {
576 ASSERT(tp); 584 ASSERT(tp);
577 error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, 585 error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
578 NULL, capable(CAP_FOWNER) ? 586 NULL, capable(CAP_FOWNER) ?
@@ -602,17 +610,17 @@ xfs_setattr_nonsize(
602 * Change the ownerships and register quota modifications 610 * Change the ownerships and register quota modifications
603 * in the transaction. 611 * in the transaction.
604 */ 612 */
605 if (iuid != uid) { 613 if (!uid_eq(iuid, uid)) {
606 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { 614 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
607 ASSERT(mask & ATTR_UID); 615 ASSERT(mask & ATTR_UID);
608 ASSERT(udqp); 616 ASSERT(udqp);
609 olddquot1 = xfs_qm_vop_chown(tp, ip, 617 olddquot1 = xfs_qm_vop_chown(tp, ip,
610 &ip->i_udquot, udqp); 618 &ip->i_udquot, udqp);
611 } 619 }
612 ip->i_d.di_uid = uid; 620 ip->i_d.di_uid = xfs_kuid_to_uid(uid);
613 inode->i_uid = uid; 621 inode->i_uid = uid;
614 } 622 }
615 if (igid != gid) { 623 if (!gid_eq(igid, gid)) {
616 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { 624 if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
617 ASSERT(!XFS_IS_PQUOTA_ON(mp)); 625 ASSERT(!XFS_IS_PQUOTA_ON(mp));
618 ASSERT(mask & ATTR_GID); 626 ASSERT(mask & ATTR_GID);
@@ -620,7 +628,7 @@ xfs_setattr_nonsize(
620 olddquot2 = xfs_qm_vop_chown(tp, ip, 628 olddquot2 = xfs_qm_vop_chown(tp, ip,
621 &ip->i_gdquot, gdqp); 629 &ip->i_gdquot, gdqp);
622 } 630 }
623 ip->i_d.di_gid = gid; 631 ip->i_d.di_gid = xfs_kgid_to_gid(gid);
624 inode->i_gid = gid; 632 inode->i_gid = gid;
625 } 633 }
626 } 634 }
@@ -807,9 +815,7 @@ xfs_setattr_size(
807 goto out_unlock; 815 goto out_unlock;
808 816
809 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); 817 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
810 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 818 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
811 XFS_TRANS_PERM_LOG_RES,
812 XFS_ITRUNCATE_LOG_COUNT);
813 if (error) 819 if (error)
814 goto out_trans_cancel; 820 goto out_trans_cancel;
815 821
@@ -932,7 +938,7 @@ xfs_vn_update_time(
932 trace_xfs_update_time(ip); 938 trace_xfs_update_time(ip);
933 939
934 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); 940 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
935 error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); 941 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
936 if (error) { 942 if (error) {
937 xfs_trans_cancel(tp, 0); 943 xfs_trans_cancel(tp, 0);
938 return -error; 944 return -error;
@@ -1173,8 +1179,8 @@ xfs_setup_inode(
1173 1179
1174 inode->i_mode = ip->i_d.di_mode; 1180 inode->i_mode = ip->i_d.di_mode;
1175 set_nlink(inode, ip->i_d.di_nlink); 1181 set_nlink(inode, ip->i_d.di_nlink);
1176 inode->i_uid = ip->i_d.di_uid; 1182 inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid);
1177 inode->i_gid = ip->i_d.di_gid; 1183 inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid);
1178 1184
1179 switch (inode->i_mode & S_IFMT) { 1185 switch (inode->i_mode & S_IFMT) {
1180 case S_IFBLK: 1186 case S_IFBLK:
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index ef41c92ce66e..d81fb41205ec 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -27,4 +27,17 @@ extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
27 27
28extern void xfs_setup_inode(struct xfs_inode *); 28extern void xfs_setup_inode(struct xfs_inode *);
29 29
30/*
31 * Internal setattr interfaces.
32 */
33#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */
34#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if op would block */
35#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
36#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
37#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */
38
39extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
40 int flags);
41extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags);
42
30#endif /* __XFS_IOPS_H__ */ 43#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 800f896a6cc4..f9bb590acc0e 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -32,6 +32,38 @@
32# define XFS_BIG_INUMS 0 32# define XFS_BIG_INUMS 0
33#endif 33#endif
34 34
35/*
36 * Kernel specific type declarations for XFS
37 */
38typedef signed char __int8_t;
39typedef unsigned char __uint8_t;
40typedef signed short int __int16_t;
41typedef unsigned short int __uint16_t;
42typedef signed int __int32_t;
43typedef unsigned int __uint32_t;
44typedef signed long long int __int64_t;
45typedef unsigned long long int __uint64_t;
46
47typedef __uint32_t inst_t; /* an instruction */
48
49typedef __s64 xfs_off_t; /* <file offset> type */
50typedef unsigned long long xfs_ino_t; /* <inode> type */
51typedef __s64 xfs_daddr_t; /* <disk address> type */
52typedef char * xfs_caddr_t; /* <core address> type */
53typedef __u32 xfs_dev_t;
54typedef __u32 xfs_nlink_t;
55
56/* __psint_t is the same size as a pointer */
57#if (BITS_PER_LONG == 32)
58typedef __int32_t __psint_t;
59typedef __uint32_t __psunsigned_t;
60#elif (BITS_PER_LONG == 64)
61typedef __int64_t __psint_t;
62typedef __uint64_t __psunsigned_t;
63#else
64#error BITS_PER_LONG must be 32 or 64
65#endif
66
35#include "xfs_types.h" 67#include "xfs_types.h"
36 68
37#include "kmem.h" 69#include "kmem.h"
@@ -114,8 +146,6 @@
114#define xfs_inherit_sync xfs_params.inherit_sync.val 146#define xfs_inherit_sync xfs_params.inherit_sync.val
115#define xfs_inherit_nodump xfs_params.inherit_nodump.val 147#define xfs_inherit_nodump xfs_params.inherit_nodump.val
116#define xfs_inherit_noatime xfs_params.inherit_noatim.val 148#define xfs_inherit_noatime xfs_params.inherit_noatim.val
117#define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val
118#define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val
119#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val 149#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
120#define xfs_rotorstep xfs_params.rotorstep.val 150#define xfs_rotorstep xfs_params.rotorstep.val
121#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val 151#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
@@ -159,6 +189,32 @@
159#define MAX(a,b) (max(a,b)) 189#define MAX(a,b) (max(a,b))
160#define howmany(x, y) (((x)+((y)-1))/(y)) 190#define howmany(x, y) (((x)+((y)-1))/(y))
161 191
192/* Kernel uid/gid conversion. These are used to convert to/from the on disk
193 * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
194 * The conversion here is type only, the value will remain the same since we
195 * are converting to the init_user_ns. The uid is later mapped to a particular
196 * user namespace value when crossing the kernel/user boundary.
197 */
198static inline __uint32_t xfs_kuid_to_uid(kuid_t uid)
199{
200 return from_kuid(&init_user_ns, uid);
201}
202
203static inline kuid_t xfs_uid_to_kuid(__uint32_t uid)
204{
205 return make_kuid(&init_user_ns, uid);
206}
207
208static inline __uint32_t xfs_kgid_to_gid(kgid_t gid)
209{
210 return from_kgid(&init_user_ns, gid);
211}
212
213static inline kgid_t xfs_gid_to_kgid(__uint32_t gid)
214{
215 return make_kgid(&init_user_ns, gid);
216}
217
162/* 218/*
163 * Various platform dependent calls that don't fit anywhere else 219 * Various platform dependent calls that don't fit anywhere else
164 */ 220 */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index d852a2b3e1fd..5372d58ef93a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -614,7 +614,8 @@ xfs_log_mount(
614 xfs_daddr_t blk_offset, 614 xfs_daddr_t blk_offset,
615 int num_bblks) 615 int num_bblks)
616{ 616{
617 int error; 617 int error = 0;
618 int min_logfsbs;
618 619
619 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 620 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
620 xfs_notice(mp, "Mounting Filesystem"); 621 xfs_notice(mp, "Mounting Filesystem");
@@ -631,6 +632,50 @@ xfs_log_mount(
631 } 632 }
632 633
633 /* 634 /*
635 * Validate the given log space and drop a critical message via syslog
636 * if the log size is too small that would lead to some unexpected
637 * situations in transaction log space reservation stage.
638 *
639 * Note: we can't just reject the mount if the validation fails. This
640 * would mean that people would have to downgrade their kernel just to
641 * remedy the situation as there is no way to grow the log (short of
642 * black magic surgery with xfs_db).
643 *
644 * We can, however, reject mounts for CRC format filesystems, as the
645 * mkfs binary being used to make the filesystem should never create a
646 * filesystem with a log that is too small.
647 */
648 min_logfsbs = xfs_log_calc_minimum_size(mp);
649
650 if (mp->m_sb.sb_logblocks < min_logfsbs) {
651 xfs_warn(mp,
652 "Log size %d blocks too small, minimum size is %d blocks",
653 mp->m_sb.sb_logblocks, min_logfsbs);
654 error = EINVAL;
655 } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
656 xfs_warn(mp,
657 "Log size %d blocks too large, maximum size is %lld blocks",
658 mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
659 error = EINVAL;
660 } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
661 xfs_warn(mp,
662 "log size %lld bytes too large, maximum size is %lld bytes",
663 XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
664 XFS_MAX_LOG_BYTES);
665 error = EINVAL;
666 }
667 if (error) {
668 if (xfs_sb_version_hascrc(&mp->m_sb)) {
669 xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
670 ASSERT(0);
671 goto out_free_log;
672 }
673 xfs_crit(mp,
674"Log size out of supported range. Continuing onwards, but if log hangs are\n"
675"experienced then please report this message in the bug report.");
676 }
677
678 /*
634 * Initialize the AIL now we have a log. 679 * Initialize the AIL now we have a log.
635 */ 680 */
636 error = xfs_trans_ail_init(mp); 681 error = xfs_trans_ail_init(mp);
@@ -720,7 +765,7 @@ xfs_log_mount_finish(xfs_mount_t *mp)
720 * Unmount record used to have a string "Unmount filesystem--" in the 765 * Unmount record used to have a string "Unmount filesystem--" in the
721 * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). 766 * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE).
722 * We just write the magic number now since that particular field isn't 767 * We just write the magic number now since that particular field isn't
723 * currently architecture converted and "nUmount" is a bit foo. 768 * currently architecture converted and "Unmount" is a bit foo.
724 * As far as I know, there weren't any dependencies on the old behaviour. 769 * As far as I know, there weren't any dependencies on the old behaviour.
725 */ 770 */
726 771
@@ -1941,7 +1986,7 @@ xlog_print_tic_res(
1941 1986
1942 xfs_alert_tag(mp, XFS_PTAG_LOGRES, 1987 xfs_alert_tag(mp, XFS_PTAG_LOGRES,
1943 "xlog_write: reservation ran out. Need to up reservation"); 1988 "xlog_write: reservation ran out. Need to up reservation");
1944 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1989 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1945} 1990}
1946 1991
1947/* 1992/*
@@ -2044,7 +2089,7 @@ xlog_write_setup_ophdr(
2044 * Set up the parameters of the region copy into the log. This has 2089 * Set up the parameters of the region copy into the log. This has
2045 * to handle region write split across multiple log buffers - this 2090 * to handle region write split across multiple log buffers - this
2046 * state is kept external to this function so that this code can 2091 * state is kept external to this function so that this code can
2047 * can be written in an obvious, self documenting manner. 2092 * be written in an obvious, self documenting manner.
2048 */ 2093 */
2049static int 2094static int
2050xlog_write_setup_copy( 2095xlog_write_setup_copy(
@@ -3391,24 +3436,17 @@ xfs_log_ticket_get(
3391} 3436}
3392 3437
3393/* 3438/*
3394 * Allocate and initialise a new log ticket. 3439 * Figure out the total log space unit (in bytes) that would be
3440 * required for a log ticket.
3395 */ 3441 */
3396struct xlog_ticket * 3442int
3397xlog_ticket_alloc( 3443xfs_log_calc_unit_res(
3398 struct xlog *log, 3444 struct xfs_mount *mp,
3399 int unit_bytes, 3445 int unit_bytes)
3400 int cnt,
3401 char client,
3402 bool permanent,
3403 xfs_km_flags_t alloc_flags)
3404{ 3446{
3405 struct xlog_ticket *tic; 3447 struct xlog *log = mp->m_log;
3406 uint num_headers; 3448 int iclog_space;
3407 int iclog_space; 3449 uint num_headers;
3408
3409 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
3410 if (!tic)
3411 return NULL;
3412 3450
3413 /* 3451 /*
3414 * Permanent reservations have up to 'cnt'-1 active log operations 3452 * Permanent reservations have up to 'cnt'-1 active log operations
@@ -3483,20 +3521,43 @@ xlog_ticket_alloc(
3483 unit_bytes += log->l_iclog_hsize; 3521 unit_bytes += log->l_iclog_hsize;
3484 3522
3485 /* for roundoff padding for transaction data and one for commit record */ 3523 /* for roundoff padding for transaction data and one for commit record */
3486 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && 3524 if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) {
3487 log->l_mp->m_sb.sb_logsunit > 1) {
3488 /* log su roundoff */ 3525 /* log su roundoff */
3489 unit_bytes += 2*log->l_mp->m_sb.sb_logsunit; 3526 unit_bytes += 2 * mp->m_sb.sb_logsunit;
3490 } else { 3527 } else {
3491 /* BB roundoff */ 3528 /* BB roundoff */
3492 unit_bytes += 2*BBSIZE; 3529 unit_bytes += 2 * BBSIZE;
3493 } 3530 }
3494 3531
3532 return unit_bytes;
3533}
3534
3535/*
3536 * Allocate and initialise a new log ticket.
3537 */
3538struct xlog_ticket *
3539xlog_ticket_alloc(
3540 struct xlog *log,
3541 int unit_bytes,
3542 int cnt,
3543 char client,
3544 bool permanent,
3545 xfs_km_flags_t alloc_flags)
3546{
3547 struct xlog_ticket *tic;
3548 int unit_res;
3549
3550 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
3551 if (!tic)
3552 return NULL;
3553
3554 unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes);
3555
3495 atomic_set(&tic->t_ref, 1); 3556 atomic_set(&tic->t_ref, 1);
3496 tic->t_task = current; 3557 tic->t_task = current;
3497 INIT_LIST_HEAD(&tic->t_queue); 3558 INIT_LIST_HEAD(&tic->t_queue);
3498 tic->t_unit_res = unit_bytes; 3559 tic->t_unit_res = unit_res;
3499 tic->t_curr_res = unit_bytes; 3560 tic->t_curr_res = unit_res;
3500 tic->t_cnt = cnt; 3561 tic->t_cnt = cnt;
3501 tic->t_ocnt = cnt; 3562 tic->t_ocnt = cnt;
3502 tic->t_tid = prandom_u32(); 3563 tic->t_tid = prandom_u32();
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index fb630e496c12..1c458487f000 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -18,14 +18,30 @@
18#ifndef __XFS_LOG_H__ 18#ifndef __XFS_LOG_H__
19#define __XFS_LOG_H__ 19#define __XFS_LOG_H__
20 20
21/* get lsn fields */ 21#include "xfs_log_format.h"
22#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
23#define BLOCK_LSN(lsn) ((uint)(lsn))
24 22
25/* this is used in a spot where we might otherwise double-endian-flip */ 23struct xfs_log_vec {
26#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0]) 24 struct xfs_log_vec *lv_next; /* next lv in build list */
25 int lv_niovecs; /* number of iovecs in lv */
26 struct xfs_log_iovec *lv_iovecp; /* iovec array */
27 struct xfs_log_item *lv_item; /* owner */
28 char *lv_buf; /* formatted buffer */
29 int lv_buf_len; /* size of formatted buffer */
30 int lv_size; /* size of allocated lv */
31};
32
33#define XFS_LOG_VEC_ORDERED (-1)
34
35/*
36 * Structure used to pass callback function and the function's argument
37 * to the log manager.
38 */
39typedef struct xfs_log_callback {
40 struct xfs_log_callback *cb_next;
41 void (*cb_func)(void *, int);
42 void *cb_arg;
43} xfs_log_callback_t;
27 44
28#ifdef __KERNEL__
29/* 45/*
30 * By comparing each component, we don't have to worry about extra 46 * By comparing each component, we don't have to worry about extra
31 * endian issues in treating two 32 bit numbers as one 64 bit number 47 * endian issues in treating two 32 bit numbers as one 64 bit number
@@ -59,67 +75,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
59 */ 75 */
60#define XFS_LOG_SYNC 0x1 76#define XFS_LOG_SYNC 0x1
61 77
62#endif /* __KERNEL__ */
63
64
65/* Log Clients */
66#define XFS_TRANSACTION 0x69
67#define XFS_VOLUME 0x2
68#define XFS_LOG 0xaa
69
70
71/* Region types for iovec's i_type */
72#define XLOG_REG_TYPE_BFORMAT 1
73#define XLOG_REG_TYPE_BCHUNK 2
74#define XLOG_REG_TYPE_EFI_FORMAT 3
75#define XLOG_REG_TYPE_EFD_FORMAT 4
76#define XLOG_REG_TYPE_IFORMAT 5
77#define XLOG_REG_TYPE_ICORE 6
78#define XLOG_REG_TYPE_IEXT 7
79#define XLOG_REG_TYPE_IBROOT 8
80#define XLOG_REG_TYPE_ILOCAL 9
81#define XLOG_REG_TYPE_IATTR_EXT 10
82#define XLOG_REG_TYPE_IATTR_BROOT 11
83#define XLOG_REG_TYPE_IATTR_LOCAL 12
84#define XLOG_REG_TYPE_QFORMAT 13
85#define XLOG_REG_TYPE_DQUOT 14
86#define XLOG_REG_TYPE_QUOTAOFF 15
87#define XLOG_REG_TYPE_LRHEADER 16
88#define XLOG_REG_TYPE_UNMOUNT 17
89#define XLOG_REG_TYPE_COMMIT 18
90#define XLOG_REG_TYPE_TRANSHDR 19
91#define XLOG_REG_TYPE_ICREATE 20
92#define XLOG_REG_TYPE_MAX 20
93
94typedef struct xfs_log_iovec {
95 void *i_addr; /* beginning address of region */
96 int i_len; /* length in bytes of region */
97 uint i_type; /* type of region */
98} xfs_log_iovec_t;
99
100struct xfs_log_vec {
101 struct xfs_log_vec *lv_next; /* next lv in build list */
102 int lv_niovecs; /* number of iovecs in lv */
103 struct xfs_log_iovec *lv_iovecp; /* iovec array */
104 struct xfs_log_item *lv_item; /* owner */
105 char *lv_buf; /* formatted buffer */
106 int lv_buf_len; /* size of formatted buffer */
107};
108
109#define XFS_LOG_VEC_ORDERED (-1)
110
111/*
112 * Structure used to pass callback function and the function's argument
113 * to the log manager.
114 */
115typedef struct xfs_log_callback {
116 struct xfs_log_callback *cb_next;
117 void (*cb_func)(void *, int);
118 void *cb_arg;
119} xfs_log_callback_t;
120
121
122#ifdef __KERNEL__
123/* Log manager interfaces */ 78/* Log manager interfaces */
124struct xfs_mount; 79struct xfs_mount;
125struct xlog_in_core; 80struct xlog_in_core;
@@ -188,5 +143,4 @@ void xfs_log_work_queue(struct xfs_mount *mp);
188void xfs_log_worker(struct work_struct *work); 143void xfs_log_worker(struct work_struct *work);
189void xfs_log_quiesce(struct xfs_mount *mp); 144void xfs_log_quiesce(struct xfs_mount *mp);
190 145
191#endif
192#endif /* __XFS_LOG_H__ */ 146#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 02b9cf3f8252..cfe97973ba36 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -80,6 +80,83 @@ xlog_cil_init_post_recovery(
80 log->l_curr_block); 80 log->l_curr_block);
81} 81}
82 82
83STATIC int
84xlog_cil_lv_item_format(
85 struct xfs_log_item *lip,
86 struct xfs_log_vec *lv)
87{
88 int index;
89 char *ptr;
90
91 /* format new vectors into array */
92 lip->li_ops->iop_format(lip, lv->lv_iovecp);
93
94 /* copy data into existing array */
95 ptr = lv->lv_buf;
96 for (index = 0; index < lv->lv_niovecs; index++) {
97 struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
98
99 memcpy(ptr, vec->i_addr, vec->i_len);
100 vec->i_addr = ptr;
101 ptr += vec->i_len;
102 }
103
104 /*
105 * some size calculations for log vectors over-estimate, so the caller
106 * doesn't know the amount of space actually used by the item. Return
107 * the byte count to the caller so they can check and store it
108 * appropriately.
109 */
110 return ptr - lv->lv_buf;
111}
112
113/*
114 * Prepare the log item for insertion into the CIL. Calculate the difference in
115 * log space and vectors it will consume, and if it is a new item pin it as
116 * well.
117 */
118STATIC void
119xfs_cil_prepare_item(
120 struct xlog *log,
121 struct xfs_log_vec *lv,
122 struct xfs_log_vec *old_lv,
123 int *diff_len,
124 int *diff_iovecs)
125{
126 /* Account for the new LV being passed in */
127 if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
128 *diff_len += lv->lv_buf_len;
129 *diff_iovecs += lv->lv_niovecs;
130 }
131
132 /*
133 * If there is no old LV, this is the first time we've seen the item in
134 * this CIL context and so we need to pin it. If we are replacing the
135 * old_lv, then remove the space it accounts for and free it.
136 */
137 if (!old_lv)
138 lv->lv_item->li_ops->iop_pin(lv->lv_item);
139 else if (old_lv != lv) {
140 ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
141
142 *diff_len -= old_lv->lv_buf_len;
143 *diff_iovecs -= old_lv->lv_niovecs;
144 kmem_free(old_lv);
145 }
146
147 /* attach new log vector to log item */
148 lv->lv_item->li_lv = lv;
149
150 /*
151 * If this is the first time the item is being committed to the
152 * CIL, store the sequence number on the log item so we can
153 * tell in future commits whether this is the first checkpoint
154 * the item is being committed into.
155 */
156 if (!lv->lv_item->li_seq)
157 lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
158}
159
83/* 160/*
84 * Format log item into a flat buffers 161 * Format log item into a flat buffers
85 * 162 *
@@ -106,35 +183,39 @@ xlog_cil_init_post_recovery(
106 * format the regions into the iclog as though they are being formatted 183 * format the regions into the iclog as though they are being formatted
107 * directly out of the objects themselves. 184 * directly out of the objects themselves.
108 */ 185 */
109static struct xfs_log_vec * 186static void
110xlog_cil_prepare_log_vecs( 187xlog_cil_insert_format_items(
111 struct xfs_trans *tp) 188 struct xlog *log,
189 struct xfs_trans *tp,
190 int *diff_len,
191 int *diff_iovecs)
112{ 192{
113 struct xfs_log_item_desc *lidp; 193 struct xfs_log_item_desc *lidp;
114 struct xfs_log_vec *lv = NULL;
115 struct xfs_log_vec *ret_lv = NULL;
116 194
117 195
118 /* Bail out if we didn't find a log item. */ 196 /* Bail out if we didn't find a log item. */
119 if (list_empty(&tp->t_items)) { 197 if (list_empty(&tp->t_items)) {
120 ASSERT(0); 198 ASSERT(0);
121 return NULL; 199 return;
122 } 200 }
123 201
124 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 202 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
125 struct xfs_log_vec *new_lv; 203 struct xfs_log_item *lip = lidp->lid_item;
126 void *ptr; 204 struct xfs_log_vec *lv;
127 int index; 205 struct xfs_log_vec *old_lv;
128 int len = 0; 206 int niovecs = 0;
129 uint niovecs; 207 int nbytes = 0;
208 int buf_size;
130 bool ordered = false; 209 bool ordered = false;
131 210
132 /* Skip items which aren't dirty in this transaction. */ 211 /* Skip items which aren't dirty in this transaction. */
133 if (!(lidp->lid_flags & XFS_LID_DIRTY)) 212 if (!(lidp->lid_flags & XFS_LID_DIRTY))
134 continue; 213 continue;
135 214
215 /* get number of vecs and size of data to be stored */
216 lip->li_ops->iop_size(lip, &niovecs, &nbytes);
217
136 /* Skip items that do not have any vectors for writing */ 218 /* Skip items that do not have any vectors for writing */
137 niovecs = IOP_SIZE(lidp->lid_item);
138 if (!niovecs) 219 if (!niovecs)
139 continue; 220 continue;
140 221
@@ -146,109 +227,63 @@ xlog_cil_prepare_log_vecs(
146 if (niovecs == XFS_LOG_VEC_ORDERED) { 227 if (niovecs == XFS_LOG_VEC_ORDERED) {
147 ordered = true; 228 ordered = true;
148 niovecs = 0; 229 niovecs = 0;
230 nbytes = 0;
149 } 231 }
150 232
151 new_lv = kmem_zalloc(sizeof(*new_lv) + 233 /* grab the old item if it exists for reservation accounting */
152 niovecs * sizeof(struct xfs_log_iovec), 234 old_lv = lip->li_lv;
153 KM_SLEEP|KM_NOFS);
154
155 new_lv->lv_item = lidp->lid_item;
156 new_lv->lv_niovecs = niovecs;
157 if (ordered) {
158 /* track as an ordered logvec */
159 new_lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
160 goto next;
161 }
162
163 /* The allocated iovec region lies beyond the log vector. */
164 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
165 235
166 /* build the vector array and calculate it's length */ 236 /* calc buffer size */
167 IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp); 237 buf_size = sizeof(struct xfs_log_vec) + nbytes +
168 for (index = 0; index < new_lv->lv_niovecs; index++) 238 niovecs * sizeof(struct xfs_log_iovec);
169 len += new_lv->lv_iovecp[index].i_len;
170 239
171 new_lv->lv_buf_len = len; 240 /* compare to existing item size */
172 new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len, 241 if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
173 KM_SLEEP|KM_NOFS); 242 /* same or smaller, optimise common overwrite case */
174 ptr = new_lv->lv_buf; 243 lv = lip->li_lv;
244 lv->lv_next = NULL;
175 245
176 for (index = 0; index < new_lv->lv_niovecs; index++) { 246 if (ordered)
177 struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index]; 247 goto insert;
178 248
179 memcpy(ptr, vec->i_addr, vec->i_len); 249 /*
180 vec->i_addr = ptr; 250 * set the item up as though it is a new insertion so
181 ptr += vec->i_len; 251 * that the space reservation accounting is correct.
182 } 252 */
183 ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len); 253 *diff_iovecs -= lv->lv_niovecs;
184 254 *diff_len -= lv->lv_buf_len;
185next:
186 if (!ret_lv)
187 ret_lv = new_lv;
188 else
189 lv->lv_next = new_lv;
190 lv = new_lv;
191 }
192
193 return ret_lv;
194}
195
196/*
197 * Prepare the log item for insertion into the CIL. Calculate the difference in
198 * log space and vectors it will consume, and if it is a new item pin it as
199 * well.
200 */
201STATIC void
202xfs_cil_prepare_item(
203 struct xlog *log,
204 struct xfs_log_vec *lv,
205 int *len,
206 int *diff_iovecs)
207{
208 struct xfs_log_vec *old = lv->lv_item->li_lv;
209 255
210 if (old) { 256 /* Ensure the lv is set up according to ->iop_size */
211 /* existing lv on log item, space used is a delta */ 257 lv->lv_niovecs = niovecs;
212 ASSERT((old->lv_buf && old->lv_buf_len && old->lv_niovecs) || 258 lv->lv_buf = (char *)lv + buf_size - nbytes;
213 old->lv_buf_len == XFS_LOG_VEC_ORDERED);
214 259
215 /* 260 lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
216 * If the new item is ordered, keep the old one that is already 261 goto insert;
217 * tracking dirty or ordered regions
218 */
219 if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
220 ASSERT(!lv->lv_buf);
221 kmem_free(lv);
222 return;
223 } 262 }
224 263
225 *len += lv->lv_buf_len - old->lv_buf_len; 264 /* allocate new data chunk */
226 *diff_iovecs += lv->lv_niovecs - old->lv_niovecs; 265 lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
227 kmem_free(old->lv_buf); 266 lv->lv_item = lip;
228 kmem_free(old); 267 lv->lv_size = buf_size;
229 } else { 268 lv->lv_niovecs = niovecs;
230 /* new lv, must pin the log item */ 269 if (ordered) {
231 ASSERT(!lv->lv_item->li_lv); 270 /* track as an ordered logvec */
232 271 ASSERT(lip->li_lv == NULL);
233 if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { 272 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
234 *len += lv->lv_buf_len; 273 goto insert;
235 *diff_iovecs += lv->lv_niovecs;
236 } 274 }
237 IOP_PIN(lv->lv_item);
238 275
239 } 276 /* The allocated iovec region lies beyond the log vector. */
277 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
240 278
241 /* attach new log vector to log item */ 279 /* The allocated data region lies beyond the iovec region */
242 lv->lv_item->li_lv = lv; 280 lv->lv_buf = (char *)lv + buf_size - nbytes;
243 281
244 /* 282 lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
245 * If this is the first time the item is being committed to the 283insert:
246 * CIL, store the sequence number on the log item so we can 284 ASSERT(lv->lv_buf_len <= nbytes);
247 * tell in future commits whether this is the first checkpoint 285 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
248 * the item is being committed into. 286 }
249 */
250 if (!lv->lv_item->li_seq)
251 lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
252} 287}
253 288
254/* 289/*
@@ -261,53 +296,47 @@ xfs_cil_prepare_item(
261static void 296static void
262xlog_cil_insert_items( 297xlog_cil_insert_items(
263 struct xlog *log, 298 struct xlog *log,
264 struct xfs_log_vec *log_vector, 299 struct xfs_trans *tp)
265 struct xlog_ticket *ticket)
266{ 300{
267 struct xfs_cil *cil = log->l_cilp; 301 struct xfs_cil *cil = log->l_cilp;
268 struct xfs_cil_ctx *ctx = cil->xc_ctx; 302 struct xfs_cil_ctx *ctx = cil->xc_ctx;
269 struct xfs_log_vec *lv; 303 struct xfs_log_item_desc *lidp;
270 int len = 0; 304 int len = 0;
271 int diff_iovecs = 0; 305 int diff_iovecs = 0;
272 int iclog_space; 306 int iclog_space;
273 307
274 ASSERT(log_vector); 308 ASSERT(tp);
275 309
276 /* 310 /*
277 * Do all the accounting aggregation and switching of log vectors
278 * around in a separate loop to the insertion of items into the CIL.
279 * Then we can do a separate loop to update the CIL within a single
280 * lock/unlock pair. This reduces the number of round trips on the CIL
281 * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
282 * hold time for the transaction commit.
283 *
284 * If this is the first time the item is being placed into the CIL in
285 * this context, pin it so it can't be written to disk until the CIL is
286 * flushed to the iclog and the iclog written to disk.
287 *
288 * We can do this safely because the context can't checkpoint until we 311 * We can do this safely because the context can't checkpoint until we
289 * are done so it doesn't matter exactly how we update the CIL. 312 * are done so it doesn't matter exactly how we update the CIL.
290 */ 313 */
314 xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
315
316 /*
317 * Now (re-)position everything modified at the tail of the CIL.
318 * We do this here so we only need to take the CIL lock once during
319 * the transaction commit.
320 */
291 spin_lock(&cil->xc_cil_lock); 321 spin_lock(&cil->xc_cil_lock);
292 for (lv = log_vector; lv; ) { 322 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
293 struct xfs_log_vec *next = lv->lv_next; 323 struct xfs_log_item *lip = lidp->lid_item;
294 324
295 ASSERT(lv->lv_item->li_lv || list_empty(&lv->lv_item->li_cil)); 325 /* Skip items which aren't dirty in this transaction. */
296 lv->lv_next = NULL; 326 if (!(lidp->lid_flags & XFS_LID_DIRTY))
327 continue;
297 328
298 /* 329 list_move_tail(&lip->li_cil, &cil->xc_cil);
299 * xfs_cil_prepare_item() may free the lv, so move the item on
300 * the CIL first.
301 */
302 list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);
303 xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);
304 lv = next;
305 } 330 }
306 331
307 /* account for space used by new iovec headers */ 332 /* account for space used by new iovec headers */
308 len += diff_iovecs * sizeof(xlog_op_header_t); 333 len += diff_iovecs * sizeof(xlog_op_header_t);
309 ctx->nvecs += diff_iovecs; 334 ctx->nvecs += diff_iovecs;
310 335
336 /* attach the transaction to the CIL if it has any busy extents */
337 if (!list_empty(&tp->t_busy))
338 list_splice_init(&tp->t_busy, &ctx->busy_extents);
339
311 /* 340 /*
312 * Now transfer enough transaction reservation to the context ticket 341 * Now transfer enough transaction reservation to the context ticket
313 * for the checkpoint. The context ticket is special - the unit 342 * for the checkpoint. The context ticket is special - the unit
@@ -316,10 +345,8 @@ xlog_cil_insert_items(
316 * during the transaction commit. 345 * during the transaction commit.
317 */ 346 */
318 if (ctx->ticket->t_curr_res == 0) { 347 if (ctx->ticket->t_curr_res == 0) {
319 /* first commit in checkpoint, steal the header reservation */
320 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
321 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; 348 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
322 ticket->t_curr_res -= ctx->ticket->t_unit_res; 349 tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res;
323 } 350 }
324 351
325 /* do we need space for more log record headers? */ 352 /* do we need space for more log record headers? */
@@ -333,10 +360,10 @@ xlog_cil_insert_items(
333 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); 360 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
334 ctx->ticket->t_unit_res += hdrs; 361 ctx->ticket->t_unit_res += hdrs;
335 ctx->ticket->t_curr_res += hdrs; 362 ctx->ticket->t_curr_res += hdrs;
336 ticket->t_curr_res -= hdrs; 363 tp->t_ticket->t_curr_res -= hdrs;
337 ASSERT(ticket->t_curr_res >= len); 364 ASSERT(tp->t_ticket->t_curr_res >= len);
338 } 365 }
339 ticket->t_curr_res -= len; 366 tp->t_ticket->t_curr_res -= len;
340 ctx->space_used += len; 367 ctx->space_used += len;
341 368
342 spin_unlock(&cil->xc_cil_lock); 369 spin_unlock(&cil->xc_cil_lock);
@@ -350,7 +377,6 @@ xlog_cil_free_logvec(
350 377
351 for (lv = log_vector; lv; ) { 378 for (lv = log_vector; lv; ) {
352 struct xfs_log_vec *next = lv->lv_next; 379 struct xfs_log_vec *next = lv->lv_next;
353 kmem_free(lv->lv_buf);
354 kmem_free(lv); 380 kmem_free(lv);
355 lv = next; 381 lv = next;
356 } 382 }
@@ -376,9 +402,9 @@ xlog_cil_committed(
376 xfs_extent_busy_clear(mp, &ctx->busy_extents, 402 xfs_extent_busy_clear(mp, &ctx->busy_extents,
377 (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); 403 (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
378 404
379 spin_lock(&ctx->cil->xc_cil_lock); 405 spin_lock(&ctx->cil->xc_push_lock);
380 list_del(&ctx->committing); 406 list_del(&ctx->committing);
381 spin_unlock(&ctx->cil->xc_cil_lock); 407 spin_unlock(&ctx->cil->xc_push_lock);
382 408
383 xlog_cil_free_logvec(ctx->lv_chain); 409 xlog_cil_free_logvec(ctx->lv_chain);
384 410
@@ -433,7 +459,7 @@ xlog_cil_push(
433 down_write(&cil->xc_ctx_lock); 459 down_write(&cil->xc_ctx_lock);
434 ctx = cil->xc_ctx; 460 ctx = cil->xc_ctx;
435 461
436 spin_lock(&cil->xc_cil_lock); 462 spin_lock(&cil->xc_push_lock);
437 push_seq = cil->xc_push_seq; 463 push_seq = cil->xc_push_seq;
438 ASSERT(push_seq <= ctx->sequence); 464 ASSERT(push_seq <= ctx->sequence);
439 465
@@ -444,10 +470,10 @@ xlog_cil_push(
444 */ 470 */
445 if (list_empty(&cil->xc_cil)) { 471 if (list_empty(&cil->xc_cil)) {
446 cil->xc_push_seq = 0; 472 cil->xc_push_seq = 0;
447 spin_unlock(&cil->xc_cil_lock); 473 spin_unlock(&cil->xc_push_lock);
448 goto out_skip; 474 goto out_skip;
449 } 475 }
450 spin_unlock(&cil->xc_cil_lock); 476 spin_unlock(&cil->xc_push_lock);
451 477
452 478
453 /* check for a previously pushed seqeunce */ 479 /* check for a previously pushed seqeunce */
@@ -515,9 +541,9 @@ xlog_cil_push(
515 * that higher sequences will wait for us to write out a commit record 541 * that higher sequences will wait for us to write out a commit record
516 * before they do. 542 * before they do.
517 */ 543 */
518 spin_lock(&cil->xc_cil_lock); 544 spin_lock(&cil->xc_push_lock);
519 list_add(&ctx->committing, &cil->xc_committing); 545 list_add(&ctx->committing, &cil->xc_committing);
520 spin_unlock(&cil->xc_cil_lock); 546 spin_unlock(&cil->xc_push_lock);
521 up_write(&cil->xc_ctx_lock); 547 up_write(&cil->xc_ctx_lock);
522 548
523 /* 549 /*
@@ -552,7 +578,7 @@ xlog_cil_push(
552 * order the commit records so replay will get them in the right order. 578 * order the commit records so replay will get them in the right order.
553 */ 579 */
554restart: 580restart:
555 spin_lock(&cil->xc_cil_lock); 581 spin_lock(&cil->xc_push_lock);
556 list_for_each_entry(new_ctx, &cil->xc_committing, committing) { 582 list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
557 /* 583 /*
558 * Higher sequences will wait for this one so skip them. 584 * Higher sequences will wait for this one so skip them.
@@ -565,11 +591,11 @@ restart:
565 * It is still being pushed! Wait for the push to 591 * It is still being pushed! Wait for the push to
566 * complete, then start again from the beginning. 592 * complete, then start again from the beginning.
567 */ 593 */
568 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); 594 xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
569 goto restart; 595 goto restart;
570 } 596 }
571 } 597 }
572 spin_unlock(&cil->xc_cil_lock); 598 spin_unlock(&cil->xc_push_lock);
573 599
574 /* xfs_log_done always frees the ticket on error. */ 600 /* xfs_log_done always frees the ticket on error. */
575 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); 601 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
@@ -588,10 +614,10 @@ restart:
588 * callbacks to the iclog we can assign the commit LSN to the context 614 * callbacks to the iclog we can assign the commit LSN to the context
589 * and wake up anyone who is waiting for the commit to complete. 615 * and wake up anyone who is waiting for the commit to complete.
590 */ 616 */
591 spin_lock(&cil->xc_cil_lock); 617 spin_lock(&cil->xc_push_lock);
592 ctx->commit_lsn = commit_lsn; 618 ctx->commit_lsn = commit_lsn;
593 wake_up_all(&cil->xc_commit_wait); 619 wake_up_all(&cil->xc_commit_wait);
594 spin_unlock(&cil->xc_cil_lock); 620 spin_unlock(&cil->xc_push_lock);
595 621
596 /* release the hounds! */ 622 /* release the hounds! */
597 return xfs_log_release_iclog(log->l_mp, commit_iclog); 623 return xfs_log_release_iclog(log->l_mp, commit_iclog);
@@ -644,12 +670,12 @@ xlog_cil_push_background(
644 if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) 670 if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
645 return; 671 return;
646 672
647 spin_lock(&cil->xc_cil_lock); 673 spin_lock(&cil->xc_push_lock);
648 if (cil->xc_push_seq < cil->xc_current_sequence) { 674 if (cil->xc_push_seq < cil->xc_current_sequence) {
649 cil->xc_push_seq = cil->xc_current_sequence; 675 cil->xc_push_seq = cil->xc_current_sequence;
650 queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); 676 queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
651 } 677 }
652 spin_unlock(&cil->xc_cil_lock); 678 spin_unlock(&cil->xc_push_lock);
653 679
654} 680}
655 681
@@ -672,14 +698,14 @@ xlog_cil_push_foreground(
672 * If the CIL is empty or we've already pushed the sequence then 698 * If the CIL is empty or we've already pushed the sequence then
673 * there's no work we need to do. 699 * there's no work we need to do.
674 */ 700 */
675 spin_lock(&cil->xc_cil_lock); 701 spin_lock(&cil->xc_push_lock);
676 if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { 702 if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
677 spin_unlock(&cil->xc_cil_lock); 703 spin_unlock(&cil->xc_push_lock);
678 return; 704 return;
679 } 705 }
680 706
681 cil->xc_push_seq = push_seq; 707 cil->xc_push_seq = push_seq;
682 spin_unlock(&cil->xc_cil_lock); 708 spin_unlock(&cil->xc_push_lock);
683 709
684 /* do the push now */ 710 /* do the push now */
685 xlog_cil_push(log); 711 xlog_cil_push(log);
@@ -706,43 +732,25 @@ xfs_log_commit_cil(
706 int flags) 732 int flags)
707{ 733{
708 struct xlog *log = mp->m_log; 734 struct xlog *log = mp->m_log;
735 struct xfs_cil *cil = log->l_cilp;
709 int log_flags = 0; 736 int log_flags = 0;
710 struct xfs_log_vec *log_vector;
711 737
712 if (flags & XFS_TRANS_RELEASE_LOG_RES) 738 if (flags & XFS_TRANS_RELEASE_LOG_RES)
713 log_flags = XFS_LOG_REL_PERM_RESERV; 739 log_flags = XFS_LOG_REL_PERM_RESERV;
714 740
715 /*
716 * Do all the hard work of formatting items (including memory
717 * allocation) outside the CIL context lock. This prevents stalling CIL
718 * pushes when we are low on memory and a transaction commit spends a
719 * lot of time in memory reclaim.
720 */
721 log_vector = xlog_cil_prepare_log_vecs(tp);
722 if (!log_vector)
723 return ENOMEM;
724
725 /* lock out background commit */ 741 /* lock out background commit */
726 down_read(&log->l_cilp->xc_ctx_lock); 742 down_read(&cil->xc_ctx_lock);
727 if (commit_lsn)
728 *commit_lsn = log->l_cilp->xc_ctx->sequence;
729 743
730 /* xlog_cil_insert_items() destroys log_vector list */ 744 xlog_cil_insert_items(log, tp);
731 xlog_cil_insert_items(log, log_vector, tp->t_ticket);
732 745
733 /* check we didn't blow the reservation */ 746 /* check we didn't blow the reservation */
734 if (tp->t_ticket->t_curr_res < 0) 747 if (tp->t_ticket->t_curr_res < 0)
735 xlog_print_tic_res(log->l_mp, tp->t_ticket); 748 xlog_print_tic_res(mp, tp->t_ticket);
736 749
737 /* attach the transaction to the CIL if it has any busy extents */ 750 tp->t_commit_lsn = cil->xc_ctx->sequence;
738 if (!list_empty(&tp->t_busy)) { 751 if (commit_lsn)
739 spin_lock(&log->l_cilp->xc_cil_lock); 752 *commit_lsn = tp->t_commit_lsn;
740 list_splice_init(&tp->t_busy,
741 &log->l_cilp->xc_ctx->busy_extents);
742 spin_unlock(&log->l_cilp->xc_cil_lock);
743 }
744 753
745 tp->t_commit_lsn = *commit_lsn;
746 xfs_log_done(mp, tp->t_ticket, NULL, log_flags); 754 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
747 xfs_trans_unreserve_and_mod_sb(tp); 755 xfs_trans_unreserve_and_mod_sb(tp);
748 756
@@ -757,11 +765,11 @@ xfs_log_commit_cil(
757 * the log items. This affects (at least) processing of stale buffers, 765 * the log items. This affects (at least) processing of stale buffers,
758 * inodes and EFIs. 766 * inodes and EFIs.
759 */ 767 */
760 xfs_trans_free_items(tp, *commit_lsn, 0); 768 xfs_trans_free_items(tp, tp->t_commit_lsn, 0);
761 769
762 xlog_cil_push_background(log); 770 xlog_cil_push_background(log);
763 771
764 up_read(&log->l_cilp->xc_ctx_lock); 772 up_read(&cil->xc_ctx_lock);
765 return 0; 773 return 0;
766} 774}
767 775
@@ -800,7 +808,7 @@ xlog_cil_force_lsn(
800 * on commits for those as well. 808 * on commits for those as well.
801 */ 809 */
802restart: 810restart:
803 spin_lock(&cil->xc_cil_lock); 811 spin_lock(&cil->xc_push_lock);
804 list_for_each_entry(ctx, &cil->xc_committing, committing) { 812 list_for_each_entry(ctx, &cil->xc_committing, committing) {
805 if (ctx->sequence > sequence) 813 if (ctx->sequence > sequence)
806 continue; 814 continue;
@@ -809,7 +817,7 @@ restart:
809 * It is still being pushed! Wait for the push to 817 * It is still being pushed! Wait for the push to
810 * complete, then start again from the beginning. 818 * complete, then start again from the beginning.
811 */ 819 */
812 xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock); 820 xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock);
813 goto restart; 821 goto restart;
814 } 822 }
815 if (ctx->sequence != sequence) 823 if (ctx->sequence != sequence)
@@ -817,7 +825,7 @@ restart:
817 /* found it! */ 825 /* found it! */
818 commit_lsn = ctx->commit_lsn; 826 commit_lsn = ctx->commit_lsn;
819 } 827 }
820 spin_unlock(&cil->xc_cil_lock); 828 spin_unlock(&cil->xc_push_lock);
821 return commit_lsn; 829 return commit_lsn;
822} 830}
823 831
@@ -875,6 +883,7 @@ xlog_cil_init(
875 INIT_LIST_HEAD(&cil->xc_cil); 883 INIT_LIST_HEAD(&cil->xc_cil);
876 INIT_LIST_HEAD(&cil->xc_committing); 884 INIT_LIST_HEAD(&cil->xc_committing);
877 spin_lock_init(&cil->xc_cil_lock); 885 spin_lock_init(&cil->xc_cil_lock);
886 spin_lock_init(&cil->xc_push_lock);
878 init_rwsem(&cil->xc_ctx_lock); 887 init_rwsem(&cil->xc_ctx_lock);
879 init_waitqueue_head(&cil->xc_commit_wait); 888 init_waitqueue_head(&cil->xc_commit_wait);
880 889
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h
new file mode 100644
index 000000000000..31e3a06c4644
--- /dev/null
+++ b/fs/xfs/xfs_log_format.h
@@ -0,0 +1,852 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_LOG_FORMAT_H__
19#define __XFS_LOG_FORMAT_H__
20
21struct xfs_mount;
22struct xfs_trans_res;
23
24/*
25 * On-disk Log Format definitions.
26 *
27 * This file contains all the on-disk format definitions used within the log. It
28 * includes the physical log structure itself, as well as all the log item
29 * format structures that are written into the log and intepreted by log
30 * recovery. We start with the physical log format definitions, and then work
31 * through all the log items definitions and everything they encode into the
32 * log.
33 */
34typedef __uint32_t xlog_tid_t;
35
36#define XLOG_MIN_ICLOGS 2
37#define XLOG_MAX_ICLOGS 8
38#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */
39#define XLOG_VERSION_1 1
40#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */
41#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2)
42#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */
43#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */
44#define XLOG_MAX_RECORD_BSIZE (256*1024)
45#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */
46#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
47#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
48#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
49#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
50 (log)->l_mp->m_sb.sb_logsunit)
51#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
52
53#define XLOG_HEADER_SIZE 512
54
55/* Minimum number of transactions that must fit in the log (defined by mkfs) */
56#define XFS_MIN_LOG_FACTOR 3
57
58#define XLOG_REC_SHIFT(log) \
59 BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
60 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
61#define XLOG_TOTAL_REC_SHIFT(log) \
62 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
63 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
64
65/* get lsn fields */
66#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
67#define BLOCK_LSN(lsn) ((uint)(lsn))
68
69/* this is used in a spot where we might otherwise double-endian-flip */
70#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0])
71
72static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
73{
74 return ((xfs_lsn_t)cycle << 32) | block;
75}
76
77static inline uint xlog_get_cycle(char *ptr)
78{
79 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
80 return be32_to_cpu(*((__be32 *)ptr + 1));
81 else
82 return be32_to_cpu(*(__be32 *)ptr);
83}
84
85/* Log Clients */
86#define XFS_TRANSACTION 0x69
87#define XFS_VOLUME 0x2
88#define XFS_LOG 0xaa
89
90#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
91
92/* Region types for iovec's i_type */
93#define XLOG_REG_TYPE_BFORMAT 1
94#define XLOG_REG_TYPE_BCHUNK 2
95#define XLOG_REG_TYPE_EFI_FORMAT 3
96#define XLOG_REG_TYPE_EFD_FORMAT 4
97#define XLOG_REG_TYPE_IFORMAT 5
98#define XLOG_REG_TYPE_ICORE 6
99#define XLOG_REG_TYPE_IEXT 7
100#define XLOG_REG_TYPE_IBROOT 8
101#define XLOG_REG_TYPE_ILOCAL 9
102#define XLOG_REG_TYPE_IATTR_EXT 10
103#define XLOG_REG_TYPE_IATTR_BROOT 11
104#define XLOG_REG_TYPE_IATTR_LOCAL 12
105#define XLOG_REG_TYPE_QFORMAT 13
106#define XLOG_REG_TYPE_DQUOT 14
107#define XLOG_REG_TYPE_QUOTAOFF 15
108#define XLOG_REG_TYPE_LRHEADER 16
109#define XLOG_REG_TYPE_UNMOUNT 17
110#define XLOG_REG_TYPE_COMMIT 18
111#define XLOG_REG_TYPE_TRANSHDR 19
112#define XLOG_REG_TYPE_ICREATE 20
113#define XLOG_REG_TYPE_MAX 20
114
115/*
116 * Flags to log operation header
117 *
118 * The first write of a new transaction will be preceded with a start
119 * record, XLOG_START_TRANS. Once a transaction is committed, a commit
120 * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into
121 * the remainder of the current active in-core log, it is split up into
122 * multiple regions. Each partial region will be marked with a
123 * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
124 *
125 */
126#define XLOG_START_TRANS 0x01 /* Start a new transaction */
127#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */
128#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */
129#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */
130#define XLOG_END_TRANS 0x10 /* End a continued transaction */
131#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
132
133
134typedef struct xlog_op_header {
135 __be32 oh_tid; /* transaction id of operation : 4 b */
136 __be32 oh_len; /* bytes in data region : 4 b */
137 __u8 oh_clientid; /* who sent me this : 1 b */
138 __u8 oh_flags; /* : 1 b */
139 __u16 oh_res2; /* 32 bit align : 2 b */
140} xlog_op_header_t;
141
142/* valid values for h_fmt */
143#define XLOG_FMT_UNKNOWN 0
144#define XLOG_FMT_LINUX_LE 1
145#define XLOG_FMT_LINUX_BE 2
146#define XLOG_FMT_IRIX_BE 3
147
148/* our fmt */
149#ifdef XFS_NATIVE_HOST
150#define XLOG_FMT XLOG_FMT_LINUX_BE
151#else
152#define XLOG_FMT XLOG_FMT_LINUX_LE
153#endif
154
155typedef struct xlog_rec_header {
156 __be32 h_magicno; /* log record (LR) identifier : 4 */
157 __be32 h_cycle; /* write cycle of log : 4 */
158 __be32 h_version; /* LR version : 4 */
159 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
160 __be64 h_lsn; /* lsn of this LR : 8 */
161 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
162 __le32 h_crc; /* crc of log record : 4 */
163 __be32 h_prev_block; /* block number to previous LR : 4 */
164 __be32 h_num_logops; /* number of log operations in this LR : 4 */
165 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
166 /* new fields */
167 __be32 h_fmt; /* format of log record : 4 */
168 uuid_t h_fs_uuid; /* uuid of FS : 16 */
169 __be32 h_size; /* iclog size : 4 */
170} xlog_rec_header_t;
171
172typedef struct xlog_rec_ext_header {
173 __be32 xh_cycle; /* write cycle of log : 4 */
174 __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
175} xlog_rec_ext_header_t;
176
177/*
178 * Quite misnamed, because this union lays out the actual on-disk log buffer.
179 */
180typedef union xlog_in_core2 {
181 xlog_rec_header_t hic_header;
182 xlog_rec_ext_header_t hic_xheader;
183 char hic_sector[XLOG_HEADER_SIZE];
184} xlog_in_core_2_t;
185
186/* not an on-disk structure, but needed by log recovery in userspace */
187typedef struct xfs_log_iovec {
188 void *i_addr; /* beginning address of region */
189 int i_len; /* length in bytes of region */
190 uint i_type; /* type of region */
191} xfs_log_iovec_t;
192
193
194/*
195 * Transaction Header definitions.
196 *
197 * This is the structure written in the log at the head of every transaction. It
198 * identifies the type and id of the transaction, and contains the number of
199 * items logged by the transaction so we know how many to expect during
200 * recovery.
201 *
202 * Do not change the below structure without redoing the code in
203 * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
204 */
205typedef struct xfs_trans_header {
206 uint th_magic; /* magic number */
207 uint th_type; /* transaction type */
208 __int32_t th_tid; /* transaction id (unused) */
209 uint th_num_items; /* num items logged by trans */
210} xfs_trans_header_t;
211
212#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
213
214/*
215 * Log item types.
216 */
217#define XFS_LI_EFI 0x1236
218#define XFS_LI_EFD 0x1237
219#define XFS_LI_IUNLINK 0x1238
220#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */
221#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
222#define XFS_LI_DQUOT 0x123d
223#define XFS_LI_QUOTAOFF 0x123e
224#define XFS_LI_ICREATE 0x123f
225
226#define XFS_LI_TYPE_DESC \
227 { XFS_LI_EFI, "XFS_LI_EFI" }, \
228 { XFS_LI_EFD, "XFS_LI_EFD" }, \
229 { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
230 { XFS_LI_INODE, "XFS_LI_INODE" }, \
231 { XFS_LI_BUF, "XFS_LI_BUF" }, \
232 { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
233 { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \
234 { XFS_LI_ICREATE, "XFS_LI_ICREATE" }
235
236/*
237 * Transaction types. Used to distinguish types of buffers.
238 */
239#define XFS_TRANS_SETATTR_NOT_SIZE 1
240#define XFS_TRANS_SETATTR_SIZE 2
241#define XFS_TRANS_INACTIVE 3
242#define XFS_TRANS_CREATE 4
243#define XFS_TRANS_CREATE_TRUNC 5
244#define XFS_TRANS_TRUNCATE_FILE 6
245#define XFS_TRANS_REMOVE 7
246#define XFS_TRANS_LINK 8
247#define XFS_TRANS_RENAME 9
248#define XFS_TRANS_MKDIR 10
249#define XFS_TRANS_RMDIR 11
250#define XFS_TRANS_SYMLINK 12
251#define XFS_TRANS_SET_DMATTRS 13
252#define XFS_TRANS_GROWFS 14
253#define XFS_TRANS_STRAT_WRITE 15
254#define XFS_TRANS_DIOSTRAT 16
255/* 17 was XFS_TRANS_WRITE_SYNC */
256#define XFS_TRANS_WRITEID 18
257#define XFS_TRANS_ADDAFORK 19
258#define XFS_TRANS_ATTRINVAL 20
259#define XFS_TRANS_ATRUNCATE 21
260#define XFS_TRANS_ATTR_SET 22
261#define XFS_TRANS_ATTR_RM 23
262#define XFS_TRANS_ATTR_FLAG 24
263#define XFS_TRANS_CLEAR_AGI_BUCKET 25
264#define XFS_TRANS_QM_SBCHANGE 26
265/*
266 * Dummy entries since we use the transaction type to index into the
267 * trans_type[] in xlog_recover_print_trans_head()
268 */
269#define XFS_TRANS_DUMMY1 27
270#define XFS_TRANS_DUMMY2 28
271#define XFS_TRANS_QM_QUOTAOFF 29
272#define XFS_TRANS_QM_DQALLOC 30
273#define XFS_TRANS_QM_SETQLIM 31
274#define XFS_TRANS_QM_DQCLUSTER 32
275#define XFS_TRANS_QM_QINOCREATE 33
276#define XFS_TRANS_QM_QUOTAOFF_END 34
277#define XFS_TRANS_SB_UNIT 35
278#define XFS_TRANS_FSYNC_TS 36
279#define XFS_TRANS_GROWFSRT_ALLOC 37
280#define XFS_TRANS_GROWFSRT_ZERO 38
281#define XFS_TRANS_GROWFSRT_FREE 39
282#define XFS_TRANS_SWAPEXT 40
283#define XFS_TRANS_SB_COUNT 41
284#define XFS_TRANS_CHECKPOINT 42
285#define XFS_TRANS_ICREATE 43
286#define XFS_TRANS_TYPE_MAX 43
287/* new transaction types need to be reflected in xfs_logprint(8) */
288
289#define XFS_TRANS_TYPES \
290 { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \
291 { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
292 { XFS_TRANS_INACTIVE, "INACTIVE" }, \
293 { XFS_TRANS_CREATE, "CREATE" }, \
294 { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
295 { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
296 { XFS_TRANS_REMOVE, "REMOVE" }, \
297 { XFS_TRANS_LINK, "LINK" }, \
298 { XFS_TRANS_RENAME, "RENAME" }, \
299 { XFS_TRANS_MKDIR, "MKDIR" }, \
300 { XFS_TRANS_RMDIR, "RMDIR" }, \
301 { XFS_TRANS_SYMLINK, "SYMLINK" }, \
302 { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \
303 { XFS_TRANS_GROWFS, "GROWFS" }, \
304 { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \
305 { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \
306 { XFS_TRANS_WRITEID, "WRITEID" }, \
307 { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \
308 { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \
309 { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \
310 { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \
311 { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \
312 { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \
313 { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \
314 { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \
315 { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \
316 { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \
317 { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \
318 { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \
319 { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \
320 { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \
321 { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \
322 { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \
323 { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \
324 { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \
325 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
326 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
327 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
328 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
329 { XFS_TRANS_DUMMY1, "DUMMY1" }, \
330 { XFS_TRANS_DUMMY2, "DUMMY2" }, \
331 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
332
333/*
334 * This structure is used to track log items associated with
335 * a transaction. It points to the log item and keeps some
336 * flags to track the state of the log item. It also tracks
337 * the amount of space needed to log the item it describes
338 * once we get to commit processing (see xfs_trans_commit()).
339 */
340struct xfs_log_item_desc {
341 struct xfs_log_item *lid_item;
342 struct list_head lid_trans;
343 unsigned char lid_flags;
344};
345
346#define XFS_LID_DIRTY 0x1
347
348/*
349 * Values for t_flags.
350 */
351#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
352#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
353#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
354#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
355#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
356#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
357#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
358 count in superblock */
359
360/*
361 * Values for call flags parameter.
362 */
363#define XFS_TRANS_RELEASE_LOG_RES 0x4
364#define XFS_TRANS_ABORT 0x8
365
366/*
367 * Field values for xfs_trans_mod_sb.
368 */
369#define XFS_TRANS_SB_ICOUNT 0x00000001
370#define XFS_TRANS_SB_IFREE 0x00000002
371#define XFS_TRANS_SB_FDBLOCKS 0x00000004
372#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008
373#define XFS_TRANS_SB_FREXTENTS 0x00000010
374#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020
375#define XFS_TRANS_SB_DBLOCKS 0x00000040
376#define XFS_TRANS_SB_AGCOUNT 0x00000080
377#define XFS_TRANS_SB_IMAXPCT 0x00000100
378#define XFS_TRANS_SB_REXTSIZE 0x00000200
379#define XFS_TRANS_SB_RBMBLOCKS 0x00000400
380#define XFS_TRANS_SB_RBLOCKS 0x00000800
381#define XFS_TRANS_SB_REXTENTS 0x00001000
382#define XFS_TRANS_SB_REXTSLOG 0x00002000
383
384/*
385 * Here we centralize the specification of XFS meta-data buffer
386 * reference count values. This determine how hard the buffer
387 * cache tries to hold onto the buffer.
388 */
389#define XFS_AGF_REF 4
390#define XFS_AGI_REF 4
391#define XFS_AGFL_REF 3
392#define XFS_INO_BTREE_REF 3
393#define XFS_ALLOC_BTREE_REF 2
394#define XFS_BMAP_BTREE_REF 2
395#define XFS_DIR_BTREE_REF 2
396#define XFS_INO_REF 2
397#define XFS_ATTR_BTREE_REF 1
398#define XFS_DQUOT_REF 1
399
400/*
401 * Flags for xfs_trans_ichgtime().
402 */
403#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
404#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
405#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
406
407
408/*
409 * Inode Log Item Format definitions.
410 *
411 * This is the structure used to lay out an inode log item in the
412 * log. The size of the inline data/extents/b-tree root to be logged
413 * (if any) is indicated in the ilf_dsize field. Changes to this structure
414 * must be added on to the end.
415 */
416typedef struct xfs_inode_log_format {
417 __uint16_t ilf_type; /* inode log item type */
418 __uint16_t ilf_size; /* size of this item */
419 __uint32_t ilf_fields; /* flags for fields logged */
420 __uint16_t ilf_asize; /* size of attr d/ext/root */
421 __uint16_t ilf_dsize; /* size of data/ext/root */
422 __uint64_t ilf_ino; /* inode number */
423 union {
424 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
425 uuid_t ilfu_uuid; /* mount point value */
426 } ilf_u;
427 __int64_t ilf_blkno; /* blkno of inode buffer */
428 __int32_t ilf_len; /* len of inode buffer */
429 __int32_t ilf_boffset; /* off of inode in buffer */
430} xfs_inode_log_format_t;
431
432typedef struct xfs_inode_log_format_32 {
433 __uint16_t ilf_type; /* inode log item type */
434 __uint16_t ilf_size; /* size of this item */
435 __uint32_t ilf_fields; /* flags for fields logged */
436 __uint16_t ilf_asize; /* size of attr d/ext/root */
437 __uint16_t ilf_dsize; /* size of data/ext/root */
438 __uint64_t ilf_ino; /* inode number */
439 union {
440 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
441 uuid_t ilfu_uuid; /* mount point value */
442 } ilf_u;
443 __int64_t ilf_blkno; /* blkno of inode buffer */
444 __int32_t ilf_len; /* len of inode buffer */
445 __int32_t ilf_boffset; /* off of inode in buffer */
446} __attribute__((packed)) xfs_inode_log_format_32_t;
447
448typedef struct xfs_inode_log_format_64 {
449 __uint16_t ilf_type; /* inode log item type */
450 __uint16_t ilf_size; /* size of this item */
451 __uint32_t ilf_fields; /* flags for fields logged */
452 __uint16_t ilf_asize; /* size of attr d/ext/root */
453 __uint16_t ilf_dsize; /* size of data/ext/root */
454 __uint32_t ilf_pad; /* pad for 64 bit boundary */
455 __uint64_t ilf_ino; /* inode number */
456 union {
457 __uint32_t ilfu_rdev; /* rdev value for dev inode*/
458 uuid_t ilfu_uuid; /* mount point value */
459 } ilf_u;
460 __int64_t ilf_blkno; /* blkno of inode buffer */
461 __int32_t ilf_len; /* len of inode buffer */
462 __int32_t ilf_boffset; /* off of inode in buffer */
463} xfs_inode_log_format_64_t;
464
465/*
466 * Flags for xfs_trans_log_inode flags field.
467 */
468#define XFS_ILOG_CORE 0x001 /* log standard inode fields */
469#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */
470#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */
471#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */
472#define XFS_ILOG_DEV 0x010 /* log the dev field */
473#define XFS_ILOG_UUID 0x020 /* log the uuid field */
474#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */
475#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */
476#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */
477
478
479/*
480 * The timestamps are dirty, but not necessarily anything else in the inode
481 * core. Unlike the other fields above this one must never make it to disk
482 * in the ilf_fields of the inode_log_format, but is purely store in-memory in
483 * ili_fields in the inode_log_item.
484 */
485#define XFS_ILOG_TIMESTAMP 0x4000
486
487#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
488 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
489 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
490 XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
491
492#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
493 XFS_ILOG_DBROOT)
494
495#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
496 XFS_ILOG_ABROOT)
497
498#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \
499 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
500 XFS_ILOG_DEV | XFS_ILOG_UUID | \
501 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
502 XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
503
504static inline int xfs_ilog_fbroot(int w)
505{
506 return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
507}
508
509static inline int xfs_ilog_fext(int w)
510{
511 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
512}
513
514static inline int xfs_ilog_fdata(int w)
515{
516 return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
517}
518
519/*
520 * Incore version of the on-disk inode core structures. We log this directly
521 * into the journal in host CPU format (for better or worse) and as such
522 * directly mirrors the xfs_dinode structure as it must contain all the same
523 * information.
524 */
525typedef struct xfs_ictimestamp {
526 __int32_t t_sec; /* timestamp seconds */
527 __int32_t t_nsec; /* timestamp nanoseconds */
528} xfs_ictimestamp_t;
529
530/*
531 * NOTE: This structure must be kept identical to struct xfs_dinode
532 * in xfs_dinode.h except for the endianness annotations.
533 */
534typedef struct xfs_icdinode {
535 __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */
536 __uint16_t di_mode; /* mode and type of file */
537 __int8_t di_version; /* inode version */
538 __int8_t di_format; /* format of di_c data */
539 __uint16_t di_onlink; /* old number of links to file */
540 __uint32_t di_uid; /* owner's user id */
541 __uint32_t di_gid; /* owner's group id */
542 __uint32_t di_nlink; /* number of links to file */
543 __uint16_t di_projid_lo; /* lower part of owner's project id */
544 __uint16_t di_projid_hi; /* higher part of owner's project id */
545 __uint8_t di_pad[6]; /* unused, zeroed space */
546 __uint16_t di_flushiter; /* incremented on flush */
547 xfs_ictimestamp_t di_atime; /* time last accessed */
548 xfs_ictimestamp_t di_mtime; /* time last modified */
549 xfs_ictimestamp_t di_ctime; /* time created/inode modified */
550 xfs_fsize_t di_size; /* number of bytes in file */
551 xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */
552 xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
553 xfs_extnum_t di_nextents; /* number of extents in data fork */
554 xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
555 __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
556 __int8_t di_aformat; /* format of attr fork's data */
557 __uint32_t di_dmevmask; /* DMIG event mask */
558 __uint16_t di_dmstate; /* DMIG state info */
559 __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */
560 __uint32_t di_gen; /* generation number */
561
562 /* di_next_unlinked is the only non-core field in the old dinode */
563 xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */
564
565 /* start of the extended dinode, writable fields */
566 __uint32_t di_crc; /* CRC of the inode */
567 __uint64_t di_changecount; /* number of attribute changes */
568 xfs_lsn_t di_lsn; /* flush sequence */
569 __uint64_t di_flags2; /* more random flags */
570 __uint8_t di_pad2[16]; /* more padding for future expansion */
571
572 /* fields only written to during inode creation */
573 xfs_ictimestamp_t di_crtime; /* time created */
574 xfs_ino_t di_ino; /* inode number */
575 uuid_t di_uuid; /* UUID of the filesystem */
576
577 /* structure must be padded to 64 bit alignment */
578} xfs_icdinode_t;
579
580static inline uint xfs_icdinode_size(int version)
581{
582 if (version == 3)
583 return sizeof(struct xfs_icdinode);
584 return offsetof(struct xfs_icdinode, di_next_unlinked);
585}
586
587/*
588 * Buffer Log Format defintions
589 *
590 * These are the physical dirty bitmap defintions for the log format structure.
591 */
592#define XFS_BLF_CHUNK 128
593#define XFS_BLF_SHIFT 7
594#define BIT_TO_WORD_SHIFT 5
595#define NBWORD (NBBY * sizeof(unsigned int))
596
597/*
598 * This flag indicates that the buffer contains on disk inodes
599 * and requires special recovery handling.
600 */
601#define XFS_BLF_INODE_BUF (1<<0)
602
603/*
604 * This flag indicates that the buffer should not be replayed
605 * during recovery because its blocks are being freed.
606 */
607#define XFS_BLF_CANCEL (1<<1)
608
609/*
610 * This flag indicates that the buffer contains on disk
611 * user or group dquots and may require special recovery handling.
612 */
613#define XFS_BLF_UDQUOT_BUF (1<<2)
614#define XFS_BLF_PDQUOT_BUF (1<<3)
615#define XFS_BLF_GDQUOT_BUF (1<<4)
616
617/*
618 * This is the structure used to lay out a buf log item in the
619 * log. The data map describes which 128 byte chunks of the buffer
620 * have been logged.
621 */
622#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
623
624typedef struct xfs_buf_log_format {
625 unsigned short blf_type; /* buf log item type indicator */
626 unsigned short blf_size; /* size of this item */
627 ushort blf_flags; /* misc state */
628 ushort blf_len; /* number of blocks in this buf */
629 __int64_t blf_blkno; /* starting blkno of this buf */
630 unsigned int blf_map_size; /* used size of data bitmap in words */
631 unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
632} xfs_buf_log_format_t;
633
634/*
635 * All buffers now need to tell recovery where the magic number
636 * is so that it can verify and calculate the CRCs on the buffer correctly
637 * once the changes have been replayed into the buffer.
638 *
639 * The type value is held in the upper 5 bits of the blf_flags field, which is
640 * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down.
641 */
642#define XFS_BLFT_BITS 5
643#define XFS_BLFT_SHIFT 11
644#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT)
645
646enum xfs_blft {
647 XFS_BLFT_UNKNOWN_BUF = 0,
648 XFS_BLFT_UDQUOT_BUF,
649 XFS_BLFT_PDQUOT_BUF,
650 XFS_BLFT_GDQUOT_BUF,
651 XFS_BLFT_BTREE_BUF,
652 XFS_BLFT_AGF_BUF,
653 XFS_BLFT_AGFL_BUF,
654 XFS_BLFT_AGI_BUF,
655 XFS_BLFT_DINO_BUF,
656 XFS_BLFT_SYMLINK_BUF,
657 XFS_BLFT_DIR_BLOCK_BUF,
658 XFS_BLFT_DIR_DATA_BUF,
659 XFS_BLFT_DIR_FREE_BUF,
660 XFS_BLFT_DIR_LEAF1_BUF,
661 XFS_BLFT_DIR_LEAFN_BUF,
662 XFS_BLFT_DA_NODE_BUF,
663 XFS_BLFT_ATTR_LEAF_BUF,
664 XFS_BLFT_ATTR_RMT_BUF,
665 XFS_BLFT_SB_BUF,
666 XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS),
667};
668
669static inline void
670xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type)
671{
672 ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF);
673 blf->blf_flags &= ~XFS_BLFT_MASK;
674 blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK);
675}
676
677static inline __uint16_t
678xfs_blft_from_flags(struct xfs_buf_log_format *blf)
679{
680 return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT;
681}
682
/*
 * EFI/EFD log format definitions
 */
typedef struct xfs_extent {
	xfs_dfsbno_t	ext_start;	/* starting filesystem block */
	xfs_extlen_t	ext_len;	/* extent length in blocks */
} xfs_extent_t;

/*
 * Since an xfs_extent_t has types (start:64, len: 32)
 * there are different alignments on 32 bit and 64 bit kernels.
 * So we provide the different variants for use by a
 * conversion routine.
 */
/* 32 bit kernel layout: packed, no tail padding (12 bytes). */
typedef struct xfs_extent_32 {
	__uint64_t	ext_start;
	__uint32_t	ext_len;
} __attribute__((packed)) xfs_extent_32_t;

/* 64 bit kernel layout: explicit pad keeps 8 byte alignment (16 bytes). */
typedef struct xfs_extent_64 {
	__uint64_t	ext_start;
	__uint32_t	ext_len;
	__uint32_t	ext_pad;
} xfs_extent_64_t;
707
/*
 * This is the structure used to lay out an efi (extent free intent) log item
 * in the log.  The efi_extents field is a variable size array whose size is
 * given by efi_nextents; the [1] declaration is the historical way of
 * expressing a variable-length trailing array in this format.
 */
typedef struct xfs_efi_log_format {
	__uint16_t		efi_type;	/* efi log item type */
	__uint16_t		efi_size;	/* size of this item */
	__uint32_t		efi_nextents;	/* # extents to free */
	__uint64_t		efi_id;		/* efi identifier */
	xfs_extent_t		efi_extents[1];	/* array of extents to free */
} xfs_efi_log_format_t;

/* Variant with 32 bit kernel extent alignment, for log conversion. */
typedef struct xfs_efi_log_format_32 {
	__uint16_t		efi_type;	/* efi log item type */
	__uint16_t		efi_size;	/* size of this item */
	__uint32_t		efi_nextents;	/* # extents to free */
	__uint64_t		efi_id;		/* efi identifier */
	xfs_extent_32_t		efi_extents[1];	/* array of extents to free */
} __attribute__((packed)) xfs_efi_log_format_32_t;

/* Variant with 64 bit kernel extent alignment, for log conversion. */
typedef struct xfs_efi_log_format_64 {
	__uint16_t		efi_type;	/* efi log item type */
	__uint16_t		efi_size;	/* size of this item */
	__uint32_t		efi_nextents;	/* # extents to free */
	__uint64_t		efi_id;		/* efi identifier */
	xfs_extent_64_t		efi_extents[1];	/* array of extents to free */
} xfs_efi_log_format_64_t;
736
/*
 * This is the structure used to lay out an efd (extent free done) log item
 * in the log.  The efd_extents array is a variable size array whose size is
 * given by efd_nextents; efd_efi_id ties this item back to the matching efi.
 */
typedef struct xfs_efd_log_format {
	__uint16_t		efd_type;	/* efd log item type */
	__uint16_t		efd_size;	/* size of this item */
	__uint32_t		efd_nextents;	/* # of extents freed */
	__uint64_t		efd_efi_id;	/* id of corresponding efi */
	xfs_extent_t		efd_extents[1];	/* array of extents freed */
} xfs_efd_log_format_t;

/* Variant with 32 bit kernel extent alignment, for log conversion. */
typedef struct xfs_efd_log_format_32 {
	__uint16_t		efd_type;	/* efd log item type */
	__uint16_t		efd_size;	/* size of this item */
	__uint32_t		efd_nextents;	/* # of extents freed */
	__uint64_t		efd_efi_id;	/* id of corresponding efi */
	xfs_extent_32_t		efd_extents[1];	/* array of extents freed */
} __attribute__((packed)) xfs_efd_log_format_32_t;

/* Variant with 64 bit kernel extent alignment, for log conversion. */
typedef struct xfs_efd_log_format_64 {
	__uint16_t		efd_type;	/* efd log item type */
	__uint16_t		efd_size;	/* size of this item */
	__uint32_t		efd_nextents;	/* # of extents freed */
	__uint64_t		efd_efi_id;	/* id of corresponding efi */
	xfs_extent_64_t		efd_extents[1];	/* array of extents freed */
} xfs_efd_log_format_64_t;
765
/*
 * Dquot Log format definitions.
 *
 * The first two fields must be the type and size fitting into
 * 32 bits : log_recovery code assumes that.
 */
typedef struct xfs_dq_logformat {
	__uint16_t		qlf_type;	/* dquot log item type */
	__uint16_t		qlf_size;	/* size of this item */
	xfs_dqid_t		qlf_id;		/* usr/grp/proj id : 32 bits */
	__int64_t		qlf_blkno;	/* blkno of dquot buffer */
	__int32_t		qlf_len;	/* len of dquot buffer */
	__uint32_t		qlf_boffset;	/* off of dquot in buffer */
} xfs_dq_logformat_t;

/*
 * log format struct for QUOTAOFF records.
 * The first two fields must be the type and size fitting into
 * 32 bits : log_recovery code assumes that.
 * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
 * to the first and ensures that the first logitem is taken out of the AIL
 * only when the last one is securely committed.
 */
typedef struct xfs_qoff_logformat {
	unsigned short		qf_type;	/* quotaoff log item type */
	unsigned short		qf_size;	/* size of this item */
	unsigned int		qf_flags;	/* USR and/or GRP */
	char			qf_pad[12];	/* padding for future */
} xfs_qoff_logformat_t;
795
796
/*
 * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
 */
#define XFS_UQUOTA_ACCT	0x0001  /* user quota accounting ON */
#define XFS_UQUOTA_ENFD	0x0002  /* user quota limits enforced */
#define XFS_UQUOTA_CHKD	0x0004  /* quotacheck run on usr quotas */
#define XFS_PQUOTA_ACCT	0x0008  /* project quota accounting ON */
#define XFS_OQUOTA_ENFD	0x0010  /* other (grp/prj) quota limits enforced */
#define XFS_OQUOTA_CHKD	0x0020  /* quotacheck run on other (grp/prj) quotas */
#define XFS_GQUOTA_ACCT	0x0040  /* group quota accounting ON */

/*
 * Conversion to and from the combined OQUOTA flag (if necessary)
 * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
 */
#define XFS_GQUOTA_ENFD	0x0080  /* group quota limits enforced */
#define XFS_GQUOTA_CHKD	0x0100  /* quotacheck run on group quotas */
#define XFS_PQUOTA_ENFD	0x0200  /* project quota limits enforced */
#define XFS_PQUOTA_CHKD	0x0400  /* quotacheck run on project quotas */

/* Convenience masks covering all three quota types per state. */
#define XFS_ALL_QUOTA_ACCT	\
	(XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
#define XFS_ALL_QUOTA_ENFD	\
	(XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
#define XFS_ALL_QUOTA_CHKD	\
	(XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)

/* Every accounting/enforcement/checked flag for all quota types. */
#define XFS_MOUNT_QUOTA_ALL	(XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
				 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
				 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
				 XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
				 XFS_PQUOTA_CHKD)
829
/*
 * Inode create log item structure
 *
 * Log recovery assumes the first two entries are the type and size and they fit
 * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so
 * decoding can be done correctly.  The remaining fields are big-endian on
 * disk, per their __be32 declarations.
 */
struct xfs_icreate_log {
	__uint16_t	icl_type;	/* type of log format structure */
	__uint16_t	icl_size;	/* size of log format structure */
	__be32		icl_ag;		/* ag being allocated in */
	__be32		icl_agbno;	/* start block of inode range */
	__be32		icl_count;	/* number of inodes to initialise */
	__be32		icl_isize;	/* size of inodes */
	__be32		icl_length;	/* length of extent to initialise */
	__be32		icl_gen;	/* inode generation number to use */
};
847
/* Log space reservation size calculation helpers. */
int	xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
int	xfs_log_calc_minimum_size(struct xfs_mount *);
850
851
852#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index b9ea262dd1c2..136654b9400d 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -24,51 +24,13 @@ struct xlog_ticket;
24struct xfs_mount; 24struct xfs_mount;
25 25
26/* 26/*
27 * Macros, structures, prototypes for internal log manager use. 27 * Flags for log structure
28 */ 28 */
29 29#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
30#define XLOG_MIN_ICLOGS 2 30#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
31#define XLOG_MAX_ICLOGS 8 31#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
32#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ 32 shutdown */
33#define XLOG_VERSION_1 1 33#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
34#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */
35#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2)
36#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */
37#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */
38#define XLOG_MAX_RECORD_BSIZE (256*1024)
39#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */
40#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
41#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
42#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
43#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \
44 (log)->l_mp->m_sb.sb_logsunit)
45#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit)
46
47#define XLOG_HEADER_SIZE 512
48
49#define XLOG_REC_SHIFT(log) \
50 BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
51 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
52#define XLOG_TOTAL_REC_SHIFT(log) \
53 BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
54 XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
55
56static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
57{
58 return ((xfs_lsn_t)cycle << 32) | block;
59}
60
61static inline uint xlog_get_cycle(char *ptr)
62{
63 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
64 return be32_to_cpu(*((__be32 *)ptr + 1));
65 else
66 return be32_to_cpu(*(__be32 *)ptr);
67}
68
69#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
70
71#ifdef __KERNEL__
72 34
73/* 35/*
74 * get client id from packed copy. 36 * get client id from packed copy.
@@ -101,28 +63,8 @@ static inline uint xlog_get_client_id(__be32 i)
101#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ 63#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
102#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ 64#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
103#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ 65#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
104#endif /* __KERNEL__ */
105 66
106/* 67/*
107 * Flags to log operation header
108 *
109 * The first write of a new transaction will be preceded with a start
110 * record, XLOG_START_TRANS. Once a transaction is committed, a commit
111 * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into
112 * the remainder of the current active in-core log, it is split up into
113 * multiple regions. Each partial region will be marked with a
114 * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS.
115 *
116 */
117#define XLOG_START_TRANS 0x01 /* Start a new transaction */
118#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */
119#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */
120#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */
121#define XLOG_END_TRANS 0x10 /* End a continued transaction */
122#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
123
124#ifdef __KERNEL__
125/*
126 * Flags to log ticket 68 * Flags to log ticket
127 */ 69 */
128#define XLOG_TIC_INITED 0x1 /* has been initialized */ 70#define XLOG_TIC_INITED 0x1 /* has been initialized */
@@ -132,22 +74,6 @@ static inline uint xlog_get_client_id(__be32 i)
132 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ 74 { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
133 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } 75 { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
134 76
135#endif /* __KERNEL__ */
136
137#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
138
139/*
140 * Flags for log structure
141 */
142#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
143#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
144#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
145 shutdown */
146#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
147
148typedef __uint32_t xlog_tid_t;
149
150#ifdef __KERNEL__
151/* 77/*
152 * Below are states for covering allocation transactions. 78 * Below are states for covering allocation transactions.
153 * By covering, we mean changing the h_tail_lsn in the last on-disk 79 * By covering, we mean changing the h_tail_lsn in the last on-disk
@@ -223,7 +149,6 @@ typedef __uint32_t xlog_tid_t;
223 149
224#define XLOG_COVER_OPS 5 150#define XLOG_COVER_OPS 5
225 151
226
227/* Ticket reservation region accounting */ 152/* Ticket reservation region accounting */
228#define XLOG_TIC_LEN_MAX 15 153#define XLOG_TIC_LEN_MAX 15
229 154
@@ -258,64 +183,6 @@ typedef struct xlog_ticket {
258 xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ 183 xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */
259} xlog_ticket_t; 184} xlog_ticket_t;
260 185
261#endif
262
263
264typedef struct xlog_op_header {
265 __be32 oh_tid; /* transaction id of operation : 4 b */
266 __be32 oh_len; /* bytes in data region : 4 b */
267 __u8 oh_clientid; /* who sent me this : 1 b */
268 __u8 oh_flags; /* : 1 b */
269 __u16 oh_res2; /* 32 bit align : 2 b */
270} xlog_op_header_t;
271
272
273/* valid values for h_fmt */
274#define XLOG_FMT_UNKNOWN 0
275#define XLOG_FMT_LINUX_LE 1
276#define XLOG_FMT_LINUX_BE 2
277#define XLOG_FMT_IRIX_BE 3
278
279/* our fmt */
280#ifdef XFS_NATIVE_HOST
281#define XLOG_FMT XLOG_FMT_LINUX_BE
282#else
283#define XLOG_FMT XLOG_FMT_LINUX_LE
284#endif
285
286typedef struct xlog_rec_header {
287 __be32 h_magicno; /* log record (LR) identifier : 4 */
288 __be32 h_cycle; /* write cycle of log : 4 */
289 __be32 h_version; /* LR version : 4 */
290 __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
291 __be64 h_lsn; /* lsn of this LR : 8 */
292 __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
293 __le32 h_crc; /* crc of log record : 4 */
294 __be32 h_prev_block; /* block number to previous LR : 4 */
295 __be32 h_num_logops; /* number of log operations in this LR : 4 */
296 __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
297 /* new fields */
298 __be32 h_fmt; /* format of log record : 4 */
299 uuid_t h_fs_uuid; /* uuid of FS : 16 */
300 __be32 h_size; /* iclog size : 4 */
301} xlog_rec_header_t;
302
303typedef struct xlog_rec_ext_header {
304 __be32 xh_cycle; /* write cycle of log : 4 */
305 __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
306} xlog_rec_ext_header_t;
307
308#ifdef __KERNEL__
309
310/*
311 * Quite misnamed, because this union lays out the actual on-disk log buffer.
312 */
313typedef union xlog_in_core2 {
314 xlog_rec_header_t hic_header;
315 xlog_rec_ext_header_t hic_xheader;
316 char hic_sector[XLOG_HEADER_SIZE];
317} xlog_in_core_2_t;
318
319/* 186/*
320 * - A log record header is 512 bytes. There is plenty of room to grow the 187 * - A log record header is 512 bytes. There is plenty of room to grow the
321 * xlog_rec_header_t into the reserved space. 188 * xlog_rec_header_t into the reserved space.
@@ -411,14 +278,17 @@ struct xfs_cil {
411 struct xlog *xc_log; 278 struct xlog *xc_log;
412 struct list_head xc_cil; 279 struct list_head xc_cil;
413 spinlock_t xc_cil_lock; 280 spinlock_t xc_cil_lock;
281
282 struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp;
414 struct xfs_cil_ctx *xc_ctx; 283 struct xfs_cil_ctx *xc_ctx;
415 struct rw_semaphore xc_ctx_lock; 284
285 spinlock_t xc_push_lock ____cacheline_aligned_in_smp;
286 xfs_lsn_t xc_push_seq;
416 struct list_head xc_committing; 287 struct list_head xc_committing;
417 wait_queue_head_t xc_commit_wait; 288 wait_queue_head_t xc_commit_wait;
418 xfs_lsn_t xc_current_sequence; 289 xfs_lsn_t xc_current_sequence;
419 struct work_struct xc_push_work; 290 struct work_struct xc_push_work;
420 xfs_lsn_t xc_push_seq; 291} ____cacheline_aligned_in_smp;
421};
422 292
423/* 293/*
424 * The amount of log space we allow the CIL to aggregate is difficult to size. 294 * The amount of log space we allow the CIL to aggregate is difficult to size.
@@ -686,6 +556,5 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
686 schedule(); 556 schedule();
687 remove_wait_queue(wq, &wait); 557 remove_wait_queue(wq, &wait);
688} 558}
689#endif /* __KERNEL__ */
690 559
691#endif /* __XFS_LOG_PRIV_H__ */ 560#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7681b19aa5dc..7c0c1fdc728b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -17,7 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_format.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
@@ -41,7 +41,6 @@
41#include "xfs_extfree_item.h" 41#include "xfs_extfree_item.h"
42#include "xfs_trans_priv.h" 42#include "xfs_trans_priv.h"
43#include "xfs_quota.h" 43#include "xfs_quota.h"
44#include "xfs_utils.h"
45#include "xfs_cksum.h" 44#include "xfs_cksum.h"
46#include "xfs_trace.h" 45#include "xfs_trace.h"
47#include "xfs_icache.h" 46#include "xfs_icache.h"
@@ -51,10 +50,12 @@
51#include "xfs_symlink.h" 50#include "xfs_symlink.h"
52#include "xfs_da_btree.h" 51#include "xfs_da_btree.h"
53#include "xfs_dir2_format.h" 52#include "xfs_dir2_format.h"
54#include "xfs_dir2_priv.h" 53#include "xfs_dir2.h"
55#include "xfs_attr_leaf.h" 54#include "xfs_attr_leaf.h"
56#include "xfs_attr_remote.h" 55#include "xfs_attr_remote.h"
57 56
57#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
58
58STATIC int 59STATIC int
59xlog_find_zeroed( 60xlog_find_zeroed(
60 struct xlog *, 61 struct xlog *,
@@ -607,7 +608,7 @@ out:
607 608
608/* 609/*
609 * Head is defined to be the point of the log where the next log write 610 * Head is defined to be the point of the log where the next log write
610 * write could go. This means that incomplete LR writes at the end are 611 * could go. This means that incomplete LR writes at the end are
611 * eliminated when calculating the head. We aren't guaranteed that previous 612 * eliminated when calculating the head. We aren't guaranteed that previous
612 * LR have complete transactions. We only know that a cycle number of 613 * LR have complete transactions. We only know that a cycle number of
613 * current cycle number -1 won't be present in the log if we start writing 614 * current cycle number -1 won't be present in the log if we start writing
@@ -963,6 +964,7 @@ xlog_find_tail(
963 } 964 }
964 if (!found) { 965 if (!found) {
965 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); 966 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
967 xlog_put_bp(bp);
966 ASSERT(0); 968 ASSERT(0);
967 return XFS_ERROR(EIO); 969 return XFS_ERROR(EIO);
968 } 970 }
@@ -1144,7 +1146,8 @@ xlog_find_zeroed(
1144 */ 1146 */
1145 xfs_warn(log->l_mp, 1147 xfs_warn(log->l_mp,
1146 "Log inconsistent or not a log (last==0, first!=1)"); 1148 "Log inconsistent or not a log (last==0, first!=1)");
1147 return XFS_ERROR(EINVAL); 1149 error = XFS_ERROR(EINVAL);
1150 goto bp_err;
1148 } 1151 }
1149 1152
1150 /* we have a partially zeroed log */ 1153 /* we have a partially zeroed log */
@@ -1766,19 +1769,11 @@ xlog_recover_buffer_pass1(
1766 1769
1767/* 1770/*
1768 * Check to see whether the buffer being recovered has a corresponding 1771 * Check to see whether the buffer being recovered has a corresponding
1769 * entry in the buffer cancel record table. If it does then return 1 1772 * entry in the buffer cancel record table. If it is, return the cancel
1770 * so that it will be cancelled, otherwise return 0. If the buffer is 1773 * buffer structure to the caller.
1771 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1772 * the refcount on the entry in the table and remove it from the table
1773 * if this is the last reference.
1774 *
1775 * We remove the cancel record from the table when we encounter its
1776 * last occurrence in the log so that if the same buffer is re-used
1777 * again after its last cancellation we actually replay the changes
1778 * made at that point.
1779 */ 1774 */
1780STATIC int 1775STATIC struct xfs_buf_cancel *
1781xlog_check_buffer_cancelled( 1776xlog_peek_buffer_cancelled(
1782 struct xlog *log, 1777 struct xlog *log,
1783 xfs_daddr_t blkno, 1778 xfs_daddr_t blkno,
1784 uint len, 1779 uint len,
@@ -1787,22 +1782,16 @@ xlog_check_buffer_cancelled(
1787 struct list_head *bucket; 1782 struct list_head *bucket;
1788 struct xfs_buf_cancel *bcp; 1783 struct xfs_buf_cancel *bcp;
1789 1784
1790 if (log->l_buf_cancel_table == NULL) { 1785 if (!log->l_buf_cancel_table) {
1791 /* 1786 /* empty table means no cancelled buffers in the log */
1792 * There is nothing in the table built in pass one,
1793 * so this buffer must not be cancelled.
1794 */
1795 ASSERT(!(flags & XFS_BLF_CANCEL)); 1787 ASSERT(!(flags & XFS_BLF_CANCEL));
1796 return 0; 1788 return NULL;
1797 } 1789 }
1798 1790
1799 /*
1800 * Search for an entry in the cancel table that matches our buffer.
1801 */
1802 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); 1791 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1803 list_for_each_entry(bcp, bucket, bc_list) { 1792 list_for_each_entry(bcp, bucket, bc_list) {
1804 if (bcp->bc_blkno == blkno && bcp->bc_len == len) 1793 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1805 goto found; 1794 return bcp;
1806 } 1795 }
1807 1796
1808 /* 1797 /*
@@ -1810,9 +1799,32 @@ xlog_check_buffer_cancelled(
1810 * that the buffer is NOT cancelled. 1799 * that the buffer is NOT cancelled.
1811 */ 1800 */
1812 ASSERT(!(flags & XFS_BLF_CANCEL)); 1801 ASSERT(!(flags & XFS_BLF_CANCEL));
1813 return 0; 1802 return NULL;
1803}
1804
1805/*
1806 * If the buffer is being cancelled then return 1 so that it will be cancelled,
1807 * otherwise return 0. If the buffer is actually a buffer cancel item
1808 * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
1809 * table and remove it from the table if this is the last reference.
1810 *
1811 * We remove the cancel record from the table when we encounter its last
1812 * occurrence in the log so that if the same buffer is re-used again after its
1813 * last cancellation we actually replay the changes made at that point.
1814 */
1815STATIC int
1816xlog_check_buffer_cancelled(
1817 struct xlog *log,
1818 xfs_daddr_t blkno,
1819 uint len,
1820 ushort flags)
1821{
1822 struct xfs_buf_cancel *bcp;
1823
1824 bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
1825 if (!bcp)
1826 return 0;
1814 1827
1815found:
1816 /* 1828 /*
 1817 * We've got a match, so return 1 so that the recovery of this buffer 1829
1818 * is cancelled. If this buffer is actually a buffer cancel log 1830 * is cancelled. If this buffer is actually a buffer cancel log
@@ -1947,6 +1959,104 @@ xlog_recover_do_inode_buffer(
1947} 1959}
1948 1960
1949/* 1961/*
1962 * V5 filesystems know the age of the buffer on disk being recovered. We can
1963 * have newer objects on disk than we are replaying, and so for these cases we
1964 * don't want to replay the current change as that will make the buffer contents
1965 * temporarily invalid on disk.
1966 *
1967 * The magic number might not match the buffer type we are going to recover
1968 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
 1969 * extract the LSN of the existing object in the buffer based on its current
1970 * magic number. If we don't recognise the magic number in the buffer, then
1971 * return a LSN of -1 so that the caller knows it was an unrecognised block and
1972 * so can recover the buffer.
1973 */
1974static xfs_lsn_t
1975xlog_recover_get_buf_lsn(
1976 struct xfs_mount *mp,
1977 struct xfs_buf *bp)
1978{
1979 __uint32_t magic32;
1980 __uint16_t magic16;
1981 __uint16_t magicda;
1982 void *blk = bp->b_addr;
1983
1984 /* v4 filesystems always recover immediately */
1985 if (!xfs_sb_version_hascrc(&mp->m_sb))
1986 goto recover_immediately;
1987
1988 magic32 = be32_to_cpu(*(__be32 *)blk);
1989 switch (magic32) {
1990 case XFS_ABTB_CRC_MAGIC:
1991 case XFS_ABTC_CRC_MAGIC:
1992 case XFS_ABTB_MAGIC:
1993 case XFS_ABTC_MAGIC:
1994 case XFS_IBT_CRC_MAGIC:
1995 case XFS_IBT_MAGIC:
1996 return be64_to_cpu(
1997 ((struct xfs_btree_block *)blk)->bb_u.s.bb_lsn);
1998 case XFS_BMAP_CRC_MAGIC:
1999 case XFS_BMAP_MAGIC:
2000 return be64_to_cpu(
2001 ((struct xfs_btree_block *)blk)->bb_u.l.bb_lsn);
2002 case XFS_AGF_MAGIC:
2003 return be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
2004 case XFS_AGFL_MAGIC:
2005 return be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
2006 case XFS_AGI_MAGIC:
2007 return be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
2008 case XFS_SYMLINK_MAGIC:
2009 return be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
2010 case XFS_DIR3_BLOCK_MAGIC:
2011 case XFS_DIR3_DATA_MAGIC:
2012 case XFS_DIR3_FREE_MAGIC:
2013 return be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
2014 case XFS_ATTR3_RMT_MAGIC:
2015 return be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
2016 case XFS_SB_MAGIC:
2017 return be64_to_cpu(((struct xfs_sb *)blk)->sb_lsn);
2018 default:
2019 break;
2020 }
2021
2022 magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
2023 switch (magicda) {
2024 case XFS_DIR3_LEAF1_MAGIC:
2025 case XFS_DIR3_LEAFN_MAGIC:
2026 case XFS_DA3_NODE_MAGIC:
2027 return be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
2028 default:
2029 break;
2030 }
2031
2032 /*
2033 * We do individual object checks on dquot and inode buffers as they
2034 * have their own individual LSN records. Also, we could have a stale
2035 * buffer here, so we have to at least recognise these buffer types.
2036 *
 2037 * A noted complexity here is inode unlinked list processing - it logs
2038 * the inode directly in the buffer, but we don't know which inodes have
2039 * been modified, and there is no global buffer LSN. Hence we need to
2040 * recover all inode buffer types immediately. This problem will be
2041 * fixed by logical logging of the unlinked list modifications.
2042 */
2043 magic16 = be16_to_cpu(*(__be16 *)blk);
2044 switch (magic16) {
2045 case XFS_DQUOT_MAGIC:
2046 case XFS_DINODE_MAGIC:
2047 goto recover_immediately;
2048 default:
2049 break;
2050 }
2051
2052 /* unknown buffer contents, recover immediately */
2053
2054recover_immediately:
2055 return (xfs_lsn_t)-1;
2056
2057}
2058
2059/*
1950 * Validate the recovered buffer is of the correct type and attach the 2060 * Validate the recovered buffer is of the correct type and attach the
1951 * appropriate buffer operations to them for writeback. Magic numbers are in a 2061 * appropriate buffer operations to them for writeback. Magic numbers are in a
1952 * few places: 2062 * few places:
@@ -1955,7 +2065,7 @@ xlog_recover_do_inode_buffer(
1955 * inside a struct xfs_da_blkinfo at the start of the buffer. 2065 * inside a struct xfs_da_blkinfo at the start of the buffer.
1956 */ 2066 */
1957static void 2067static void
1958xlog_recovery_validate_buf_type( 2068xlog_recover_validate_buf_type(
1959 struct xfs_mount *mp, 2069 struct xfs_mount *mp,
1960 struct xfs_buf *bp, 2070 struct xfs_buf *bp,
1961 xfs_buf_log_format_t *buf_f) 2071 xfs_buf_log_format_t *buf_f)
@@ -2234,7 +2344,7 @@ xlog_recover_do_reg_buffer(
2234 * just avoid the verification stage for non-crc filesystems 2344 * just avoid the verification stage for non-crc filesystems
2235 */ 2345 */
2236 if (xfs_sb_version_hascrc(&mp->m_sb)) 2346 if (xfs_sb_version_hascrc(&mp->m_sb))
2237 xlog_recovery_validate_buf_type(mp, bp, buf_f); 2347 xlog_recover_validate_buf_type(mp, bp, buf_f);
2238} 2348}
2239 2349
2240/* 2350/*
@@ -2366,7 +2476,7 @@ xfs_qm_dqcheck(
2366 2476
2367/* 2477/*
2368 * Perform a dquot buffer recovery. 2478 * Perform a dquot buffer recovery.
2369 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type 2479 * Simple algorithm: if we have found a QUOTAOFF log item of the same type
2370 * (ie. USR or GRP), then just toss this buffer away; don't recover it. 2480 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2371 * Else, treat it as a regular buffer and do recovery. 2481 * Else, treat it as a regular buffer and do recovery.
2372 */ 2482 */
@@ -2425,20 +2535,22 @@ xlog_recover_do_dquot_buffer(
2425 * over the log during recovery. During the first we build a table of 2535 * over the log during recovery. During the first we build a table of
2426 * those buffers which have been cancelled, and during the second we 2536 * those buffers which have been cancelled, and during the second we
2427 * only replay those buffers which do not have corresponding cancel 2537 * only replay those buffers which do not have corresponding cancel
2428 * records in the table. See xlog_recover_do_buffer_pass[1,2] above 2538 * records in the table. See xlog_recover_buffer_pass[1,2] above
2429 * for more details on the implementation of the table of cancel records. 2539 * for more details on the implementation of the table of cancel records.
2430 */ 2540 */
2431STATIC int 2541STATIC int
2432xlog_recover_buffer_pass2( 2542xlog_recover_buffer_pass2(
2433 struct xlog *log, 2543 struct xlog *log,
2434 struct list_head *buffer_list, 2544 struct list_head *buffer_list,
2435 struct xlog_recover_item *item) 2545 struct xlog_recover_item *item,
2546 xfs_lsn_t current_lsn)
2436{ 2547{
2437 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2548 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
2438 xfs_mount_t *mp = log->l_mp; 2549 xfs_mount_t *mp = log->l_mp;
2439 xfs_buf_t *bp; 2550 xfs_buf_t *bp;
2440 int error; 2551 int error;
2441 uint buf_flags; 2552 uint buf_flags;
2553 xfs_lsn_t lsn;
2442 2554
2443 /* 2555 /*
2444 * In this pass we only want to recover all the buffers which have 2556 * In this pass we only want to recover all the buffers which have
@@ -2463,10 +2575,17 @@ xlog_recover_buffer_pass2(
2463 error = bp->b_error; 2575 error = bp->b_error;
2464 if (error) { 2576 if (error) {
2465 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); 2577 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2466 xfs_buf_relse(bp); 2578 goto out_release;
2467 return error;
2468 } 2579 }
2469 2580
2581 /*
2582 * recover the buffer only if we get an LSN from it and it's less than
2583 * the lsn of the transaction we are replaying.
2584 */
2585 lsn = xlog_recover_get_buf_lsn(mp, bp);
2586 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
2587 goto out_release;
2588
2470 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { 2589 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2471 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2590 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2472 } else if (buf_f->blf_flags & 2591 } else if (buf_f->blf_flags &
@@ -2476,7 +2595,7 @@ xlog_recover_buffer_pass2(
2476 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2595 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2477 } 2596 }
2478 if (error) 2597 if (error)
2479 return XFS_ERROR(error); 2598 goto out_release;
2480 2599
2481 /* 2600 /*
2482 * Perform delayed write on the buffer. Asynchronous writes will be 2601 * Perform delayed write on the buffer. Asynchronous writes will be
@@ -2505,6 +2624,7 @@ xlog_recover_buffer_pass2(
2505 xfs_buf_delwri_queue(bp, buffer_list); 2624 xfs_buf_delwri_queue(bp, buffer_list);
2506 } 2625 }
2507 2626
2627out_release:
2508 xfs_buf_relse(bp); 2628 xfs_buf_relse(bp);
2509 return error; 2629 return error;
2510} 2630}
@@ -2513,7 +2633,8 @@ STATIC int
2513xlog_recover_inode_pass2( 2633xlog_recover_inode_pass2(
2514 struct xlog *log, 2634 struct xlog *log,
2515 struct list_head *buffer_list, 2635 struct list_head *buffer_list,
2516 struct xlog_recover_item *item) 2636 struct xlog_recover_item *item,
2637 xfs_lsn_t current_lsn)
2517{ 2638{
2518 xfs_inode_log_format_t *in_f; 2639 xfs_inode_log_format_t *in_f;
2519 xfs_mount_t *mp = log->l_mp; 2640 xfs_mount_t *mp = log->l_mp;
@@ -2593,6 +2714,20 @@ xlog_recover_inode_pass2(
2593 } 2714 }
2594 2715
2595 /* 2716 /*
2717 * If the inode has an LSN in it, recover the inode only if it's less
2718 * than the lsn of the transaction we are replaying.
2719 */
2720 if (dip->di_version >= 3) {
2721 xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
2722
2723 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2724 trace_xfs_log_recover_inode_skip(log, in_f);
2725 error = 0;
2726 goto out_release;
2727 }
2728 }
2729
2730 /*
2596 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes 2731 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
2597 * are transactional and if ordering is necessary we can determine that 2732 * are transactional and if ordering is necessary we can determine that
2598 * more accurately by the LSN field in the V3 inode core. Don't trust 2733 * more accurately by the LSN field in the V3 inode core. Don't trust
@@ -2781,6 +2916,8 @@ write_inode_buffer:
2781 ASSERT(bp->b_target->bt_mount == mp); 2916 ASSERT(bp->b_target->bt_mount == mp);
2782 bp->b_iodone = xlog_recover_iodone; 2917 bp->b_iodone = xlog_recover_iodone;
2783 xfs_buf_delwri_queue(bp, buffer_list); 2918 xfs_buf_delwri_queue(bp, buffer_list);
2919
2920out_release:
2784 xfs_buf_relse(bp); 2921 xfs_buf_relse(bp);
2785error: 2922error:
2786 if (need_free) 2923 if (need_free)
@@ -2822,7 +2959,8 @@ STATIC int
2822xlog_recover_dquot_pass2( 2959xlog_recover_dquot_pass2(
2823 struct xlog *log, 2960 struct xlog *log,
2824 struct list_head *buffer_list, 2961 struct list_head *buffer_list,
2825 struct xlog_recover_item *item) 2962 struct xlog_recover_item *item,
2963 xfs_lsn_t current_lsn)
2826{ 2964{
2827 xfs_mount_t *mp = log->l_mp; 2965 xfs_mount_t *mp = log->l_mp;
2828 xfs_buf_t *bp; 2966 xfs_buf_t *bp;
@@ -2896,6 +3034,19 @@ xlog_recover_dquot_pass2(
2896 return XFS_ERROR(EIO); 3034 return XFS_ERROR(EIO);
2897 } 3035 }
2898 3036
3037 /*
3038 * If the dquot has an LSN in it, recover the dquot only if it's less
3039 * than the lsn of the transaction we are replaying.
3040 */
3041 if (xfs_sb_version_hascrc(&mp->m_sb)) {
3042 struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
3043 xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
3044
3045 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3046 goto out_release;
3047 }
3048 }
3049
2899 memcpy(ddq, recddq, item->ri_buf[1].i_len); 3050 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2900 if (xfs_sb_version_hascrc(&mp->m_sb)) { 3051 if (xfs_sb_version_hascrc(&mp->m_sb)) {
2901 xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), 3052 xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
@@ -2906,9 +3057,10 @@ xlog_recover_dquot_pass2(
2906 ASSERT(bp->b_target->bt_mount == mp); 3057 ASSERT(bp->b_target->bt_mount == mp);
2907 bp->b_iodone = xlog_recover_iodone; 3058 bp->b_iodone = xlog_recover_iodone;
2908 xfs_buf_delwri_queue(bp, buffer_list); 3059 xfs_buf_delwri_queue(bp, buffer_list);
2909 xfs_buf_relse(bp);
2910 3060
2911 return (0); 3061out_release:
3062 xfs_buf_relse(bp);
3063 return 0;
2912} 3064}
2913 3065
2914/* 3066/*
@@ -3116,6 +3268,106 @@ xlog_recover_free_trans(
3116 kmem_free(trans); 3268 kmem_free(trans);
3117} 3269}
3118 3270
3271STATIC void
3272xlog_recover_buffer_ra_pass2(
3273 struct xlog *log,
3274 struct xlog_recover_item *item)
3275{
3276 struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
3277 struct xfs_mount *mp = log->l_mp;
3278
3279 if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
3280 buf_f->blf_len, buf_f->blf_flags)) {
3281 return;
3282 }
3283
3284 xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
3285 buf_f->blf_len, NULL);
3286}
3287
3288STATIC void
3289xlog_recover_inode_ra_pass2(
3290 struct xlog *log,
3291 struct xlog_recover_item *item)
3292{
3293 struct xfs_inode_log_format ilf_buf;
3294 struct xfs_inode_log_format *ilfp;
3295 struct xfs_mount *mp = log->l_mp;
3296 int error;
3297
3298 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
3299 ilfp = item->ri_buf[0].i_addr;
3300 } else {
3301 ilfp = &ilf_buf;
3302 memset(ilfp, 0, sizeof(*ilfp));
3303 error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
3304 if (error)
3305 return;
3306 }
3307
3308 if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
3309 return;
3310
3311 xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
3312 ilfp->ilf_len, &xfs_inode_buf_ra_ops);
3313}
3314
3315STATIC void
3316xlog_recover_dquot_ra_pass2(
3317 struct xlog *log,
3318 struct xlog_recover_item *item)
3319{
3320 struct xfs_mount *mp = log->l_mp;
3321 struct xfs_disk_dquot *recddq;
3322 struct xfs_dq_logformat *dq_f;
3323 uint type;
3324
3325
3326 if (mp->m_qflags == 0)
3327 return;
3328
3329 recddq = item->ri_buf[1].i_addr;
3330 if (recddq == NULL)
3331 return;
3332 if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
3333 return;
3334
3335 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3336 ASSERT(type);
3337 if (log->l_quotaoffs_flag & type)
3338 return;
3339
3340 dq_f = item->ri_buf[0].i_addr;
3341 ASSERT(dq_f);
3342 ASSERT(dq_f->qlf_len == 1);
3343
3344 xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
3345 XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
3346}
3347
/*
 * Dispatch recovery readahead for a single log item by type.
 *
 * Only item types that reference metadata buffers on disk (buffer, inode
 * and dquot items) can benefit from readahead; all other item types are
 * deliberately a no-op.
 */
STATIC void
xlog_recover_ra_pass2(
	struct xlog			*log,
	struct xlog_recover_item	*item)
{
	switch (ITEM_TYPE(item)) {
	case XFS_LI_BUF:
		xlog_recover_buffer_ra_pass2(log, item);
		break;
	case XFS_LI_INODE:
		xlog_recover_inode_ra_pass2(log, item);
		break;
	case XFS_LI_DQUOT:
		xlog_recover_dquot_ra_pass2(log, item);
		break;
	case XFS_LI_EFI:
	case XFS_LI_EFD:
	case XFS_LI_QUOTAOFF:
	default:
		/* no backing buffer to read ahead for these item types */
		break;
	}
}
3370
3119STATIC int 3371STATIC int
3120xlog_recover_commit_pass1( 3372xlog_recover_commit_pass1(
3121 struct xlog *log, 3373 struct xlog *log,
@@ -3155,15 +3407,18 @@ xlog_recover_commit_pass2(
3155 3407
3156 switch (ITEM_TYPE(item)) { 3408 switch (ITEM_TYPE(item)) {
3157 case XFS_LI_BUF: 3409 case XFS_LI_BUF:
3158 return xlog_recover_buffer_pass2(log, buffer_list, item); 3410 return xlog_recover_buffer_pass2(log, buffer_list, item,
3411 trans->r_lsn);
3159 case XFS_LI_INODE: 3412 case XFS_LI_INODE:
3160 return xlog_recover_inode_pass2(log, buffer_list, item); 3413 return xlog_recover_inode_pass2(log, buffer_list, item,
3414 trans->r_lsn);
3161 case XFS_LI_EFI: 3415 case XFS_LI_EFI:
3162 return xlog_recover_efi_pass2(log, item, trans->r_lsn); 3416 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
3163 case XFS_LI_EFD: 3417 case XFS_LI_EFD:
3164 return xlog_recover_efd_pass2(log, item); 3418 return xlog_recover_efd_pass2(log, item);
3165 case XFS_LI_DQUOT: 3419 case XFS_LI_DQUOT:
3166 return xlog_recover_dquot_pass2(log, buffer_list, item); 3420 return xlog_recover_dquot_pass2(log, buffer_list, item,
3421 trans->r_lsn);
3167 case XFS_LI_ICREATE: 3422 case XFS_LI_ICREATE:
3168 return xlog_recover_do_icreate_pass2(log, buffer_list, item); 3423 return xlog_recover_do_icreate_pass2(log, buffer_list, item);
3169 case XFS_LI_QUOTAOFF: 3424 case XFS_LI_QUOTAOFF:
@@ -3177,6 +3432,26 @@ xlog_recover_commit_pass2(
3177 } 3432 }
3178} 3433}
3179 3434
3435STATIC int
3436xlog_recover_items_pass2(
3437 struct xlog *log,
3438 struct xlog_recover *trans,
3439 struct list_head *buffer_list,
3440 struct list_head *item_list)
3441{
3442 struct xlog_recover_item *item;
3443 int error = 0;
3444
3445 list_for_each_entry(item, item_list, ri_list) {
3446 error = xlog_recover_commit_pass2(log, trans,
3447 buffer_list, item);
3448 if (error)
3449 return error;
3450 }
3451
3452 return error;
3453}
3454
3180/* 3455/*
3181 * Perform the transaction. 3456 * Perform the transaction.
3182 * 3457 *
@@ -3189,9 +3464,16 @@ xlog_recover_commit_trans(
3189 struct xlog_recover *trans, 3464 struct xlog_recover *trans,
3190 int pass) 3465 int pass)
3191{ 3466{
3192 int error = 0, error2; 3467 int error = 0;
3193 xlog_recover_item_t *item; 3468 int error2;
3194 LIST_HEAD (buffer_list); 3469 int items_queued = 0;
3470 struct xlog_recover_item *item;
3471 struct xlog_recover_item *next;
3472 LIST_HEAD (buffer_list);
3473 LIST_HEAD (ra_list);
3474 LIST_HEAD (done_list);
3475
3476 #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
3195 3477
3196 hlist_del(&trans->r_list); 3478 hlist_del(&trans->r_list);
3197 3479
@@ -3199,14 +3481,22 @@ xlog_recover_commit_trans(
3199 if (error) 3481 if (error)
3200 return error; 3482 return error;
3201 3483
3202 list_for_each_entry(item, &trans->r_itemq, ri_list) { 3484 list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
3203 switch (pass) { 3485 switch (pass) {
3204 case XLOG_RECOVER_PASS1: 3486 case XLOG_RECOVER_PASS1:
3205 error = xlog_recover_commit_pass1(log, trans, item); 3487 error = xlog_recover_commit_pass1(log, trans, item);
3206 break; 3488 break;
3207 case XLOG_RECOVER_PASS2: 3489 case XLOG_RECOVER_PASS2:
3208 error = xlog_recover_commit_pass2(log, trans, 3490 xlog_recover_ra_pass2(log, item);
3209 &buffer_list, item); 3491 list_move_tail(&item->ri_list, &ra_list);
3492 items_queued++;
3493 if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
3494 error = xlog_recover_items_pass2(log, trans,
3495 &buffer_list, &ra_list);
3496 list_splice_tail_init(&ra_list, &done_list);
3497 items_queued = 0;
3498 }
3499
3210 break; 3500 break;
3211 default: 3501 default:
3212 ASSERT(0); 3502 ASSERT(0);
@@ -3216,9 +3506,19 @@ xlog_recover_commit_trans(
3216 goto out; 3506 goto out;
3217 } 3507 }
3218 3508
3509out:
3510 if (!list_empty(&ra_list)) {
3511 if (!error)
3512 error = xlog_recover_items_pass2(log, trans,
3513 &buffer_list, &ra_list);
3514 list_splice_tail_init(&ra_list, &done_list);
3515 }
3516
3517 if (!list_empty(&done_list))
3518 list_splice_init(&done_list, &trans->r_itemq);
3519
3219 xlog_recover_free_trans(trans); 3520 xlog_recover_free_trans(trans);
3220 3521
3221out:
3222 error2 = xfs_buf_delwri_submit(&buffer_list); 3522 error2 = xfs_buf_delwri_submit(&buffer_list);
3223 return error ? error : error2; 3523 return error ? error : error2;
3224} 3524}
@@ -3376,7 +3676,7 @@ xlog_recover_process_efi(
3376 } 3676 }
3377 3677
3378 tp = xfs_trans_alloc(mp, 0); 3678 tp = xfs_trans_alloc(mp, 0);
3379 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); 3679 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
3380 if (error) 3680 if (error)
3381 goto abort_error; 3681 goto abort_error;
3382 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); 3682 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
@@ -3482,8 +3782,7 @@ xlog_recover_clear_agi_bucket(
3482 int error; 3782 int error;
3483 3783
3484 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); 3784 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3485 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 3785 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
3486 0, 0, 0);
3487 if (error) 3786 if (error)
3488 goto out_abort; 3787 goto out_abort;
3489 3788
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c
new file mode 100644
index 000000000000..bbcec0bbc12d
--- /dev/null
+++ b/fs/xfs/xfs_log_rlimit.c
@@ -0,0 +1,147 @@
1/*
2 * Copyright (c) 2013 Jie Liu.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_log.h"
21#include "xfs_trans.h"
22#include "xfs_ag.h"
23#include "xfs_sb.h"
24#include "xfs_mount.h"
25#include "xfs_trans_space.h"
26#include "xfs_bmap_btree.h"
27#include "xfs_inode.h"
28#include "xfs_da_btree.h"
29#include "xfs_attr_leaf.h"
30
31/*
32 * Calculate the maximum length in bytes that would be required for a local
33 * attribute value as large attributes out of line are not logged.
34 */
35STATIC int
36xfs_log_calc_max_attrsetm_res(
37 struct xfs_mount *mp)
38{
39 int size;
40 int nblks;
41
42 size = xfs_attr_leaf_entsize_local_max(mp->m_sb.sb_blocksize) -
43 MAXNAMELEN - 1;
44 nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
45 nblks += XFS_B_TO_FSB(mp, size);
46 nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK);
47
48 return M_RES(mp)->tr_attrsetm.tr_logres +
49 M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
50}
51
52/*
53 * Iterate over the log space reservation table to figure out and return
54 * the maximum one in terms of the pre-calculated values which were done
55 * at mount time.
56 */
57STATIC void
58xfs_log_get_max_trans_res(
59 struct xfs_mount *mp,
60 struct xfs_trans_res *max_resp)
61{
62 struct xfs_trans_res *resp;
63 struct xfs_trans_res *end_resp;
64 int log_space = 0;
65 int attr_space;
66
67 attr_space = xfs_log_calc_max_attrsetm_res(mp);
68
69 resp = (struct xfs_trans_res *)M_RES(mp);
70 end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
71 for (; resp < end_resp; resp++) {
72 int tmp = resp->tr_logcount > 1 ?
73 resp->tr_logres * resp->tr_logcount :
74 resp->tr_logres;
75 if (log_space < tmp) {
76 log_space = tmp;
77 *max_resp = *resp; /* struct copy */
78 }
79 }
80
81 if (attr_space > log_space) {
82 *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */
83 max_resp->tr_logres = attr_space;
84 }
85}
86
87/*
88 * Calculate the minimum valid log size for the given superblock configuration.
89 * Used to calculate the minimum log size at mkfs time, and to determine if
90 * the log is large enough or not at mount time. Returns the minimum size in
91 * filesystem block size units.
92 */
93int
94xfs_log_calc_minimum_size(
95 struct xfs_mount *mp)
96{
97 struct xfs_trans_res tres = {0};
98 int max_logres;
99 int min_logblks = 0;
100 int lsunit = 0;
101
102 xfs_log_get_max_trans_res(mp, &tres);
103
104 max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres);
105 if (tres.tr_logcount > 1)
106 max_logres *= tres.tr_logcount;
107
108 if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1)
109 lsunit = BTOBB(mp->m_sb.sb_logsunit);
110
111 /*
112 * Two factors should be taken into account for calculating the minimum
113 * log space.
114 * 1) The fundamental limitation is that no single transaction can be
115 * larger than half size of the log.
116 *
117 * From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR
118 * define, which is set to 3. That means we can definitely fit
119 * maximally sized 2 transactions in the log. We'll use this same
120 * value here.
121 *
122 * 2) If the lsunit option is specified, a transaction requires 2 LSU
123 * for the reservation because there are two log writes that can
124 * require padding - the transaction data and the commit record which
125 * are written separately and both can require padding to the LSU.
126 * Consider that we can have an active CIL reservation holding 2*LSU,
127 * but the CIL is not over a push threshold, in this case, if we
128 * don't have enough log space for at one new transaction, which
129 * includes another 2*LSU in the reservation, we will run into dead
130 * loop situation in log space grant procedure. i.e.
131 * xlog_grant_head_wait().
132 *
133 * Hence the log size needs to be able to contain two maximally sized
134 * and padded transactions, which is (2 * (2 * LSU + maxlres)).
135 *
136 * Also, the log size should be a multiple of the log stripe unit, round
137 * it up to lsunit boundary if lsunit is specified.
138 */
139 if (lsunit) {
140 min_logblks = roundup_64(BTOBB(max_logres), lsunit) +
141 2 * lsunit;
142 } else
143 min_logblks = BTOBB(max_logres) + 2 * BBSIZE;
144 min_logblks *= XFS_MIN_LOG_FACTOR;
145
146 return XFS_BB_TO_FSB(mp, min_logblks);
147}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 2b0ba3581656..5dcc68019d1b 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -17,7 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_format.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
@@ -25,8 +25,10 @@
25#include "xfs_trans_priv.h" 25#include "xfs_trans_priv.h"
26#include "xfs_sb.h" 26#include "xfs_sb.h"
27#include "xfs_ag.h" 27#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_mount.h" 28#include "xfs_mount.h"
29#include "xfs_da_btree.h"
30#include "xfs_dir2_format.h"
31#include "xfs_dir2.h"
30#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h" 33#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h" 34#include "xfs_ialloc_btree.h"
@@ -40,7 +42,6 @@
40#include "xfs_error.h" 42#include "xfs_error.h"
41#include "xfs_quota.h" 43#include "xfs_quota.h"
42#include "xfs_fsops.h" 44#include "xfs_fsops.h"
43#include "xfs_utils.h"
44#include "xfs_trace.h" 45#include "xfs_trace.h"
45#include "xfs_icache.h" 46#include "xfs_icache.h"
46#include "xfs_cksum.h" 47#include "xfs_cksum.h"
@@ -59,69 +60,6 @@ STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
59#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) 60#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
60#endif 61#endif
61 62
/*
 * Table-driven description of the on-disk superblock layout: the byte offset
 * of each field within xfs_sb_t, in on-disk order, plus a flag saying whether
 * the field is an integer (endian-converted) or opaque binary/string data
 * copied verbatim.  The trailing sizeof(xfs_sb_t) entry terminates the table
 * by bounding the last field's extent.
 */
static const struct {
	short offset;
	short type;	/* 0 = integer
			 * 1 = binary / string (no translation)
			 */
} xfs_sb_info[] = {
	{ offsetof(xfs_sb_t, sb_magicnum), 0 },
	{ offsetof(xfs_sb_t, sb_blocksize), 0 },
	{ offsetof(xfs_sb_t, sb_dblocks), 0 },
	{ offsetof(xfs_sb_t, sb_rblocks), 0 },
	{ offsetof(xfs_sb_t, sb_rextents), 0 },
	{ offsetof(xfs_sb_t, sb_uuid), 1 },
	{ offsetof(xfs_sb_t, sb_logstart), 0 },
	{ offsetof(xfs_sb_t, sb_rootino), 0 },
	{ offsetof(xfs_sb_t, sb_rbmino), 0 },
	{ offsetof(xfs_sb_t, sb_rsumino), 0 },
	{ offsetof(xfs_sb_t, sb_rextsize), 0 },
	{ offsetof(xfs_sb_t, sb_agblocks), 0 },
	{ offsetof(xfs_sb_t, sb_agcount), 0 },
	{ offsetof(xfs_sb_t, sb_rbmblocks), 0 },
	{ offsetof(xfs_sb_t, sb_logblocks), 0 },
	{ offsetof(xfs_sb_t, sb_versionnum), 0 },
	{ offsetof(xfs_sb_t, sb_sectsize), 0 },
	{ offsetof(xfs_sb_t, sb_inodesize), 0 },
	{ offsetof(xfs_sb_t, sb_inopblock), 0 },
	{ offsetof(xfs_sb_t, sb_fname[0]), 1 },
	{ offsetof(xfs_sb_t, sb_blocklog), 0 },
	{ offsetof(xfs_sb_t, sb_sectlog), 0 },
	{ offsetof(xfs_sb_t, sb_inodelog), 0 },
	{ offsetof(xfs_sb_t, sb_inopblog), 0 },
	{ offsetof(xfs_sb_t, sb_agblklog), 0 },
	{ offsetof(xfs_sb_t, sb_rextslog), 0 },
	{ offsetof(xfs_sb_t, sb_inprogress), 0 },
	{ offsetof(xfs_sb_t, sb_imax_pct), 0 },
	{ offsetof(xfs_sb_t, sb_icount), 0 },
	{ offsetof(xfs_sb_t, sb_ifree), 0 },
	{ offsetof(xfs_sb_t, sb_fdblocks), 0 },
	{ offsetof(xfs_sb_t, sb_frextents), 0 },
	{ offsetof(xfs_sb_t, sb_uquotino), 0 },
	{ offsetof(xfs_sb_t, sb_gquotino), 0 },
	{ offsetof(xfs_sb_t, sb_qflags), 0 },
	{ offsetof(xfs_sb_t, sb_flags), 0 },
	{ offsetof(xfs_sb_t, sb_shared_vn), 0 },
	{ offsetof(xfs_sb_t, sb_inoalignmt), 0 },
	{ offsetof(xfs_sb_t, sb_unit), 0 },
	{ offsetof(xfs_sb_t, sb_width), 0 },
	{ offsetof(xfs_sb_t, sb_dirblklog), 0 },
	{ offsetof(xfs_sb_t, sb_logsectlog), 0 },
	{ offsetof(xfs_sb_t, sb_logsectsize), 0 },
	{ offsetof(xfs_sb_t, sb_logsunit), 0 },
	{ offsetof(xfs_sb_t, sb_features2), 0 },
	{ offsetof(xfs_sb_t, sb_bad_features2), 0 },
	{ offsetof(xfs_sb_t, sb_features_compat), 0 },
	{ offsetof(xfs_sb_t, sb_features_ro_compat), 0 },
	{ offsetof(xfs_sb_t, sb_features_incompat), 0 },
	{ offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
	{ offsetof(xfs_sb_t, sb_crc), 0 },
	{ offsetof(xfs_sb_t, sb_pad), 0 },
	{ offsetof(xfs_sb_t, sb_pquotino), 0 },
	{ offsetof(xfs_sb_t, sb_lsn), 0 },
	{ sizeof(xfs_sb_t), 0 }
};
124
125static DEFINE_MUTEX(xfs_uuid_table_mutex); 63static DEFINE_MUTEX(xfs_uuid_table_mutex);
126static int xfs_uuid_table_size; 64static int xfs_uuid_table_size;
127static uuid_t *xfs_uuid_table; 65static uuid_t *xfs_uuid_table;
@@ -197,64 +135,6 @@ xfs_uuid_unmount(
197} 135}
198 136
199 137
/*
 * Reference counting access wrappers to the perag structures.
 * Because we never free per-ag structures, the only thing we
 * have to protect against changes is the tree structure itself.
 */

/*
 * Look up the per-AG structure for @agno and take a reference to it.
 * Returns the perag with an elevated reference count, or NULL if no
 * entry exists for that AG number.
 */
struct xfs_perag *
xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
{
	struct xfs_perag	*pag;
	int			ref = 0;

	rcu_read_lock();
	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
	if (pag) {
		ASSERT(atomic_read(&pag->pag_ref) >= 0);
		/* take the reference inside the RCU read section */
		ref = atomic_inc_return(&pag->pag_ref);
	}
	rcu_read_unlock();
	trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
	return pag;
}
221
/*
 * search from @first to find the next perag with the given tag set.
 *
 * Returns the next tagged perag with an elevated reference count, or
 * NULL if no perag at or after @first carries @tag.
 */
struct xfs_perag *
xfs_perag_get_tag(
	struct xfs_mount	*mp,
	xfs_agnumber_t		first,
	int			tag)
{
	struct xfs_perag	*pag;
	int			found;
	int			ref;

	rcu_read_lock();
	/* gang lookup of 1 entry == "first match at or after @first" */
	found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
					(void **)&pag, first, 1, tag);
	if (found <= 0) {
		rcu_read_unlock();
		return NULL;
	}
	/* reference is taken before leaving the RCU read section */
	ref = atomic_inc_return(&pag->pag_ref);
	rcu_read_unlock();
	trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
	return pag;
}
247
/*
 * Drop a reference to a perag obtained via xfs_perag_get() or
 * xfs_perag_get_tag().  Perag structures are never freed, so this only
 * decrements the reference count.
 */
void
xfs_perag_put(struct xfs_perag *pag)
{
	int	ref;

	ASSERT(atomic_read(&pag->pag_ref) > 0);
	ref = atomic_dec_return(&pag->pag_ref);
	trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
}
257
258STATIC void 138STATIC void
259__xfs_free_perag( 139__xfs_free_perag(
260 struct rcu_head *head) 140 struct rcu_head *head)
@@ -307,184 +187,6 @@ xfs_sb_validate_fsb_count(
307 return 0; 187 return 0;
308} 188}
309 189
/*
 * Check the validity of the SB found.
 *
 * Returns 0 when the superblock passes all sanity checks, otherwise a
 * positive XFS error code (EWRONGFS, EFSCORRUPTED, EINVAL, ENOSYS or
 * EFBIG) for the first problem detected.  @check_inprogress and
 * @check_version let the caller relax checks that only apply when
 * reading the primary superblock at mount time.
 */
STATIC int
xfs_mount_validate_sb(
	xfs_mount_t	*mp,
	xfs_sb_t	*sbp,
	bool		check_inprogress,
	bool		check_version)
{

	/*
	 * If the log device and data device have the
	 * same device number, the log is internal.
	 * Consequently, the sb_logstart should be non-zero.  If
	 * we have a zero sb_logstart in this case, we may be trying to mount
	 * a volume filesystem in a non-volume manner.
	 */
	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
		xfs_warn(mp, "bad magic number");
		return XFS_ERROR(EWRONGFS);
	}


	if (!xfs_sb_good_version(sbp)) {
		xfs_warn(mp, "bad version");
		return XFS_ERROR(EWRONGFS);
	}

	/*
	 * The old OQUOTA enforcement/checked bits and the newer split
	 * PQUOTA/GQUOTA bits must not be set at the same time.
	 */
	if ((sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) &&
	    (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
			XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))) {
		xfs_notice(mp,
"Super block has XFS_OQUOTA bits along with XFS_PQUOTA and/or XFS_GQUOTA bits.\n");
		return XFS_ERROR(EFSCORRUPTED);
	}

	/*
	 * Version 5 superblock feature mask validation. Reject combinations the
	 * kernel cannot support up front before checking anything else. For
	 * write validation, we don't need to check feature masks.
	 */
	if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
		xfs_alert(mp,
"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
"Use of these features in this kernel is at your own risk!");

		/* unknown compat features are safe to mount with */
		if (xfs_sb_has_compat_feature(sbp,
					XFS_SB_FEAT_COMPAT_UNKNOWN)) {
			xfs_warn(mp,
"Superblock has unknown compatible features (0x%x) enabled.\n"
"Using a more recent kernel is recommended.",
				(sbp->sb_features_compat &
						XFS_SB_FEAT_COMPAT_UNKNOWN));
		}

		/* unknown ro-compat features only permit read-only mounts */
		if (xfs_sb_has_ro_compat_feature(sbp,
					XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
			xfs_alert(mp,
"Superblock has unknown read-only compatible features (0x%x) enabled.",
				(sbp->sb_features_ro_compat &
						XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
			if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
				xfs_warn(mp,
"Attempted to mount read-only compatible filesystem read-write.\n"
"Filesystem can only be safely mounted read only.");
				return XFS_ERROR(EINVAL);
			}
		}
		/* unknown incompat features forbid mounting entirely */
		if (xfs_sb_has_incompat_feature(sbp,
					XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
			xfs_warn(mp,
"Superblock has unknown incompatible features (0x%x) enabled.\n"
"Filesystem can not be safely mounted by this kernel.",
				(sbp->sb_features_incompat &
						XFS_SB_FEAT_INCOMPAT_UNKNOWN));
			return XFS_ERROR(EINVAL);
		}
	}

	if (unlikely(
	    sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
		xfs_warn(mp,
		"filesystem is marked as having an external log; "
		"specify logdev on the mount command line.");
		return XFS_ERROR(EINVAL);
	}

	if (unlikely(
	    sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
		xfs_warn(mp,
		"filesystem is marked as having an internal log; "
		"do not specify logdev on the mount command line.");
		return XFS_ERROR(EINVAL);
	}

	/*
	 * More sanity checking.  Most of these were stolen directly from
	 * xfs_repair.
	 */
	if (unlikely(
	    sbp->sb_agcount <= 0					||
	    sbp->sb_sectsize < XFS_MIN_SECTORSIZE			||
	    sbp->sb_sectsize > XFS_MAX_SECTORSIZE			||
	    sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG			||
	    sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG			||
	    sbp->sb_sectsize != (1 << sbp->sb_sectlog)			||
	    sbp->sb_blocksize < XFS_MIN_BLOCKSIZE			||
	    sbp->sb_blocksize > XFS_MAX_BLOCKSIZE			||
	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG			||
	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG			||
	    sbp->sb_blocksize != (1 << sbp->sb_blocklog)		||
	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE			||
	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE			||
	    sbp->sb_inodelog < XFS_DINODE_MIN_LOG			||
	    sbp->sb_inodelog > XFS_DINODE_MAX_LOG			||
	    sbp->sb_inodesize != (1 << sbp->sb_inodelog)		||
	    (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog)	||
	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE)	||
	    (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)	||
	    (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */)	||
	    sbp->sb_dblocks == 0					||
	    sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp)			||
	    sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
		XFS_CORRUPTION_ERROR("SB sanity check failed",
				XFS_ERRLEVEL_LOW, mp, sbp);
		return XFS_ERROR(EFSCORRUPTED);
	}

	/*
	 * Until this is fixed only page-sized or smaller data blocks work.
	 */
	if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
		xfs_warn(mp,
		"File system with blocksize %d bytes. "
		"Only pagesize (%ld) or less will currently work.",
				sbp->sb_blocksize, PAGE_SIZE);
		return XFS_ERROR(ENOSYS);
	}

	/*
	 * Currently only very few inode sizes are supported.
	 */
	switch (sbp->sb_inodesize) {
	case 256:
	case 512:
	case 1024:
	case 2048:
		break;
	default:
		xfs_warn(mp, "inode size of %d bytes not supported",
				sbp->sb_inodesize);
		return XFS_ERROR(ENOSYS);
	}

	if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
	    xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
		xfs_warn(mp,
		"file system too large to be mounted on this system.");
		return XFS_ERROR(EFBIG);
	}

	if (check_inprogress && sbp->sb_inprogress) {
		xfs_warn(mp, "Offline file system operation in progress!");
		return XFS_ERROR(EFSCORRUPTED);
	}

	/*
	 * Version 1 directory format has never worked on Linux.
	 */
	if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
		xfs_warn(mp, "file system using version 1 directory format");
		return XFS_ERROR(ENOSYS);
	}

	return 0;
}
487
488int 190int
489xfs_initialize_perag( 191xfs_initialize_perag(
490 xfs_mount_t *mp, 192 xfs_mount_t *mp,
@@ -569,283 +271,15 @@ out_unwind:
569 return error; 271 return error;
570} 272}
571 273
572static void
573xfs_sb_quota_from_disk(struct xfs_sb *sbp)
574{
575 if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
576 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
577 XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
578 if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
579 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
580 XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
581 sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
582}
583
584void
585xfs_sb_from_disk(
586 struct xfs_sb *to,
587 xfs_dsb_t *from)
588{
589 to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
590 to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
591 to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
592 to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
593 to->sb_rextents = be64_to_cpu(from->sb_rextents);
594 memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
595 to->sb_logstart = be64_to_cpu(from->sb_logstart);
596 to->sb_rootino = be64_to_cpu(from->sb_rootino);
597 to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
598 to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
599 to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
600 to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
601 to->sb_agcount = be32_to_cpu(from->sb_agcount);
602 to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
603 to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
604 to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
605 to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
606 to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
607 to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
608 memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
609 to->sb_blocklog = from->sb_blocklog;
610 to->sb_sectlog = from->sb_sectlog;
611 to->sb_inodelog = from->sb_inodelog;
612 to->sb_inopblog = from->sb_inopblog;
613 to->sb_agblklog = from->sb_agblklog;
614 to->sb_rextslog = from->sb_rextslog;
615 to->sb_inprogress = from->sb_inprogress;
616 to->sb_imax_pct = from->sb_imax_pct;
617 to->sb_icount = be64_to_cpu(from->sb_icount);
618 to->sb_ifree = be64_to_cpu(from->sb_ifree);
619 to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
620 to->sb_frextents = be64_to_cpu(from->sb_frextents);
621 to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
622 to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
623 to->sb_qflags = be16_to_cpu(from->sb_qflags);
624 to->sb_flags = from->sb_flags;
625 to->sb_shared_vn = from->sb_shared_vn;
626 to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
627 to->sb_unit = be32_to_cpu(from->sb_unit);
628 to->sb_width = be32_to_cpu(from->sb_width);
629 to->sb_dirblklog = from->sb_dirblklog;
630 to->sb_logsectlog = from->sb_logsectlog;
631 to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
632 to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
633 to->sb_features2 = be32_to_cpu(from->sb_features2);
634 to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
635 to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
636 to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
637 to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
638 to->sb_features_log_incompat =
639 be32_to_cpu(from->sb_features_log_incompat);
640 to->sb_pad = 0;
641 to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
642 to->sb_lsn = be64_to_cpu(from->sb_lsn);
643}
644
645static inline void
646xfs_sb_quota_to_disk(
647 xfs_dsb_t *to,
648 xfs_sb_t *from,
649 __int64_t *fields)
650{
651 __uint16_t qflags = from->sb_qflags;
652
653 if (*fields & XFS_SB_QFLAGS) {
654 /*
655 * The in-core version of sb_qflags do not have
656 * XFS_OQUOTA_* flags, whereas the on-disk version
657 * does. So, convert incore XFS_{PG}QUOTA_* flags
658 * to on-disk XFS_OQUOTA_* flags.
659 */
660 qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
661 XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
662
663 if (from->sb_qflags &
664 (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
665 qflags |= XFS_OQUOTA_ENFD;
666 if (from->sb_qflags &
667 (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
668 qflags |= XFS_OQUOTA_CHKD;
669 to->sb_qflags = cpu_to_be16(qflags);
670 *fields &= ~XFS_SB_QFLAGS;
671 }
672}
673
674/*
675 * Copy in core superblock to ondisk one.
676 *
677 * The fields argument is mask of superblock fields to copy.
678 */
679void
680xfs_sb_to_disk(
681 xfs_dsb_t *to,
682 xfs_sb_t *from,
683 __int64_t fields)
684{
685 xfs_caddr_t to_ptr = (xfs_caddr_t)to;
686 xfs_caddr_t from_ptr = (xfs_caddr_t)from;
687 xfs_sb_field_t f;
688 int first;
689 int size;
690
691 ASSERT(fields);
692 if (!fields)
693 return;
694
695 xfs_sb_quota_to_disk(to, from, &fields);
696 while (fields) {
697 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
698 first = xfs_sb_info[f].offset;
699 size = xfs_sb_info[f + 1].offset - first;
700
701 ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
702
703 if (size == 1 || xfs_sb_info[f].type == 1) {
704 memcpy(to_ptr + first, from_ptr + first, size);
705 } else {
706 switch (size) {
707 case 2:
708 *(__be16 *)(to_ptr + first) =
709 cpu_to_be16(*(__u16 *)(from_ptr + first));
710 break;
711 case 4:
712 *(__be32 *)(to_ptr + first) =
713 cpu_to_be32(*(__u32 *)(from_ptr + first));
714 break;
715 case 8:
716 *(__be64 *)(to_ptr + first) =
717 cpu_to_be64(*(__u64 *)(from_ptr + first));
718 break;
719 default:
720 ASSERT(0);
721 }
722 }
723
724 fields &= ~(1LL << f);
725 }
726}
727
728static int
729xfs_sb_verify(
730 struct xfs_buf *bp,
731 bool check_version)
732{
733 struct xfs_mount *mp = bp->b_target->bt_mount;
734 struct xfs_sb sb;
735
736 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
737
738 /*
739 * Only check the in progress field for the primary superblock as
740 * mkfs.xfs doesn't clear it from secondary superblocks.
741 */
742 return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
743 check_version);
744}
745
746/*
747 * If the superblock has the CRC feature bit set or the CRC field is non-null,
748 * check that the CRC is valid. We check the CRC field is non-null because a
749 * single bit error could clear the feature bit and unused parts of the
750 * superblock are supposed to be zero. Hence a non-null crc field indicates that
751 * we've potentially lost a feature bit and we should check it anyway.
752 */
753static void
754xfs_sb_read_verify(
755 struct xfs_buf *bp)
756{
757 struct xfs_mount *mp = bp->b_target->bt_mount;
758 struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
759 int error;
760
761 /*
762 * open code the version check to avoid needing to convert the entire
763 * superblock from disk order just to check the version number
764 */
765 if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
766 (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
767 XFS_SB_VERSION_5) ||
768 dsb->sb_crc != 0)) {
769
770 if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize),
771 offsetof(struct xfs_sb, sb_crc))) {
772 error = EFSCORRUPTED;
773 goto out_error;
774 }
775 }
776 error = xfs_sb_verify(bp, true);
777
778out_error:
779 if (error) {
780 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
781 xfs_buf_ioerror(bp, error);
782 }
783}
784
785/*
786 * We may be probed for a filesystem match, so we may not want to emit
787 * messages when the superblock buffer is not actually an XFS superblock.
788 * If we find an XFS superblock, the run a normal, noisy mount because we are
789 * really going to mount it and want to know about errors.
790 */
791static void
792xfs_sb_quiet_read_verify(
793 struct xfs_buf *bp)
794{
795 struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
796
797
798 if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
799 /* XFS filesystem, verify noisily! */
800 xfs_sb_read_verify(bp);
801 return;
802 }
803 /* quietly fail */
804 xfs_buf_ioerror(bp, EWRONGFS);
805}
806
807static void
808xfs_sb_write_verify(
809 struct xfs_buf *bp)
810{
811 struct xfs_mount *mp = bp->b_target->bt_mount;
812 struct xfs_buf_log_item *bip = bp->b_fspriv;
813 int error;
814
815 error = xfs_sb_verify(bp, false);
816 if (error) {
817 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
818 xfs_buf_ioerror(bp, error);
819 return;
820 }
821
822 if (!xfs_sb_version_hascrc(&mp->m_sb))
823 return;
824
825 if (bip)
826 XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
827
828 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
829 offsetof(struct xfs_sb, sb_crc));
830}
831
832const struct xfs_buf_ops xfs_sb_buf_ops = {
833 .verify_read = xfs_sb_read_verify,
834 .verify_write = xfs_sb_write_verify,
835};
836
837static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
838 .verify_read = xfs_sb_quiet_read_verify,
839 .verify_write = xfs_sb_write_verify,
840};
841
842/* 274/*
843 * xfs_readsb 275 * xfs_readsb
844 * 276 *
845 * Does the initial read of the superblock. 277 * Does the initial read of the superblock.
846 */ 278 */
847int 279int
848xfs_readsb(xfs_mount_t *mp, int flags) 280xfs_readsb(
281 struct xfs_mount *mp,
282 int flags)
849{ 283{
850 unsigned int sector_size; 284 unsigned int sector_size;
851 struct xfs_buf *bp; 285 struct xfs_buf *bp;
@@ -884,8 +318,8 @@ reread:
884 * Initialize the mount structure from the superblock. 318 * Initialize the mount structure from the superblock.
885 */ 319 */
886 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 320 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
887
888 xfs_sb_quota_from_disk(&mp->m_sb); 321 xfs_sb_quota_from_disk(&mp->m_sb);
322
889 /* 323 /*
890 * We must be able to do sector-sized and sector-aligned IO. 324 * We must be able to do sector-sized and sector-aligned IO.
891 */ 325 */
@@ -922,107 +356,6 @@ release_buf:
922 return error; 356 return error;
923} 357}
924 358
925
926/*
927 * xfs_mount_common
928 *
929 * Mount initialization code establishing various mount
930 * fields from the superblock associated with the given
931 * mount structure
932 */
933STATIC void
934xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
935{
936 mp->m_agfrotor = mp->m_agirotor = 0;
937 spin_lock_init(&mp->m_agirotor_lock);
938 mp->m_maxagi = mp->m_sb.sb_agcount;
939 mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
940 mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
941 mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
942 mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
943 mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
944 mp->m_blockmask = sbp->sb_blocksize - 1;
945 mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
946 mp->m_blockwmask = mp->m_blockwsize - 1;
947
948 mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
949 mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
950 mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
951 mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
952
953 mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
954 mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
955 mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
956 mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
957
958 mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
959 mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
960 mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
961 mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
962
963 mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
964 mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
965 sbp->sb_inopblock);
966 mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
967}
968
969/*
970 * xfs_initialize_perag_data
971 *
972 * Read in each per-ag structure so we can count up the number of
973 * allocated inodes, free inodes and used filesystem blocks as this
974 * information is no longer persistent in the superblock. Once we have
975 * this information, write it into the in-core superblock structure.
976 */
977STATIC int
978xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
979{
980 xfs_agnumber_t index;
981 xfs_perag_t *pag;
982 xfs_sb_t *sbp = &mp->m_sb;
983 uint64_t ifree = 0;
984 uint64_t ialloc = 0;
985 uint64_t bfree = 0;
986 uint64_t bfreelst = 0;
987 uint64_t btree = 0;
988 int error;
989
990 for (index = 0; index < agcount; index++) {
991 /*
992 * read the agf, then the agi. This gets us
993 * all the information we need and populates the
994 * per-ag structures for us.
995 */
996 error = xfs_alloc_pagf_init(mp, NULL, index, 0);
997 if (error)
998 return error;
999
1000 error = xfs_ialloc_pagi_init(mp, NULL, index);
1001 if (error)
1002 return error;
1003 pag = xfs_perag_get(mp, index);
1004 ifree += pag->pagi_freecount;
1005 ialloc += pag->pagi_count;
1006 bfree += pag->pagf_freeblks;
1007 bfreelst += pag->pagf_flcount;
1008 btree += pag->pagf_btreeblks;
1009 xfs_perag_put(pag);
1010 }
1011 /*
1012 * Overwrite incore superblock counters with just-read data
1013 */
1014 spin_lock(&mp->m_sb_lock);
1015 sbp->sb_ifree = ifree;
1016 sbp->sb_icount = ialloc;
1017 sbp->sb_fdblocks = bfree + bfreelst + btree;
1018 spin_unlock(&mp->m_sb_lock);
1019
1020 /* Fixup the per-cpu counters as well. */
1021 xfs_icsb_reinit_counters(mp);
1022
1023 return 0;
1024}
1025
1026/* 359/*
1027 * Update alignment values based on mount options and sb values 360 * Update alignment values based on mount options and sb values
1028 */ 361 */
@@ -1194,7 +527,7 @@ xfs_set_inoalignment(xfs_mount_t *mp)
1194} 527}
1195 528
1196/* 529/*
1197 * Check that the data (and log if separate) are an ok size. 530 * Check that the data (and log if separate) is an ok size.
1198 */ 531 */
1199STATIC int 532STATIC int
1200xfs_check_sizes(xfs_mount_t *mp) 533xfs_check_sizes(xfs_mount_t *mp)
@@ -1264,8 +597,7 @@ xfs_mount_reset_sbqflags(
1264 return 0; 597 return 0;
1265 598
1266 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 599 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
1267 error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), 600 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
1268 0, 0, XFS_DEFAULT_LOG_COUNT);
1269 if (error) { 601 if (error) {
1270 xfs_trans_cancel(tp, 0); 602 xfs_trans_cancel(tp, 0);
1271 xfs_alert(mp, "%s: Superblock update failed!", __func__); 603 xfs_alert(mp, "%s: Superblock update failed!", __func__);
@@ -1315,7 +647,7 @@ xfs_mountfs(
1315 uint quotaflags = 0; 647 uint quotaflags = 0;
1316 int error = 0; 648 int error = 0;
1317 649
1318 xfs_mount_common(mp, sbp); 650 xfs_sb_mount_common(mp, sbp);
1319 651
1320 /* 652 /*
1321 * Check for a mismatched features2 values. Older kernels 653 * Check for a mismatched features2 values. Older kernels
@@ -1400,7 +732,7 @@ xfs_mountfs(
1400 xfs_set_inoalignment(mp); 732 xfs_set_inoalignment(mp);
1401 733
1402 /* 734 /*
1403 * Check that the data (and log if separate) are an ok size. 735 * Check that the data (and log if separate) is an ok size.
1404 */ 736 */
1405 error = xfs_check_sizes(mp); 737 error = xfs_check_sizes(mp);
1406 if (error) 738 if (error)
@@ -1738,8 +1070,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
1738 return 0; 1070 return 0;
1739 1071
1740 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); 1072 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP);
1741 error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, 1073 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
1742 XFS_DEFAULT_LOG_COUNT);
1743 if (error) { 1074 if (error) {
1744 xfs_trans_cancel(tp, 0); 1075 xfs_trans_cancel(tp, 0);
1745 return error; 1076 return error;
@@ -1752,49 +1083,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
1752} 1083}
1753 1084
1754/* 1085/*
1755 * xfs_mod_sb() can be used to copy arbitrary changes to the 1086 * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
1756 * in-core superblock into the superblock buffer to be logged.
1757 * It does not provide the higher level of locking that is
1758 * needed to protect the in-core superblock from concurrent
1759 * access.
1760 */
1761void
1762xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
1763{
1764 xfs_buf_t *bp;
1765 int first;
1766 int last;
1767 xfs_mount_t *mp;
1768 xfs_sb_field_t f;
1769
1770 ASSERT(fields);
1771 if (!fields)
1772 return;
1773 mp = tp->t_mountp;
1774 bp = xfs_trans_getsb(tp, mp, 0);
1775 first = sizeof(xfs_sb_t);
1776 last = 0;
1777
1778 /* translate/copy */
1779
1780 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
1781
1782 /* find modified range */
1783 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
1784 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1785 last = xfs_sb_info[f + 1].offset - 1;
1786
1787 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
1788 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
1789 first = xfs_sb_info[f].offset;
1790
1791 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
1792 xfs_trans_log_buf(tp, bp, first, last);
1793}
1794
1795
1796/*
1797 * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
1798 * a delta to a specified field in the in-core superblock. Simply 1087 * a delta to a specified field in the in-core superblock. Simply
1799 * switch on the field indicated and apply the delta to that field. 1088 * switch on the field indicated and apply the delta to that field.
1800 * Fields are not allowed to dip below zero, so if the delta would 1089 * Fields are not allowed to dip below zero, so if the delta would
@@ -2101,8 +1390,7 @@ xfs_mount_log_sb(
2101 XFS_SB_VERSIONNUM)); 1390 XFS_SB_VERSIONNUM));
2102 1391
2103 tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); 1392 tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
2104 error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, 1393 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
2105 XFS_DEFAULT_LOG_COUNT);
2106 if (error) { 1394 if (error) {
2107 xfs_trans_cancel(tp, 0); 1395 xfs_trans_cancel(tp, 0);
2108 return error; 1396 return error;
@@ -2260,12 +1548,6 @@ xfs_icsb_init_counters(
2260 if (mp->m_sb_cnts == NULL) 1548 if (mp->m_sb_cnts == NULL)
2261 return -ENOMEM; 1549 return -ENOMEM;
2262 1550
2263#ifdef CONFIG_HOTPLUG_CPU
2264 mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
2265 mp->m_icsb_notifier.priority = 0;
2266 register_hotcpu_notifier(&mp->m_icsb_notifier);
2267#endif /* CONFIG_HOTPLUG_CPU */
2268
2269 for_each_online_cpu(i) { 1551 for_each_online_cpu(i) {
2270 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); 1552 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
2271 memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); 1553 memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
@@ -2278,6 +1560,13 @@ xfs_icsb_init_counters(
2278 * initial balance kicks us off correctly 1560 * initial balance kicks us off correctly
2279 */ 1561 */
2280 mp->m_icsb_counters = -1; 1562 mp->m_icsb_counters = -1;
1563
1564#ifdef CONFIG_HOTPLUG_CPU
1565 mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
1566 mp->m_icsb_notifier.priority = 0;
1567 register_hotcpu_notifier(&mp->m_icsb_notifier);
1568#endif /* CONFIG_HOTPLUG_CPU */
1569
2281 return 0; 1570 return 0;
2282} 1571}
2283 1572
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4e374d4a9189..1fa0584b5627 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -18,45 +18,7 @@
18#ifndef __XFS_MOUNT_H__ 18#ifndef __XFS_MOUNT_H__
19#define __XFS_MOUNT_H__ 19#define __XFS_MOUNT_H__
20 20
21typedef struct xfs_trans_reservations { 21#ifdef __KERNEL__
22 uint tr_write; /* extent alloc trans */
23 uint tr_itruncate; /* truncate trans */
24 uint tr_rename; /* rename trans */
25 uint tr_link; /* link trans */
26 uint tr_remove; /* unlink trans */
27 uint tr_symlink; /* symlink trans */
28 uint tr_create; /* create trans */
29 uint tr_mkdir; /* mkdir trans */
30 uint tr_ifree; /* inode free trans */
31 uint tr_ichange; /* inode update trans */
32 uint tr_growdata; /* fs data section grow trans */
33 uint tr_swrite; /* sync write inode trans */
34 uint tr_addafork; /* cvt inode to attributed trans */
35 uint tr_writeid; /* write setuid/setgid file */
36 uint tr_attrinval; /* attr fork buffer invalidation */
37 uint tr_attrsetm; /* set/create an attribute at mount time */
38 uint tr_attrsetrt; /* set/create an attribute at runtime */
39 uint tr_attrrm; /* remove an attribute */
40 uint tr_clearagi; /* clear bad agi unlinked ino bucket */
41 uint tr_growrtalloc; /* grow realtime allocations */
42 uint tr_growrtzero; /* grow realtime zeroing */
43 uint tr_growrtfree; /* grow realtime freeing */
44 uint tr_qm_sbchange; /* change quota flags */
45 uint tr_qm_setqlim; /* adjust quota limits */
46 uint tr_qm_dqalloc; /* allocate quota on disk */
47 uint tr_qm_quotaoff; /* turn quota off */
48 uint tr_qm_equotaoff;/* end of turn quota off */
49 uint tr_sb; /* modify superblock */
50} xfs_trans_reservations_t;
51
52#ifndef __KERNEL__
53
54#define xfs_daddr_to_agno(mp,d) \
55 ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
56#define xfs_daddr_to_agbno(mp,d) \
57 ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
58
59#else /* __KERNEL__ */
60 22
61struct xlog; 23struct xlog;
62struct xfs_inode; 24struct xfs_inode;
@@ -174,7 +136,7 @@ typedef struct xfs_mount {
174 int m_ialloc_blks; /* blocks in inode allocation */ 136 int m_ialloc_blks; /* blocks in inode allocation */
175 int m_inoalign_mask;/* mask sb_inoalignmt if used */ 137 int m_inoalign_mask;/* mask sb_inoalignmt if used */
176 uint m_qflags; /* quota status flags */ 138 uint m_qflags; /* quota status flags */
177 xfs_trans_reservations_t m_reservations;/* precomputed res values */ 139 struct xfs_trans_resv m_resv; /* precomputed res values */
178 __uint64_t m_maxicount; /* maximum inode count */ 140 __uint64_t m_maxicount; /* maximum inode count */
179 __uint64_t m_resblks; /* total reserved blocks */ 141 __uint64_t m_resblks; /* total reserved blocks */
180 __uint64_t m_resblks_avail;/* available reserved blocks */ 142 __uint64_t m_resblks_avail;/* available reserved blocks */
@@ -330,14 +292,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
330} 292}
331 293
332/* 294/*
333 * perag get/put wrappers for ref counting
334 */
335struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
336struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno,
337 int tag);
338void xfs_perag_put(struct xfs_perag *pag);
339
340/*
341 * Per-cpu superblock locking functions 295 * Per-cpu superblock locking functions
342 */ 296 */
343#ifdef HAVE_PERCPU_SB 297#ifdef HAVE_PERCPU_SB
@@ -366,9 +320,63 @@ typedef struct xfs_mod_sb {
366 int64_t msb_delta; /* Change to make to specified field */ 320 int64_t msb_delta; /* Change to make to specified field */
367} xfs_mod_sb_t; 321} xfs_mod_sb_t;
368 322
323/*
324 * Per-ag incore structure, copies of information in agf and agi, to improve the
325 * performance of allocation group selection. This is defined for the kernel
326 * only, and hence is defined here instead of in xfs_ag.h. You need the struct
327 * xfs_mount to be defined to look up a xfs_perag anyway (via mp->m_perag_tree),
328 * so this doesn't introduce any strange header file dependencies.
329 */
330typedef struct xfs_perag {
331 struct xfs_mount *pag_mount; /* owner filesystem */
332 xfs_agnumber_t pag_agno; /* AG this structure belongs to */
333 atomic_t pag_ref; /* perag reference count */
334 char pagf_init; /* this agf's entry is initialized */
335 char pagi_init; /* this agi's entry is initialized */
336 char pagf_metadata; /* the agf is preferred to be metadata */
337 char pagi_inodeok; /* The agi is ok for inodes */
338 __uint8_t pagf_levels[XFS_BTNUM_AGF];
339 /* # of levels in bno & cnt btree */
340 __uint32_t pagf_flcount; /* count of blocks in freelist */
341 xfs_extlen_t pagf_freeblks; /* total free blocks */
342 xfs_extlen_t pagf_longest; /* longest free space */
343 __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */
344 xfs_agino_t pagi_freecount; /* number of free inodes */
345 xfs_agino_t pagi_count; /* number of allocated inodes */
346
347 /*
348 * Inode allocation search lookup optimisation.
349 * If the pagino matches, the search for new inodes
350 * doesn't need to search the near ones again straight away
351 */
352 xfs_agino_t pagl_pagino;
353 xfs_agino_t pagl_leftrec;
354 xfs_agino_t pagl_rightrec;
355 spinlock_t pagb_lock; /* lock for pagb_tree */
356 struct rb_root pagb_tree; /* ordered tree of busy extents */
357
358 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
359
360 spinlock_t pag_ici_lock; /* incore inode cache lock */
361 struct radix_tree_root pag_ici_root; /* incore inode cache root */
362 int pag_ici_reclaimable; /* reclaimable inodes */
363 struct mutex pag_ici_reclaim_lock; /* serialisation point */
364 unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */
365
366 /* buffer cache index */
367 spinlock_t pag_buf_lock; /* lock for pag_buf_tree */
368 struct rb_root pag_buf_tree; /* ordered tree of active buffers */
369
370 /* for rcu-safe freeing */
371 struct rcu_head rcu_head;
372 int pagb_count; /* pagb slots in use */
373} xfs_perag_t;
374
369extern int xfs_log_sbcount(xfs_mount_t *); 375extern int xfs_log_sbcount(xfs_mount_t *);
370extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); 376extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
371extern int xfs_mountfs(xfs_mount_t *mp); 377extern int xfs_mountfs(xfs_mount_t *mp);
378extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
379 xfs_agnumber_t *maxagi);
372 380
373extern void xfs_unmountfs(xfs_mount_t *); 381extern void xfs_unmountfs(xfs_mount_t *);
374extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 382extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
@@ -387,13 +395,4 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
387 395
388#endif /* __KERNEL__ */ 396#endif /* __KERNEL__ */
389 397
390extern void xfs_sb_calc_crc(struct xfs_buf *);
391extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
392extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
393 xfs_agnumber_t *);
394extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
395extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
396
397extern const struct xfs_buf_ops xfs_sb_buf_ops;
398
399#endif /* __XFS_MOUNT_H__ */ 398#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index d320794d03ce..6218a0aeeeea 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_bit.h" 21#include "xfs_bit.h"
21#include "xfs_log.h" 22#include "xfs_log.h"
22#include "xfs_trans.h" 23#include "xfs_trans.h"
@@ -37,7 +38,6 @@
37#include "xfs_attr.h" 38#include "xfs_attr.h"
38#include "xfs_buf_item.h" 39#include "xfs_buf_item.h"
39#include "xfs_trans_space.h" 40#include "xfs_trans_space.h"
40#include "xfs_utils.h"
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h" 43#include "xfs_icache.h"
@@ -834,21 +834,52 @@ xfs_qm_qino_alloc(
834 int error; 834 int error;
835 int committed; 835 int committed;
836 836
837 *ip = NULL;
838 /*
839 * With superblock that doesn't have separate pquotino, we
840 * share an inode between gquota and pquota. If the on-disk
841 * superblock has GQUOTA and the filesystem is now mounted
842 * with PQUOTA, just use sb_gquotino for sb_pquotino and
843 * vice-versa.
844 */
845 if (!xfs_sb_version_has_pquotino(&mp->m_sb) &&
846 (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) {
847 xfs_ino_t ino = NULLFSINO;
848
849 if ((flags & XFS_QMOPT_PQUOTA) &&
850 (mp->m_sb.sb_gquotino != NULLFSINO)) {
851 ino = mp->m_sb.sb_gquotino;
852 ASSERT(mp->m_sb.sb_pquotino == NULLFSINO);
853 } else if ((flags & XFS_QMOPT_GQUOTA) &&
854 (mp->m_sb.sb_pquotino != NULLFSINO)) {
855 ino = mp->m_sb.sb_pquotino;
856 ASSERT(mp->m_sb.sb_gquotino == NULLFSINO);
857 }
858 if (ino != NULLFSINO) {
859 error = xfs_iget(mp, NULL, ino, 0, 0, ip);
860 if (error)
861 return error;
862 mp->m_sb.sb_gquotino = NULLFSINO;
863 mp->m_sb.sb_pquotino = NULLFSINO;
864 }
865 }
866
837 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); 867 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
838 if ((error = xfs_trans_reserve(tp, 868 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create,
839 XFS_QM_QINOCREATE_SPACE_RES(mp), 869 XFS_QM_QINOCREATE_SPACE_RES(mp), 0);
840 XFS_CREATE_LOG_RES(mp), 0, 870 if (error) {
841 XFS_TRANS_PERM_LOG_RES,
842 XFS_CREATE_LOG_COUNT))) {
843 xfs_trans_cancel(tp, 0); 871 xfs_trans_cancel(tp, 0);
844 return error; 872 return error;
845 } 873 }
846 874
847 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); 875 if (!*ip) {
848 if (error) { 876 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
849 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 877 &committed);
850 XFS_TRANS_ABORT); 878 if (error) {
851 return error; 879 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
880 XFS_TRANS_ABORT);
881 return error;
882 }
852 } 883 }
853 884
854 /* 885 /*
@@ -860,21 +891,25 @@ xfs_qm_qino_alloc(
860 if (flags & XFS_QMOPT_SBVERSION) { 891 if (flags & XFS_QMOPT_SBVERSION) {
861 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); 892 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
862 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 893 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
863 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == 894 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) ==
864 (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 895 (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
865 XFS_SB_GQUOTINO | XFS_SB_QFLAGS)); 896 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
897 XFS_SB_QFLAGS));
866 898
867 xfs_sb_version_addquota(&mp->m_sb); 899 xfs_sb_version_addquota(&mp->m_sb);
868 mp->m_sb.sb_uquotino = NULLFSINO; 900 mp->m_sb.sb_uquotino = NULLFSINO;
869 mp->m_sb.sb_gquotino = NULLFSINO; 901 mp->m_sb.sb_gquotino = NULLFSINO;
902 mp->m_sb.sb_pquotino = NULLFSINO;
870 903
871 /* qflags will get updated _after_ quotacheck */ 904 /* qflags will get updated fully _after_ quotacheck */
872 mp->m_sb.sb_qflags = 0; 905 mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT;
873 } 906 }
874 if (flags & XFS_QMOPT_UQUOTA) 907 if (flags & XFS_QMOPT_UQUOTA)
875 mp->m_sb.sb_uquotino = (*ip)->i_ino; 908 mp->m_sb.sb_uquotino = (*ip)->i_ino;
876 else 909 else if (flags & XFS_QMOPT_GQUOTA)
877 mp->m_sb.sb_gquotino = (*ip)->i_ino; 910 mp->m_sb.sb_gquotino = (*ip)->i_ino;
911 else
912 mp->m_sb.sb_pquotino = (*ip)->i_ino;
878 spin_unlock(&mp->m_sb_lock); 913 spin_unlock(&mp->m_sb_lock);
879 xfs_mod_sb(tp, sbfields); 914 xfs_mod_sb(tp, sbfields);
880 915
@@ -1484,11 +1519,10 @@ xfs_qm_init_quotainos(
1484 if (error) 1519 if (error)
1485 goto error_rele; 1520 goto error_rele;
1486 } 1521 }
1487 /* XXX: Use gquotino for now */
1488 if (XFS_IS_PQUOTA_ON(mp) && 1522 if (XFS_IS_PQUOTA_ON(mp) &&
1489 mp->m_sb.sb_gquotino != NULLFSINO) { 1523 mp->m_sb.sb_pquotino != NULLFSINO) {
1490 ASSERT(mp->m_sb.sb_gquotino > 0); 1524 ASSERT(mp->m_sb.sb_pquotino > 0);
1491 error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 1525 error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
1492 0, 0, &pip); 1526 0, 0, &pip);
1493 if (error) 1527 if (error)
1494 goto error_rele; 1528 goto error_rele;
@@ -1496,7 +1530,8 @@ xfs_qm_init_quotainos(
1496 } else { 1530 } else {
1497 flags |= XFS_QMOPT_SBVERSION; 1531 flags |= XFS_QMOPT_SBVERSION;
1498 sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | 1532 sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1499 XFS_SB_GQUOTINO | XFS_SB_QFLAGS); 1533 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
1534 XFS_SB_QFLAGS);
1500 } 1535 }
1501 1536
1502 /* 1537 /*
@@ -1524,9 +1559,8 @@ xfs_qm_init_quotainos(
1524 flags &= ~XFS_QMOPT_SBVERSION; 1559 flags &= ~XFS_QMOPT_SBVERSION;
1525 } 1560 }
1526 if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) { 1561 if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
1527 /* XXX: Use XFS_SB_GQUOTINO for now */
1528 error = xfs_qm_qino_alloc(mp, &pip, 1562 error = xfs_qm_qino_alloc(mp, &pip,
1529 sbflags | XFS_SB_GQUOTINO, 1563 sbflags | XFS_SB_PQUOTINO,
1530 flags | XFS_QMOPT_PQUOTA); 1564 flags | XFS_QMOPT_PQUOTA);
1531 if (error) 1565 if (error)
1532 goto error_rele; 1566 goto error_rele;
@@ -1704,8 +1738,7 @@ xfs_qm_write_sb_changes(
1704 int error; 1738 int error;
1705 1739
1706 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 1740 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
1707 error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), 1741 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
1708 0, 0, XFS_DEFAULT_LOG_COUNT);
1709 if (error) { 1742 if (error) {
1710 xfs_trans_cancel(tp, 0); 1743 xfs_trans_cancel(tp, 0);
1711 return error; 1744 return error;
@@ -1734,8 +1767,8 @@ xfs_qm_write_sb_changes(
1734int 1767int
1735xfs_qm_vop_dqalloc( 1768xfs_qm_vop_dqalloc(
1736 struct xfs_inode *ip, 1769 struct xfs_inode *ip,
1737 uid_t uid, 1770 xfs_dqid_t uid,
1738 gid_t gid, 1771 xfs_dqid_t gid,
1739 prid_t prid, 1772 prid_t prid,
1740 uint flags, 1773 uint flags,
1741 struct xfs_dquot **O_udqpp, 1774 struct xfs_dquot **O_udqpp,
@@ -1782,7 +1815,7 @@ xfs_qm_vop_dqalloc(
1782 * holding ilock. 1815 * holding ilock.
1783 */ 1816 */
1784 xfs_iunlock(ip, lockflags); 1817 xfs_iunlock(ip, lockflags);
1785 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, 1818 error = xfs_qm_dqget(mp, NULL, uid,
1786 XFS_DQ_USER, 1819 XFS_DQ_USER,
1787 XFS_QMOPT_DQALLOC | 1820 XFS_QMOPT_DQALLOC |
1788 XFS_QMOPT_DOWARN, 1821 XFS_QMOPT_DOWARN,
@@ -1809,7 +1842,7 @@ xfs_qm_vop_dqalloc(
1809 if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { 1842 if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
1810 if (ip->i_d.di_gid != gid) { 1843 if (ip->i_d.di_gid != gid) {
1811 xfs_iunlock(ip, lockflags); 1844 xfs_iunlock(ip, lockflags);
1812 error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, 1845 error = xfs_qm_dqget(mp, NULL, gid,
1813 XFS_DQ_GROUP, 1846 XFS_DQ_GROUP,
1814 XFS_QMOPT_DQALLOC | 1847 XFS_QMOPT_DQALLOC |
1815 XFS_QMOPT_DOWARN, 1848 XFS_QMOPT_DOWARN,
@@ -1943,7 +1976,7 @@ xfs_qm_vop_chown_reserve(
1943 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; 1976 XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
1944 1977
1945 if (XFS_IS_UQUOTA_ON(mp) && udqp && 1978 if (XFS_IS_UQUOTA_ON(mp) && udqp &&
1946 ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) { 1979 ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
1947 udq_delblks = udqp; 1980 udq_delblks = udqp;
1948 /* 1981 /*
1949 * If there are delayed allocation blocks, then we have to 1982 * If there are delayed allocation blocks, then we have to
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 579d6a02a5b6..670cd4464070 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -160,6 +160,8 @@ extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
160 struct fs_disk_quota *); 160 struct fs_disk_quota *);
161extern int xfs_qm_scall_getqstat(struct xfs_mount *, 161extern int xfs_qm_scall_getqstat(struct xfs_mount *,
162 struct fs_quota_stat *); 162 struct fs_quota_stat *);
163extern int xfs_qm_scall_getqstatv(struct xfs_mount *,
164 struct fs_quota_statv *);
163extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); 165extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
164extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); 166extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint);
165 167
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 437a52d91f6d..3af50ccdfac1 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_trans.h" 22#include "xfs_trans.h"
22#include "xfs_sb.h" 23#include "xfs_sb.h"
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index e4f8b2d6f38b..8174aad0b388 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -20,6 +20,7 @@
20 20
21#include "xfs.h" 21#include "xfs.h"
22#include "xfs_fs.h" 22#include "xfs_fs.h"
23#include "xfs_format.h"
23#include "xfs_bit.h" 24#include "xfs_bit.h"
24#include "xfs_log.h" 25#include "xfs_log.h"
25#include "xfs_trans.h" 26#include "xfs_trans.h"
@@ -37,7 +38,6 @@
37#include "xfs_error.h" 38#include "xfs_error.h"
38#include "xfs_attr.h" 39#include "xfs_attr.h"
39#include "xfs_buf_item.h" 40#include "xfs_buf_item.h"
40#include "xfs_utils.h"
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h" 43#include "xfs_icache.h"
@@ -247,9 +247,7 @@ xfs_qm_scall_trunc_qfile(
247 xfs_ilock(ip, XFS_IOLOCK_EXCL); 247 xfs_ilock(ip, XFS_IOLOCK_EXCL);
248 248
249 tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); 249 tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
250 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 250 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
251 XFS_TRANS_PERM_LOG_RES,
252 XFS_ITRUNCATE_LOG_COUNT);
253 if (error) { 251 if (error) {
254 xfs_trans_cancel(tp, 0); 252 xfs_trans_cancel(tp, 0);
255 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 253 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -296,8 +294,10 @@ xfs_qm_scall_trunc_qfiles(
296 294
297 if (flags & XFS_DQ_USER) 295 if (flags & XFS_DQ_USER)
298 error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); 296 error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
299 if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) 297 if (flags & XFS_DQ_GROUP)
300 error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); 298 error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
299 if (flags & XFS_DQ_PROJ)
300 error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
301 301
302 return error ? error : error2; 302 return error ? error : error2;
303} 303}
@@ -404,6 +404,7 @@ xfs_qm_scall_quotaon(
404 404
405/* 405/*
406 * Return quota status information, such as uquota-off, enforcements, etc. 406 * Return quota status information, such as uquota-off, enforcements, etc.
407 * for Q_XGETQSTAT command.
407 */ 408 */
408int 409int
409xfs_qm_scall_getqstat( 410xfs_qm_scall_getqstat(
@@ -413,8 +414,10 @@ xfs_qm_scall_getqstat(
413 struct xfs_quotainfo *q = mp->m_quotainfo; 414 struct xfs_quotainfo *q = mp->m_quotainfo;
414 struct xfs_inode *uip = NULL; 415 struct xfs_inode *uip = NULL;
415 struct xfs_inode *gip = NULL; 416 struct xfs_inode *gip = NULL;
417 struct xfs_inode *pip = NULL;
416 bool tempuqip = false; 418 bool tempuqip = false;
417 bool tempgqip = false; 419 bool tempgqip = false;
420 bool temppqip = false;
418 421
419 memset(out, 0, sizeof(fs_quota_stat_t)); 422 memset(out, 0, sizeof(fs_quota_stat_t));
420 423
@@ -424,16 +427,106 @@ xfs_qm_scall_getqstat(
424 out->qs_gquota.qfs_ino = NULLFSINO; 427 out->qs_gquota.qfs_ino = NULLFSINO;
425 return (0); 428 return (0);
426 } 429 }
430
431 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
432 (XFS_ALL_QUOTA_ACCT|
433 XFS_ALL_QUOTA_ENFD));
434 if (q) {
435 uip = q->qi_uquotaip;
436 gip = q->qi_gquotaip;
437 pip = q->qi_pquotaip;
438 }
439 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
440 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
441 0, 0, &uip) == 0)
442 tempuqip = true;
443 }
444 if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
445 if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
446 0, 0, &gip) == 0)
447 tempgqip = true;
448 }
449 /*
450 * Q_XGETQSTAT doesn't have room for both group and project quotas.
451 * So, allow the project quota values to be copied out only if
452 * there is no group quota information available.
453 */
454 if (!gip) {
455 if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) {
456 if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
457 0, 0, &pip) == 0)
458 temppqip = true;
459 }
460 } else
461 pip = NULL;
462 if (uip) {
463 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
464 out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
465 out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
466 if (tempuqip)
467 IRELE(uip);
468 }
469
470 if (gip) {
471 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
472 out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
473 out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
474 if (tempgqip)
475 IRELE(gip);
476 }
477 if (pip) {
478 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
479 out->qs_gquota.qfs_nblks = pip->i_d.di_nblocks;
480 out->qs_gquota.qfs_nextents = pip->i_d.di_nextents;
481 if (temppqip)
482 IRELE(pip);
483 }
484 if (q) {
485 out->qs_incoredqs = q->qi_dquots;
486 out->qs_btimelimit = q->qi_btimelimit;
487 out->qs_itimelimit = q->qi_itimelimit;
488 out->qs_rtbtimelimit = q->qi_rtbtimelimit;
489 out->qs_bwarnlimit = q->qi_bwarnlimit;
490 out->qs_iwarnlimit = q->qi_iwarnlimit;
491 }
492 return 0;
493}
494
495/*
496 * Return quota status information, such as uquota-off, enforcements, etc.
497 * for Q_XGETQSTATV command, to support separate project quota field.
498 */
499int
500xfs_qm_scall_getqstatv(
501 struct xfs_mount *mp,
502 struct fs_quota_statv *out)
503{
504 struct xfs_quotainfo *q = mp->m_quotainfo;
505 struct xfs_inode *uip = NULL;
506 struct xfs_inode *gip = NULL;
507 struct xfs_inode *pip = NULL;
508 bool tempuqip = false;
509 bool tempgqip = false;
510 bool temppqip = false;
511
512 if (!xfs_sb_version_hasquota(&mp->m_sb)) {
513 out->qs_uquota.qfs_ino = NULLFSINO;
514 out->qs_gquota.qfs_ino = NULLFSINO;
515 out->qs_pquota.qfs_ino = NULLFSINO;
516 return (0);
517 }
518
427 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & 519 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
428 (XFS_ALL_QUOTA_ACCT| 520 (XFS_ALL_QUOTA_ACCT|
429 XFS_ALL_QUOTA_ENFD)); 521 XFS_ALL_QUOTA_ENFD));
430 out->qs_pad = 0;
431 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; 522 out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
432 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; 523 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
524 out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino;
433 525
434 if (q) { 526 if (q) {
435 uip = q->qi_uquotaip; 527 uip = q->qi_uquotaip;
436 gip = q->qi_gquotaip; 528 gip = q->qi_gquotaip;
529 pip = q->qi_pquotaip;
437 } 530 }
438 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { 531 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
439 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 532 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
@@ -445,18 +538,30 @@ xfs_qm_scall_getqstat(
445 0, 0, &gip) == 0) 538 0, 0, &gip) == 0)
446 tempgqip = true; 539 tempgqip = true;
447 } 540 }
541 if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) {
542 if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino,
543 0, 0, &pip) == 0)
544 temppqip = true;
545 }
448 if (uip) { 546 if (uip) {
449 out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; 547 out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
450 out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; 548 out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
451 if (tempuqip) 549 if (tempuqip)
452 IRELE(uip); 550 IRELE(uip);
453 } 551 }
552
454 if (gip) { 553 if (gip) {
455 out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; 554 out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
456 out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; 555 out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
457 if (tempgqip) 556 if (tempgqip)
458 IRELE(gip); 557 IRELE(gip);
459 } 558 }
559 if (pip) {
560 out->qs_pquota.qfs_nblks = pip->i_d.di_nblocks;
561 out->qs_pquota.qfs_nextents = pip->i_d.di_nextents;
562 if (temppqip)
563 IRELE(pip);
564 }
460 if (q) { 565 if (q) {
461 out->qs_incoredqs = q->qi_dquots; 566 out->qs_incoredqs = q->qi_dquots;
462 out->qs_btimelimit = q->qi_btimelimit; 567 out->qs_btimelimit = q->qi_btimelimit;
@@ -515,8 +620,7 @@ xfs_qm_scall_setqlim(
515 xfs_dqunlock(dqp); 620 xfs_dqunlock(dqp);
516 621
517 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); 622 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
518 error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), 623 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0);
519 0, 0, XFS_DEFAULT_LOG_COUNT);
520 if (error) { 624 if (error) {
521 xfs_trans_cancel(tp, 0); 625 xfs_trans_cancel(tp, 0);
522 goto out_rele; 626 goto out_rele;
@@ -650,8 +754,7 @@ xfs_qm_log_quotaoff_end(
650 754
651 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); 755 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
652 756
653 error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_END_LOG_RES(mp), 757 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
654 0, 0, XFS_DEFAULT_LOG_COUNT);
655 if (error) { 758 if (error) {
656 xfs_trans_cancel(tp, 0); 759 xfs_trans_cancel(tp, 0);
657 return (error); 760 return (error);
@@ -684,8 +787,7 @@ xfs_qm_log_quotaoff(
684 uint oldsbqflag=0; 787 uint oldsbqflag=0;
685 788
686 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); 789 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
687 error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_LOG_RES(mp), 790 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0);
688 0, 0, XFS_DEFAULT_LOG_COUNT);
689 if (error) 791 if (error)
690 goto error0; 792 goto error0;
691 793
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index b14f42c714b6..e7d84d2d8683 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -18,267 +18,14 @@
18#ifndef __XFS_QUOTA_H__ 18#ifndef __XFS_QUOTA_H__
19#define __XFS_QUOTA_H__ 19#define __XFS_QUOTA_H__
20 20
21struct xfs_trans; 21#include "xfs_quota_defs.h"
22
23/*
24 * The ondisk form of a dquot structure.
25 */
26#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
27#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */
28
29/*
30 * uid_t and gid_t are hard-coded to 32 bits in the inode.
31 * Hence, an 'id' in a dquot is 32 bits..
32 */
33typedef __uint32_t xfs_dqid_t;
34
35/*
36 * Even though users may not have quota limits occupying all 64-bits,
37 * they may need 64-bit accounting. Hence, 64-bit quota-counters,
38 * and quota-limits. This is a waste in the common case, but hey ...
39 */
40typedef __uint64_t xfs_qcnt_t;
41typedef __uint16_t xfs_qwarncnt_t;
42
43/*
44 * This is the main portion of the on-disk representation of quota
45 * information for a user. This is the q_core of the xfs_dquot_t that
46 * is kept in kernel memory. We pad this with some more expansion room
47 * to construct the on disk structure.
48 */
49typedef struct xfs_disk_dquot {
50 __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
51 __u8 d_version; /* dquot version */
52 __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */
53 __be32 d_id; /* user,project,group id */
54 __be64 d_blk_hardlimit;/* absolute limit on disk blks */
55 __be64 d_blk_softlimit;/* preferred limit on disk blks */
56 __be64 d_ino_hardlimit;/* maximum # allocated inodes */
57 __be64 d_ino_softlimit;/* preferred inode limit */
58 __be64 d_bcount; /* disk blocks owned by the user */
59 __be64 d_icount; /* inodes owned by the user */
60 __be32 d_itimer; /* zero if within inode limits if not,
61 this is when we refuse service */
62 __be32 d_btimer; /* similar to above; for disk blocks */
63 __be16 d_iwarns; /* warnings issued wrt num inodes */
64 __be16 d_bwarns; /* warnings issued wrt disk blocks */
65 __be32 d_pad0; /* 64 bit align */
66 __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */
67 __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */
68 __be64 d_rtbcount; /* realtime blocks owned */
69 __be32 d_rtbtimer; /* similar to above; for RT disk blocks */
70 __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
71 __be16 d_pad;
72} xfs_disk_dquot_t;
73
74/*
75 * This is what goes on disk. This is separated from the xfs_disk_dquot because
76 * carrying the unnecessary padding would be a waste of memory.
77 */
78typedef struct xfs_dqblk {
79 xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
80 char dd_fill[4]; /* filling for posterity */
81
82 /*
83 * These two are only present on filesystems with the CRC bits set.
84 */
85 __be32 dd_crc; /* checksum */
86 __be64 dd_lsn; /* last modification in log */
87 uuid_t dd_uuid; /* location information */
88} xfs_dqblk_t;
89
90#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc)
91
92/*
93 * flags for q_flags field in the dquot.
94 */
95#define XFS_DQ_USER 0x0001 /* a user quota */
96#define XFS_DQ_PROJ 0x0002 /* project quota */
97#define XFS_DQ_GROUP 0x0004 /* a group quota */
98#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
99#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */
100
101#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
102
103#define XFS_DQ_FLAGS \
104 { XFS_DQ_USER, "USER" }, \
105 { XFS_DQ_PROJ, "PROJ" }, \
106 { XFS_DQ_GROUP, "GROUP" }, \
107 { XFS_DQ_DIRTY, "DIRTY" }, \
108 { XFS_DQ_FREEING, "FREEING" }
109
110/*
111 * We have the possibility of all three quota types being active at once, and
112 * hence free space modification requires modification of all three current
113 * dquots in a single transaction. For this case we need to have a reservation
114 * of at least 3 dquots.
115 *
116 * However, a chmod operation can change both UID and GID in a single
117 * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
118 * modified. Hence for this case we need to reserve space for at least 4 dquots.
119 *
120 * And in the worst case, there's a rename operation that can be modifying up to
121 * 4 inodes with dquots attached to them. In reality, the only inodes that can
122 * have their dquots modified are the source and destination directory inodes
123 * due to directory name creation and removal. That can require space allocation
124 * and/or freeing on both directory inodes, and hence all three dquots on each
125 * inode can be modified. And if the directories are world writeable, all the
126 * dquots can be unique and so 6 dquots can be modified....
127 *
128 * And, of course, we also need to take into account the dquot log format item
129 * used to describe each dquot.
130 */
131#define XFS_DQUOT_LOGRES(mp) \
132 ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
133
134/*
135 * These are the structures used to lay out dquots and quotaoff
136 * records on the log. Quite similar to those of inodes.
137 */
138
139/*
140 * log format struct for dquots.
141 * The first two fields must be the type and size fitting into
142 * 32 bits : log_recovery code assumes that.
143 */
144typedef struct xfs_dq_logformat {
145 __uint16_t qlf_type; /* dquot log item type */
146 __uint16_t qlf_size; /* size of this item */
147 xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
148 __int64_t qlf_blkno; /* blkno of dquot buffer */
149 __int32_t qlf_len; /* len of dquot buffer */
150 __uint32_t qlf_boffset; /* off of dquot in buffer */
151} xfs_dq_logformat_t;
152
153/*
154 * log format struct for QUOTAOFF records.
155 * The first two fields must be the type and size fitting into
156 * 32 bits : log_recovery code assumes that.
157 * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer
158 * to the first and ensures that the first logitem is taken out of the AIL
159 * only when the last one is securely committed.
160 */
161typedef struct xfs_qoff_logformat {
162 unsigned short qf_type; /* quotaoff log item type */
163 unsigned short qf_size; /* size of this item */
164 unsigned int qf_flags; /* USR and/or GRP */
165 char qf_pad[12]; /* padding for future */
166} xfs_qoff_logformat_t;
167
168
169/*
170 * Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
171 */
172#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */
173#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */
174#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */
175#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */
176#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */
177#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */
178#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */
179
180/*
181 * Conversion to and from the combined OQUOTA flag (if necessary)
182 * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk()
183 */
184#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */
185#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */
186#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */
187#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */
188
189/*
190 * Quota Accounting/Enforcement flags
191 */
192#define XFS_ALL_QUOTA_ACCT \
193 (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
194#define XFS_ALL_QUOTA_ENFD \
195 (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD)
196#define XFS_ALL_QUOTA_CHKD \
197 (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD)
198
199#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
200#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
201#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
202#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
203#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
204#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
205#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
206
207/*
208 * Incore only flags for quotaoff - these bits get cleared when quota(s)
209 * are in the process of getting turned off. These flags are in m_qflags but
210 * never in sb_qflags.
211 */
212#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
213#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
214#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
215#define XFS_ALL_QUOTA_ACTIVE \
216 (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
217 22
218/* 23/*
219 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees 24 * Kernel only quota definitions and functions
220 * quota will be not be switched off as long as that inode lock is held.
221 */ 25 */
222#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
223 XFS_GQUOTA_ACTIVE | \
224 XFS_PQUOTA_ACTIVE))
225#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
226 XFS_PQUOTA_ACTIVE))
227#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
228#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
229#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
230 26
231/* 27struct xfs_trans;
232 * Flags to tell various functions what to do. Not all of these are meaningful
233 * to a single function. None of these XFS_QMOPT_* flags are meant to have
234 * persistent values (ie. their values can and will change between versions)
235 */
236#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
237#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
238#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
239#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
240#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
241#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
242#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
243#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
244#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
245
246/*
247 * flags to xfs_trans_mod_dquot to indicate which field needs to be
248 * modified.
249 */
250#define XFS_QMOPT_RES_REGBLKS 0x0010000
251#define XFS_QMOPT_RES_RTBLKS 0x0020000
252#define XFS_QMOPT_BCOUNT 0x0040000
253#define XFS_QMOPT_ICOUNT 0x0080000
254#define XFS_QMOPT_RTBCOUNT 0x0100000
255#define XFS_QMOPT_DELBCOUNT 0x0200000
256#define XFS_QMOPT_DELRTBCOUNT 0x0400000
257#define XFS_QMOPT_RES_INOS 0x0800000
258
259/*
260 * flags for dqalloc.
261 */
262#define XFS_QMOPT_INHERIT 0x1000000
263
264/*
265 * flags to xfs_trans_mod_dquot.
266 */
267#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS
268#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
269#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS
270#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT
271#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
272#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT
273#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT
274#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
275
276
277#define XFS_QMOPT_QUOTALL \
278 (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
279#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
280 28
281#ifdef __KERNEL__
282/* 29/*
283 * This check is done typically without holding the inode lock; 30 * This check is done typically without holding the inode lock;
284 * that may seem racy, but it is harmless in the context that it is used. 31 * that may seem racy, but it is harmless in the context that it is used.
@@ -301,13 +48,6 @@ typedef struct xfs_qoff_logformat {
301 (XFS_IS_PQUOTA_ON(mp) && \ 48 (XFS_IS_PQUOTA_ON(mp) && \
302 (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0)) 49 (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
303 50
304#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\
305 XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\
306 XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\
307 XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\
308 XFS_PQUOTA_CHKD)
309
310
311/* 51/*
312 * The structure kept inside the xfs_trans_t keep track of dquot changes 52 * The structure kept inside the xfs_trans_t keep track of dquot changes
313 * within a transaction and apply them later. 53 * within a transaction and apply them later.
@@ -340,8 +80,9 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
340 struct xfs_mount *, struct xfs_dquot *, 80 struct xfs_mount *, struct xfs_dquot *,
341 struct xfs_dquot *, struct xfs_dquot *, long, long, uint); 81 struct xfs_dquot *, struct xfs_dquot *, long, long, uint);
342 82
343extern int xfs_qm_vop_dqalloc(struct xfs_inode *, uid_t, gid_t, prid_t, uint, 83extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
344 struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **); 84 prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
85 struct xfs_dquot **);
345extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, 86extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
346 struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *); 87 struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *);
347extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); 88extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **);
@@ -362,9 +103,9 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
362 103
363#else 104#else
364static inline int 105static inline int
365xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, 106xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
366 uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp, 107 prid_t prid, uint flags, struct xfs_dquot **udqp,
367 struct xfs_dquot **pdqp) 108 struct xfs_dquot **gdqp, struct xfs_dquot **pdqp)
368{ 109{
369 *udqp = NULL; 110 *udqp = NULL;
370 *gdqp = NULL; 111 *gdqp = NULL;
@@ -415,5 +156,4 @@ extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
415 156
416extern const struct xfs_buf_ops xfs_dquot_buf_ops; 157extern const struct xfs_buf_ops xfs_dquot_buf_ops;
417 158
418#endif /* __KERNEL__ */
419#endif /* __XFS_QUOTA_H__ */ 159#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h
new file mode 100644
index 000000000000..e6b0d6e1f4f2
--- /dev/null
+++ b/fs/xfs/xfs_quota_defs.h
@@ -0,0 +1,157 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_QUOTA_DEFS_H__
19#define __XFS_QUOTA_DEFS_H__
20
21/*
22 * Quota definitions shared between user and kernel source trees.
23 */
24
25/*
26 * Even though users may not have quota limits occupying all 64-bits,
27 * they may need 64-bit accounting. Hence, 64-bit quota-counters,
28 * and quota-limits. This is a waste in the common case, but hey ...
29 */
30typedef __uint64_t xfs_qcnt_t;
31typedef __uint16_t xfs_qwarncnt_t;
32
33/*
34 * flags for q_flags field in the dquot.
35 */
36#define XFS_DQ_USER 0x0001 /* a user quota */
37#define XFS_DQ_PROJ 0x0002 /* project quota */
38#define XFS_DQ_GROUP 0x0004 /* a group quota */
39#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
40#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */
41
42#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
43
44#define XFS_DQ_FLAGS \
45 { XFS_DQ_USER, "USER" }, \
46 { XFS_DQ_PROJ, "PROJ" }, \
47 { XFS_DQ_GROUP, "GROUP" }, \
48 { XFS_DQ_DIRTY, "DIRTY" }, \
49 { XFS_DQ_FREEING, "FREEING" }
50
51/*
52 * We have the possibility of all three quota types being active at once, and
53 * hence free space modification requires modification of all three current
54 * dquots in a single transaction. For this case we need to have a reservation
55 * of at least 3 dquots.
56 *
57 * However, a chmod operation can change both UID and GID in a single
58 * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be
59 * modified. Hence for this case we need to reserve space for at least 4 dquots.
60 *
61 * And in the worst case, there's a rename operation that can be modifying up to
62 * 4 inodes with dquots attached to them. In reality, the only inodes that can
63 * have their dquots modified are the source and destination directory inodes
64 * due to directory name creation and removal. That can require space allocation
65 * and/or freeing on both directory inodes, and hence all three dquots on each
66 * inode can be modified. And if the directories are world writeable, all the
67 * dquots can be unique and so 6 dquots can be modified....
68 *
69 * And, of course, we also need to take into account the dquot log format item
70 * used to describe each dquot.
71 */
72#define XFS_DQUOT_LOGRES(mp) \
73 ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6)
74
75#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
76#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT)
77#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT)
78#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT)
79#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD)
80#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD)
81#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD)
82
83/*
84 * Incore only flags for quotaoff - these bits get cleared when quota(s)
85 * are in the process of getting turned off. These flags are in m_qflags but
86 * never in sb_qflags.
87 */
88#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */
89#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */
90#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */
91#define XFS_ALL_QUOTA_ACTIVE \
92 (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE)
93
94/*
95 * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
96 * quota will be not be switched off as long as that inode lock is held.
97 */
98#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
99 XFS_GQUOTA_ACTIVE | \
100 XFS_PQUOTA_ACTIVE))
101#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
102 XFS_PQUOTA_ACTIVE))
103#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
104#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
105#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
106
107/*
108 * Flags to tell various functions what to do. Not all of these are meaningful
109 * to a single function. None of these XFS_QMOPT_* flags are meant to have
110 * persistent values (ie. their values can and will change between versions)
111 */
112#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
113#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
114#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
115#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
116#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
117#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */
118#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */
119#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
120#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
121
122/*
123 * flags to xfs_trans_mod_dquot to indicate which field needs to be
124 * modified.
125 */
126#define XFS_QMOPT_RES_REGBLKS 0x0010000
127#define XFS_QMOPT_RES_RTBLKS 0x0020000
128#define XFS_QMOPT_BCOUNT 0x0040000
129#define XFS_QMOPT_ICOUNT 0x0080000
130#define XFS_QMOPT_RTBCOUNT 0x0100000
131#define XFS_QMOPT_DELBCOUNT 0x0200000
132#define XFS_QMOPT_DELRTBCOUNT 0x0400000
133#define XFS_QMOPT_RES_INOS 0x0800000
134
135/*
136 * flags for dqalloc.
137 */
138#define XFS_QMOPT_INHERIT 0x1000000
139
140/*
141 * flags to xfs_trans_mod_dquot.
142 */
143#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS
144#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS
145#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS
146#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT
147#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT
148#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT
149#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT
150#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT
151
152
153#define XFS_QMOPT_QUOTALL \
154 (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
155#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
156
157#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 20e30f93b0c7..1326d81596c2 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -16,8 +16,10 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_sb.h" 19#include "xfs_format.h"
20#include "xfs_trans_resv.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
22#include "xfs_sb.h"
21#include "xfs_ag.h" 23#include "xfs_ag.h"
22#include "xfs_mount.h" 24#include "xfs_mount.h"
23#include "xfs_quota.h" 25#include "xfs_quota.h"
@@ -54,6 +56,18 @@ xfs_fs_get_xstate(
54} 56}
55 57
56STATIC int 58STATIC int
59xfs_fs_get_xstatev(
60 struct super_block *sb,
61 struct fs_quota_statv *fqs)
62{
63 struct xfs_mount *mp = XFS_M(sb);
64
65 if (!XFS_IS_QUOTA_RUNNING(mp))
66 return -ENOSYS;
67 return -xfs_qm_scall_getqstatv(mp, fqs);
68}
69
70STATIC int
57xfs_fs_set_xstate( 71xfs_fs_set_xstate(
58 struct super_block *sb, 72 struct super_block *sb,
59 unsigned int uflags, 73 unsigned int uflags,
@@ -133,6 +147,7 @@ xfs_fs_set_dqblk(
133} 147}
134 148
135const struct quotactl_ops xfs_quotactl_operations = { 149const struct quotactl_ops xfs_quotactl_operations = {
150 .get_xstatev = xfs_fs_get_xstatev,
136 .get_xstate = xfs_fs_get_xstate, 151 .get_xstate = xfs_fs_get_xstate,
137 .set_xstate = xfs_fs_set_xstate, 152 .set_xstate = xfs_fs_set_xstate,
138 .get_dqblk = xfs_fs_get_dqblk, 153 .get_dqblk = xfs_fs_get_dqblk,
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
deleted file mode 100644
index 30ff5f401d28..000000000000
--- a/fs/xfs/xfs_rename.c
+++ /dev/null
@@ -1,346 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_trans.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_dir2.h"
26#include "xfs_mount.h"
27#include "xfs_da_btree.h"
28#include "xfs_bmap_btree.h"
29#include "xfs_dinode.h"
30#include "xfs_inode.h"
31#include "xfs_inode_item.h"
32#include "xfs_bmap.h"
33#include "xfs_error.h"
34#include "xfs_quota.h"
35#include "xfs_utils.h"
36#include "xfs_trans_space.h"
37#include "xfs_vnodeops.h"
38#include "xfs_trace.h"
39
40
41/*
42 * Enter all inodes for a rename transaction into a sorted array.
43 */
44STATIC void
45xfs_sort_for_rename(
46 xfs_inode_t *dp1, /* in: old (source) directory inode */
47 xfs_inode_t *dp2, /* in: new (target) directory inode */
48 xfs_inode_t *ip1, /* in: inode of old entry */
49 xfs_inode_t *ip2, /* in: inode of new entry, if it
50 already exists, NULL otherwise. */
51 xfs_inode_t **i_tab,/* out: array of inode returned, sorted */
52 int *num_inodes) /* out: number of inodes in array */
53{
54 xfs_inode_t *temp;
55 int i, j;
56
57 /*
58 * i_tab contains a list of pointers to inodes. We initialize
59 * the table here & we'll sort it. We will then use it to
60 * order the acquisition of the inode locks.
61 *
62 * Note that the table may contain duplicates. e.g., dp1 == dp2.
63 */
64 i_tab[0] = dp1;
65 i_tab[1] = dp2;
66 i_tab[2] = ip1;
67 if (ip2) {
68 *num_inodes = 4;
69 i_tab[3] = ip2;
70 } else {
71 *num_inodes = 3;
72 i_tab[3] = NULL;
73 }
74
75 /*
76 * Sort the elements via bubble sort. (Remember, there are at
77 * most 4 elements to sort, so this is adequate.)
78 */
79 for (i = 0; i < *num_inodes; i++) {
80 for (j = 1; j < *num_inodes; j++) {
81 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
82 temp = i_tab[j];
83 i_tab[j] = i_tab[j-1];
84 i_tab[j-1] = temp;
85 }
86 }
87 }
88}
89
90/*
91 * xfs_rename
92 */
93int
94xfs_rename(
95 xfs_inode_t *src_dp,
96 struct xfs_name *src_name,
97 xfs_inode_t *src_ip,
98 xfs_inode_t *target_dp,
99 struct xfs_name *target_name,
100 xfs_inode_t *target_ip)
101{
102 xfs_trans_t *tp = NULL;
103 xfs_mount_t *mp = src_dp->i_mount;
104 int new_parent; /* moving to a new dir */
105 int src_is_directory; /* src_name is a directory */
106 int error;
107 xfs_bmap_free_t free_list;
108 xfs_fsblock_t first_block;
109 int cancel_flags;
110 int committed;
111 xfs_inode_t *inodes[4];
112 int spaceres;
113 int num_inodes;
114
115 trace_xfs_rename(src_dp, target_dp, src_name, target_name);
116
117 new_parent = (src_dp != target_dp);
118 src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
119
120 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
121 inodes, &num_inodes);
122
123 xfs_bmap_init(&free_list, &first_block);
124 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
125 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
126 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
127 error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
128 XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
129 if (error == ENOSPC) {
130 spaceres = 0;
131 error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0,
132 XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
133 }
134 if (error) {
135 xfs_trans_cancel(tp, 0);
136 goto std_return;
137 }
138
139 /*
140 * Attach the dquots to the inodes
141 */
142 error = xfs_qm_vop_rename_dqattach(inodes);
143 if (error) {
144 xfs_trans_cancel(tp, cancel_flags);
145 goto std_return;
146 }
147
148 /*
149 * Lock all the participating inodes. Depending upon whether
150 * the target_name exists in the target directory, and
151 * whether the target directory is the same as the source
152 * directory, we can lock from 2 to 4 inodes.
153 */
154 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
155
156 /*
157 * Join all the inodes to the transaction. From this point on,
158 * we can rely on either trans_commit or trans_cancel to unlock
159 * them.
160 */
161 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
162 if (new_parent)
163 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
164 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
165 if (target_ip)
166 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
167
168 /*
169 * If we are using project inheritance, we only allow renames
170 * into our tree when the project IDs are the same; else the
171 * tree quota mechanism would be circumvented.
172 */
173 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
174 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
175 error = XFS_ERROR(EXDEV);
176 goto error_return;
177 }
178
179 /*
180 * Set up the target.
181 */
182 if (target_ip == NULL) {
183 /*
184 * If there's no space reservation, check the entry will
185 * fit before actually inserting it.
186 */
187 error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
188 if (error)
189 goto error_return;
190 /*
191 * If target does not exist and the rename crosses
192 * directories, adjust the target directory link count
193 * to account for the ".." reference from the new entry.
194 */
195 error = xfs_dir_createname(tp, target_dp, target_name,
196 src_ip->i_ino, &first_block,
197 &free_list, spaceres);
198 if (error == ENOSPC)
199 goto error_return;
200 if (error)
201 goto abort_return;
202
203 xfs_trans_ichgtime(tp, target_dp,
204 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
205
206 if (new_parent && src_is_directory) {
207 error = xfs_bumplink(tp, target_dp);
208 if (error)
209 goto abort_return;
210 }
211 } else { /* target_ip != NULL */
212 /*
213 * If target exists and it's a directory, check that both
214 * target and source are directories and that target can be
215 * destroyed, or that neither is a directory.
216 */
217 if (S_ISDIR(target_ip->i_d.di_mode)) {
218 /*
219 * Make sure target dir is empty.
220 */
221 if (!(xfs_dir_isempty(target_ip)) ||
222 (target_ip->i_d.di_nlink > 2)) {
223 error = XFS_ERROR(EEXIST);
224 goto error_return;
225 }
226 }
227
228 /*
229 * Link the source inode under the target name.
230 * If the source inode is a directory and we are moving
231 * it across directories, its ".." entry will be
232 * inconsistent until we replace that down below.
233 *
234 * In case there is already an entry with the same
235 * name at the destination directory, remove it first.
236 */
237 error = xfs_dir_replace(tp, target_dp, target_name,
238 src_ip->i_ino,
239 &first_block, &free_list, spaceres);
240 if (error)
241 goto abort_return;
242
243 xfs_trans_ichgtime(tp, target_dp,
244 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
245
246 /*
247 * Decrement the link count on the target since the target
248 * dir no longer points to it.
249 */
250 error = xfs_droplink(tp, target_ip);
251 if (error)
252 goto abort_return;
253
254 if (src_is_directory) {
255 /*
256 * Drop the link from the old "." entry.
257 */
258 error = xfs_droplink(tp, target_ip);
259 if (error)
260 goto abort_return;
261 }
262 } /* target_ip != NULL */
263
264 /*
265 * Remove the source.
266 */
267 if (new_parent && src_is_directory) {
268 /*
269 * Rewrite the ".." entry to point to the new
270 * directory.
271 */
272 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
273 target_dp->i_ino,
274 &first_block, &free_list, spaceres);
275 ASSERT(error != EEXIST);
276 if (error)
277 goto abort_return;
278 }
279
280 /*
281 * We always want to hit the ctime on the source inode.
282 *
283 * This isn't strictly required by the standards since the source
284 * inode isn't really being changed, but old unix file systems did
285 * it and some incremental backup programs won't work without it.
286 */
287 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
288 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
289
290 /*
291 * Adjust the link count on src_dp. This is necessary when
292 * renaming a directory, either within one parent when
293 * the target existed, or across two parent directories.
294 */
295 if (src_is_directory && (new_parent || target_ip != NULL)) {
296
297 /*
298 * Decrement link count on src_directory since the
299 * entry that's moved no longer points to it.
300 */
301 error = xfs_droplink(tp, src_dp);
302 if (error)
303 goto abort_return;
304 }
305
306 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
307 &first_block, &free_list, spaceres);
308 if (error)
309 goto abort_return;
310
311 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
312 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
313 if (new_parent)
314 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
315
316 /*
317 * If this is a synchronous mount, make sure that the
318 * rename transaction goes to disk before returning to
319 * the user.
320 */
321 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
322 xfs_trans_set_sync(tp);
323 }
324
325 error = xfs_bmap_finish(&tp, &free_list, &committed);
326 if (error) {
327 xfs_bmap_cancel(&free_list);
328 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
329 XFS_TRANS_ABORT));
330 goto std_return;
331 }
332
333 /*
334 * trans_commit will unlock src_ip, target_ip & decrement
335 * the vnode references.
336 */
337 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
338
339 abort_return:
340 cancel_flags |= XFS_TRANS_ABORT;
341 error_return:
342 xfs_bmap_cancel(&free_list);
343 xfs_trans_cancel(tp, cancel_flags);
344 std_return:
345 return error;
346}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 98dc670d3ee0..6f9e63c9fc26 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -17,25 +17,24 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_format.h"
21#include "xfs_bit.h" 21#include "xfs_bit.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
25#include "xfs_ag.h" 25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_mount.h" 26#include "xfs_mount.h"
28#include "xfs_bmap_btree.h" 27#include "xfs_bmap_btree.h"
29#include "xfs_dinode.h" 28#include "xfs_dinode.h"
30#include "xfs_inode.h" 29#include "xfs_inode.h"
31#include "xfs_alloc.h" 30#include "xfs_alloc.h"
32#include "xfs_bmap.h" 31#include "xfs_bmap.h"
32#include "xfs_bmap_util.h"
33#include "xfs_rtalloc.h" 33#include "xfs_rtalloc.h"
34#include "xfs_fsops.h" 34#include "xfs_fsops.h"
35#include "xfs_error.h" 35#include "xfs_error.h"
36#include "xfs_inode_item.h" 36#include "xfs_inode_item.h"
37#include "xfs_trans_space.h" 37#include "xfs_trans_space.h"
38#include "xfs_utils.h"
39#include "xfs_trace.h" 38#include "xfs_trace.h"
40#include "xfs_buf.h" 39#include "xfs_buf.h"
41#include "xfs_icache.h" 40#include "xfs_icache.h"
@@ -101,10 +100,9 @@ xfs_growfs_rt_alloc(
101 /* 100 /*
102 * Reserve space & log for one extent added to the file. 101 * Reserve space & log for one extent added to the file.
103 */ 102 */
104 if ((error = xfs_trans_reserve(tp, resblks, 103 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
105 XFS_GROWRTALLOC_LOG_RES(mp), 0, 104 resblks, 0);
106 XFS_TRANS_PERM_LOG_RES, 105 if (error)
107 XFS_DEFAULT_PERM_LOG_COUNT)))
108 goto error_cancel; 106 goto error_cancel;
109 cancelflags = XFS_TRANS_RELEASE_LOG_RES; 107 cancelflags = XFS_TRANS_RELEASE_LOG_RES;
110 /* 108 /*
@@ -147,8 +145,9 @@ xfs_growfs_rt_alloc(
147 /* 145 /*
148 * Reserve log for one block zeroing. 146 * Reserve log for one block zeroing.
149 */ 147 */
150 if ((error = xfs_trans_reserve(tp, 0, 148 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
151 XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0))) 149 0, 0);
150 if (error)
152 goto error_cancel; 151 goto error_cancel;
153 /* 152 /*
154 * Lock the bitmap inode. 153 * Lock the bitmap inode.
@@ -736,8 +735,8 @@ xfs_rtallocate_range(
736{ 735{
737 xfs_rtblock_t end; /* end of the allocated extent */ 736 xfs_rtblock_t end; /* end of the allocated extent */
738 int error; /* error value */ 737 int error; /* error value */
739 xfs_rtblock_t postblock; /* first block allocated > end */ 738 xfs_rtblock_t postblock = 0; /* first block allocated > end */
740 xfs_rtblock_t preblock; /* first block allocated < start */ 739 xfs_rtblock_t preblock = 0; /* first block allocated < start */
741 740
742 end = start + len - 1; 741 end = start + len - 1;
743 /* 742 /*
@@ -1958,8 +1957,9 @@ xfs_growfs_rt(
1958 * Start a transaction, get the log reservation. 1957 * Start a transaction, get the log reservation.
1959 */ 1958 */
1960 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); 1959 tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
1961 if ((error = xfs_trans_reserve(tp, 0, 1960 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree,
1962 XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) 1961 0, 0);
1962 if (error)
1963 goto error_cancel; 1963 goto error_cancel;
1964 /* 1964 /*
1965 * Lock out other callers by grabbing the bitmap inode lock. 1965 * Lock out other callers by grabbing the bitmap inode lock.
@@ -2148,7 +2148,7 @@ xfs_rtfree_extent(
2148 ASSERT(mp->m_rbmip->i_itemp != NULL); 2148 ASSERT(mp->m_rbmip->i_itemp != NULL);
2149 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); 2149 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2150 2150
2151#if defined(__KERNEL__) && defined(DEBUG) 2151#ifdef DEBUG
2152 /* 2152 /*
2153 * Check to see that this whole range is currently allocated. 2153 * Check to see that this whole range is currently allocated.
2154 */ 2154 */
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index f7f3a359c1c5..b2a1a24c0e2f 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -18,58 +18,11 @@
18#ifndef __XFS_RTALLOC_H__ 18#ifndef __XFS_RTALLOC_H__
19#define __XFS_RTALLOC_H__ 19#define __XFS_RTALLOC_H__
20 20
21/* kernel only definitions and functions */
22
21struct xfs_mount; 23struct xfs_mount;
22struct xfs_trans; 24struct xfs_trans;
23 25
24/* Min and max rt extent sizes, specified in bytes */
25#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */
26#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
27#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
28
29/*
30 * Constants for bit manipulations.
31 */
32#define XFS_NBBYLOG 3 /* log2(NBBY) */
33#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */
34#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
35#define XFS_NBWORD (1 << XFS_NBWORDLOG)
36#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
37
38#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
39#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
40#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize)
41#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask)
42
43/*
44 * Summary and bit manipulation macros.
45 */
46#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb)))
47#define XFS_SUMOFFSTOBLOCK(mp,s) \
48 (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog)
49#define XFS_SUMPTR(mp,bp,so) \
50 ((xfs_suminfo_t *)((bp)->b_addr + \
51 (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp))))
52
53#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log)
54#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log)
55#define XFS_BITTOWORD(mp,bi) \
56 ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp)))
57
58#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
59#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
60
61#define XFS_RTLOBIT(w) xfs_lowbit32(w)
62#define XFS_RTHIBIT(w) xfs_highbit32(w)
63
64#if XFS_BIG_BLKNOS
65#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
66#else
67#define XFS_RTBLOCKLOG(b) xfs_highbit32(b)
68#endif
69
70
71#ifdef __KERNEL__
72
73#ifdef CONFIG_XFS_RT 26#ifdef CONFIG_XFS_RT
74/* 27/*
75 * Function prototypes for exported functions. 28 * Function prototypes for exported functions.
@@ -161,6 +114,4 @@ xfs_rtmount_init(
161# define xfs_rtunmount_inodes(m) 114# define xfs_rtunmount_inodes(m)
162#endif /* CONFIG_XFS_RT */ 115#endif /* CONFIG_XFS_RT */
163 116
164#endif /* __KERNEL__ */
165
166#endif /* __XFS_RTALLOC_H__ */ 117#endif /* __XFS_RTALLOC_H__ */
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
new file mode 100644
index 000000000000..a5b59d92eb70
--- /dev/null
+++ b/fs/xfs/xfs_sb.c
@@ -0,0 +1,834 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_format.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_mount.h"
29#include "xfs_da_btree.h"
30#include "xfs_dir2_format.h"
31#include "xfs_dir2.h"
32#include "xfs_bmap_btree.h"
33#include "xfs_alloc_btree.h"
34#include "xfs_ialloc_btree.h"
35#include "xfs_dinode.h"
36#include "xfs_inode.h"
37#include "xfs_btree.h"
38#include "xfs_ialloc.h"
39#include "xfs_alloc.h"
40#include "xfs_rtalloc.h"
41#include "xfs_bmap.h"
42#include "xfs_error.h"
43#include "xfs_quota.h"
44#include "xfs_fsops.h"
45#include "xfs_trace.h"
46#include "xfs_cksum.h"
47#include "xfs_buf_item.h"
48
49/*
50 * Physical superblock buffer manipulations. Shared with libxfs in userspace.
51 */
52
53static const struct {
54 short offset;
55 short type; /* 0 = integer
56 * 1 = binary / string (no translation)
57 */
58} xfs_sb_info[] = {
59 { offsetof(xfs_sb_t, sb_magicnum), 0 },
60 { offsetof(xfs_sb_t, sb_blocksize), 0 },
61 { offsetof(xfs_sb_t, sb_dblocks), 0 },
62 { offsetof(xfs_sb_t, sb_rblocks), 0 },
63 { offsetof(xfs_sb_t, sb_rextents), 0 },
64 { offsetof(xfs_sb_t, sb_uuid), 1 },
65 { offsetof(xfs_sb_t, sb_logstart), 0 },
66 { offsetof(xfs_sb_t, sb_rootino), 0 },
67 { offsetof(xfs_sb_t, sb_rbmino), 0 },
68 { offsetof(xfs_sb_t, sb_rsumino), 0 },
69 { offsetof(xfs_sb_t, sb_rextsize), 0 },
70 { offsetof(xfs_sb_t, sb_agblocks), 0 },
71 { offsetof(xfs_sb_t, sb_agcount), 0 },
72 { offsetof(xfs_sb_t, sb_rbmblocks), 0 },
73 { offsetof(xfs_sb_t, sb_logblocks), 0 },
74 { offsetof(xfs_sb_t, sb_versionnum), 0 },
75 { offsetof(xfs_sb_t, sb_sectsize), 0 },
76 { offsetof(xfs_sb_t, sb_inodesize), 0 },
77 { offsetof(xfs_sb_t, sb_inopblock), 0 },
78 { offsetof(xfs_sb_t, sb_fname[0]), 1 },
79 { offsetof(xfs_sb_t, sb_blocklog), 0 },
80 { offsetof(xfs_sb_t, sb_sectlog), 0 },
81 { offsetof(xfs_sb_t, sb_inodelog), 0 },
82 { offsetof(xfs_sb_t, sb_inopblog), 0 },
83 { offsetof(xfs_sb_t, sb_agblklog), 0 },
84 { offsetof(xfs_sb_t, sb_rextslog), 0 },
85 { offsetof(xfs_sb_t, sb_inprogress), 0 },
86 { offsetof(xfs_sb_t, sb_imax_pct), 0 },
87 { offsetof(xfs_sb_t, sb_icount), 0 },
88 { offsetof(xfs_sb_t, sb_ifree), 0 },
89 { offsetof(xfs_sb_t, sb_fdblocks), 0 },
90 { offsetof(xfs_sb_t, sb_frextents), 0 },
91 { offsetof(xfs_sb_t, sb_uquotino), 0 },
92 { offsetof(xfs_sb_t, sb_gquotino), 0 },
93 { offsetof(xfs_sb_t, sb_qflags), 0 },
94 { offsetof(xfs_sb_t, sb_flags), 0 },
95 { offsetof(xfs_sb_t, sb_shared_vn), 0 },
96 { offsetof(xfs_sb_t, sb_inoalignmt), 0 },
97 { offsetof(xfs_sb_t, sb_unit), 0 },
98 { offsetof(xfs_sb_t, sb_width), 0 },
99 { offsetof(xfs_sb_t, sb_dirblklog), 0 },
100 { offsetof(xfs_sb_t, sb_logsectlog), 0 },
101 { offsetof(xfs_sb_t, sb_logsectsize), 0 },
102 { offsetof(xfs_sb_t, sb_logsunit), 0 },
103 { offsetof(xfs_sb_t, sb_features2), 0 },
104 { offsetof(xfs_sb_t, sb_bad_features2), 0 },
105 { offsetof(xfs_sb_t, sb_features_compat), 0 },
106 { offsetof(xfs_sb_t, sb_features_ro_compat), 0 },
107 { offsetof(xfs_sb_t, sb_features_incompat), 0 },
108 { offsetof(xfs_sb_t, sb_features_log_incompat), 0 },
109 { offsetof(xfs_sb_t, sb_crc), 0 },
110 { offsetof(xfs_sb_t, sb_pad), 0 },
111 { offsetof(xfs_sb_t, sb_pquotino), 0 },
112 { offsetof(xfs_sb_t, sb_lsn), 0 },
113 { sizeof(xfs_sb_t), 0 }
114};
115
116/*
117 * Reference counting access wrappers to the perag structures.
118 * Because we never free per-ag structures, the only thing we
119 * have to protect against changes is the tree structure itself.
120 */
121struct xfs_perag *
122xfs_perag_get(
123 struct xfs_mount *mp,
124 xfs_agnumber_t agno)
125{
126 struct xfs_perag *pag;
127 int ref = 0;
128
129 rcu_read_lock();
130 pag = radix_tree_lookup(&mp->m_perag_tree, agno);
131 if (pag) {
132 ASSERT(atomic_read(&pag->pag_ref) >= 0);
133 ref = atomic_inc_return(&pag->pag_ref);
134 }
135 rcu_read_unlock();
136 trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
137 return pag;
138}
139
140/*
141 * search from @first to find the next perag with the given tag set.
142 */
143struct xfs_perag *
144xfs_perag_get_tag(
145 struct xfs_mount *mp,
146 xfs_agnumber_t first,
147 int tag)
148{
149 struct xfs_perag *pag;
150 int found;
151 int ref;
152
153 rcu_read_lock();
154 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
155 (void **)&pag, first, 1, tag);
156 if (found <= 0) {
157 rcu_read_unlock();
158 return NULL;
159 }
160 ref = atomic_inc_return(&pag->pag_ref);
161 rcu_read_unlock();
162 trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_);
163 return pag;
164}
165
166void
167xfs_perag_put(
168 struct xfs_perag *pag)
169{
170 int ref;
171
172 ASSERT(atomic_read(&pag->pag_ref) > 0);
173 ref = atomic_dec_return(&pag->pag_ref);
174 trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
175}
176
177/*
178 * Check the validity of the SB found.
179 */
180STATIC int
181xfs_mount_validate_sb(
182 xfs_mount_t *mp,
183 xfs_sb_t *sbp,
184 bool check_inprogress,
185 bool check_version)
186{
187
188 /*
189 * If the log device and data device have the
190 * same device number, the log is internal.
191 * Consequently, the sb_logstart should be non-zero. If
192 * we have a zero sb_logstart in this case, we may be trying to mount
193 * a volume filesystem in a non-volume manner.
194 */
195 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
196 xfs_warn(mp, "bad magic number");
197 return XFS_ERROR(EWRONGFS);
198 }
199
200
201 if (!xfs_sb_good_version(sbp)) {
202 xfs_warn(mp, "bad version");
203 return XFS_ERROR(EWRONGFS);
204 }
205
206 /*
207 * Version 5 superblock feature mask validation. Reject combinations the
208 * kernel cannot support up front before checking anything else. For
209 * write validation, we don't need to check feature masks.
210 */
211 if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
212 xfs_alert(mp,
213"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
214"Use of these features in this kernel is at your own risk!");
215
216 if (xfs_sb_has_compat_feature(sbp,
217 XFS_SB_FEAT_COMPAT_UNKNOWN)) {
218 xfs_warn(mp,
219"Superblock has unknown compatible features (0x%x) enabled.\n"
220"Using a more recent kernel is recommended.",
221 (sbp->sb_features_compat &
222 XFS_SB_FEAT_COMPAT_UNKNOWN));
223 }
224
225 if (xfs_sb_has_ro_compat_feature(sbp,
226 XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
227 xfs_alert(mp,
228"Superblock has unknown read-only compatible features (0x%x) enabled.",
229 (sbp->sb_features_ro_compat &
230 XFS_SB_FEAT_RO_COMPAT_UNKNOWN));
231 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
232 xfs_warn(mp,
233"Attempted to mount read-only compatible filesystem read-write.\n"
234"Filesystem can only be safely mounted read only.");
235 return XFS_ERROR(EINVAL);
236 }
237 }
238 if (xfs_sb_has_incompat_feature(sbp,
239 XFS_SB_FEAT_INCOMPAT_UNKNOWN)) {
240 xfs_warn(mp,
241"Superblock has unknown incompatible features (0x%x) enabled.\n"
242"Filesystem can not be safely mounted by this kernel.",
243 (sbp->sb_features_incompat &
244 XFS_SB_FEAT_INCOMPAT_UNKNOWN));
245 return XFS_ERROR(EINVAL);
246 }
247 }
248
249 if (xfs_sb_version_has_pquotino(sbp)) {
250 if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
251 xfs_notice(mp,
252 "Version 5 of Super block has XFS_OQUOTA bits.\n");
253 return XFS_ERROR(EFSCORRUPTED);
254 }
255 } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
256 XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
257 xfs_notice(mp,
258"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.\n");
259 return XFS_ERROR(EFSCORRUPTED);
260 }
261
262 if (unlikely(
263 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
264 xfs_warn(mp,
265 "filesystem is marked as having an external log; "
266 "specify logdev on the mount command line.");
267 return XFS_ERROR(EINVAL);
268 }
269
270 if (unlikely(
271 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
272 xfs_warn(mp,
273 "filesystem is marked as having an internal log; "
274 "do not specify logdev on the mount command line.");
275 return XFS_ERROR(EINVAL);
276 }
277
278 /*
279 * More sanity checking. Most of these were stolen directly from
280 * xfs_repair.
281 */
282 if (unlikely(
283 sbp->sb_agcount <= 0 ||
284 sbp->sb_sectsize < XFS_MIN_SECTORSIZE ||
285 sbp->sb_sectsize > XFS_MAX_SECTORSIZE ||
286 sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG ||
287 sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG ||
288 sbp->sb_sectsize != (1 << sbp->sb_sectlog) ||
289 sbp->sb_blocksize < XFS_MIN_BLOCKSIZE ||
290 sbp->sb_blocksize > XFS_MAX_BLOCKSIZE ||
291 sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
292 sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
293 sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
294 sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
295 sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
296 sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
297 sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
298 sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
299 (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
300 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
301 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
302 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) ||
303 sbp->sb_dblocks == 0 ||
304 sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
305 sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
306 XFS_CORRUPTION_ERROR("SB sanity check failed",
307 XFS_ERRLEVEL_LOW, mp, sbp);
308 return XFS_ERROR(EFSCORRUPTED);
309 }
310
311 /*
312 * Until this is fixed only page-sized or smaller data blocks work.
313 */
314 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
315 xfs_warn(mp,
316 "File system with blocksize %d bytes. "
317 "Only pagesize (%ld) or less will currently work.",
318 sbp->sb_blocksize, PAGE_SIZE);
319 return XFS_ERROR(ENOSYS);
320 }
321
322 /*
323 * Currently only very few inode sizes are supported.
324 */
325 switch (sbp->sb_inodesize) {
326 case 256:
327 case 512:
328 case 1024:
329 case 2048:
330 break;
331 default:
332 xfs_warn(mp, "inode size of %d bytes not supported",
333 sbp->sb_inodesize);
334 return XFS_ERROR(ENOSYS);
335 }
336
337 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
338 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
339 xfs_warn(mp,
340 "file system too large to be mounted on this system.");
341 return XFS_ERROR(EFBIG);
342 }
343
344 if (check_inprogress && sbp->sb_inprogress) {
345 xfs_warn(mp, "Offline file system operation in progress!");
346 return XFS_ERROR(EFSCORRUPTED);
347 }
348
349 /*
350 * Version 1 directory format has never worked on Linux.
351 */
352 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
353 xfs_warn(mp, "file system using version 1 directory format");
354 return XFS_ERROR(ENOSYS);
355 }
356
357 return 0;
358}
359
360void
361xfs_sb_quota_from_disk(struct xfs_sb *sbp)
362{
363 /*
364 * older mkfs doesn't initialize quota inodes to NULLFSINO. This
365 * leads to in-core values having two different values for a quota
366 * inode to be invalid: 0 and NULLFSINO. Change it to a single value
367 * NULLFSINO.
368 *
369 * Note that this change affect only the in-core values. These
370 * values are not written back to disk unless any quota information
371 * is written to the disk. Even in that case, sb_pquotino field is
372 * not written to disk unless the superblock supports pquotino.
373 */
374 if (sbp->sb_uquotino == 0)
375 sbp->sb_uquotino = NULLFSINO;
376 if (sbp->sb_gquotino == 0)
377 sbp->sb_gquotino = NULLFSINO;
378 if (sbp->sb_pquotino == 0)
379 sbp->sb_pquotino = NULLFSINO;
380
381 /*
382 * We need to do these manipilations only if we are working
383 * with an older version of on-disk superblock.
384 */
385 if (xfs_sb_version_has_pquotino(sbp))
386 return;
387
388 if (sbp->sb_qflags & XFS_OQUOTA_ENFD)
389 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
390 XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD;
391 if (sbp->sb_qflags & XFS_OQUOTA_CHKD)
392 sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ?
393 XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD;
394 sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD);
395
396 if (sbp->sb_qflags & XFS_PQUOTA_ACCT) {
397 /*
398 * In older version of superblock, on-disk superblock only
399 * has sb_gquotino, and in-core superblock has both sb_gquotino
400 * and sb_pquotino. But, only one of them is supported at any
401 * point of time. So, if PQUOTA is set in disk superblock,
402 * copy over sb_gquotino to sb_pquotino.
403 */
404 sbp->sb_pquotino = sbp->sb_gquotino;
405 sbp->sb_gquotino = NULLFSINO;
406 }
407}
408
/*
 * Convert an on-disk superblock (big-endian) into the native-endian
 * in-core superblock structure.
 *
 * Multi-byte scalar fields are byte-swapped with be*_to_cpu(); the
 * single-byte fields (the various *log values, flags and shared_vn)
 * and the opaque byte arrays (uuid, fname) are copied verbatim.
 */
void
xfs_sb_from_disk(
	struct xfs_sb	*to,
	xfs_dsb_t	*from)
{
	to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
	to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
	to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
	to->sb_rblocks = be64_to_cpu(from->sb_rblocks);
	to->sb_rextents = be64_to_cpu(from->sb_rextents);
	memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
	to->sb_logstart = be64_to_cpu(from->sb_logstart);
	to->sb_rootino = be64_to_cpu(from->sb_rootino);
	to->sb_rbmino = be64_to_cpu(from->sb_rbmino);
	to->sb_rsumino = be64_to_cpu(from->sb_rsumino);
	to->sb_rextsize = be32_to_cpu(from->sb_rextsize);
	to->sb_agblocks = be32_to_cpu(from->sb_agblocks);
	to->sb_agcount = be32_to_cpu(from->sb_agcount);
	to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks);
	to->sb_logblocks = be32_to_cpu(from->sb_logblocks);
	to->sb_versionnum = be16_to_cpu(from->sb_versionnum);
	to->sb_sectsize = be16_to_cpu(from->sb_sectsize);
	to->sb_inodesize = be16_to_cpu(from->sb_inodesize);
	to->sb_inopblock = be16_to_cpu(from->sb_inopblock);
	memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
	/* single-byte fields - no endian conversion needed */
	to->sb_blocklog = from->sb_blocklog;
	to->sb_sectlog = from->sb_sectlog;
	to->sb_inodelog = from->sb_inodelog;
	to->sb_inopblog = from->sb_inopblog;
	to->sb_agblklog = from->sb_agblklog;
	to->sb_rextslog = from->sb_rextslog;
	to->sb_inprogress = from->sb_inprogress;
	to->sb_imax_pct = from->sb_imax_pct;
	to->sb_icount = be64_to_cpu(from->sb_icount);
	to->sb_ifree = be64_to_cpu(from->sb_ifree);
	to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks);
	to->sb_frextents = be64_to_cpu(from->sb_frextents);
	to->sb_uquotino = be64_to_cpu(from->sb_uquotino);
	to->sb_gquotino = be64_to_cpu(from->sb_gquotino);
	to->sb_qflags = be16_to_cpu(from->sb_qflags);
	to->sb_flags = from->sb_flags;
	to->sb_shared_vn = from->sb_shared_vn;
	to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt);
	to->sb_unit = be32_to_cpu(from->sb_unit);
	to->sb_width = be32_to_cpu(from->sb_width);
	to->sb_dirblklog = from->sb_dirblklog;
	to->sb_logsectlog = from->sb_logsectlog;
	to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize);
	to->sb_logsunit = be32_to_cpu(from->sb_logsunit);
	to->sb_features2 = be32_to_cpu(from->sb_features2);
	to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2);
	to->sb_features_compat = be32_to_cpu(from->sb_features_compat);
	to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat);
	to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
	to->sb_features_log_incompat =
		be32_to_cpu(from->sb_features_log_incompat);
	/* padding carries no information; keep the in-core copy zeroed */
	to->sb_pad = 0;
	to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
	to->sb_lsn = be64_to_cpu(from->sb_lsn);
}
469
/*
 * Translate the in-core quota state into the on-disk representation.
 *
 * For pre-pquotino superblocks this converts the separate group/project
 * enforcement flags back to the legacy combined OQUOTA flags and picks
 * which of gquotino/pquotino is written into the single on-disk
 * sb_gquotino slot.  The field bits handled here are cleared from
 * *fields so the generic copy loop in xfs_sb_to_disk() skips them.
 */
static inline void
xfs_sb_quota_to_disk(
	xfs_dsb_t	*to,
	xfs_sb_t	*from,
	__int64_t	*fields)
{
	__uint16_t qflags = from->sb_qflags;

	/*
	 * We need to do these manipulations only if we are working
	 * with an older version of on-disk superblock.
	 */
	if (xfs_sb_version_has_pquotino(from))
		return;

	if (*fields & XFS_SB_QFLAGS) {
		/*
		 * The in-core version of sb_qflags do not have
		 * XFS_OQUOTA_* flags, whereas the on-disk version
		 * does. So, convert incore XFS_{PG}QUOTA_* flags
		 * to on-disk XFS_OQUOTA_* flags.
		 */
		qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
				XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);

		if (from->sb_qflags &
				(XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
			qflags |= XFS_OQUOTA_ENFD;
		if (from->sb_qflags &
				(XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
			qflags |= XFS_OQUOTA_CHKD;
		to->sb_qflags = cpu_to_be16(qflags);
		*fields &= ~XFS_SB_QFLAGS;
	}

	/*
	 * GQUOTINO and PQUOTINO cannot be used together in versions
	 * of superblock that do not have pquotino. from->sb_qflags
	 * tells us which quota is active and should be copied to
	 * disk.
	 */
	if ((*fields & XFS_SB_GQUOTINO) &&
				(from->sb_qflags & XFS_GQUOTA_ACCT))
		to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
	else if ((*fields & XFS_SB_PQUOTINO) &&
				(from->sb_qflags & XFS_PQUOTA_ACCT))
		to->sb_gquotino = cpu_to_be64(from->sb_pquotino);

	*fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO);
}
520
521/*
522 * Copy in core superblock to ondisk one.
523 *
524 * The fields argument is mask of superblock fields to copy.
525 */
526void
527xfs_sb_to_disk(
528 xfs_dsb_t *to,
529 xfs_sb_t *from,
530 __int64_t fields)
531{
532 xfs_caddr_t to_ptr = (xfs_caddr_t)to;
533 xfs_caddr_t from_ptr = (xfs_caddr_t)from;
534 xfs_sb_field_t f;
535 int first;
536 int size;
537
538 ASSERT(fields);
539 if (!fields)
540 return;
541
542 xfs_sb_quota_to_disk(to, from, &fields);
543 while (fields) {
544 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
545 first = xfs_sb_info[f].offset;
546 size = xfs_sb_info[f + 1].offset - first;
547
548 ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
549
550 if (size == 1 || xfs_sb_info[f].type == 1) {
551 memcpy(to_ptr + first, from_ptr + first, size);
552 } else {
553 switch (size) {
554 case 2:
555 *(__be16 *)(to_ptr + first) =
556 cpu_to_be16(*(__u16 *)(from_ptr + first));
557 break;
558 case 4:
559 *(__be32 *)(to_ptr + first) =
560 cpu_to_be32(*(__u32 *)(from_ptr + first));
561 break;
562 case 8:
563 *(__be64 *)(to_ptr + first) =
564 cpu_to_be64(*(__u64 *)(from_ptr + first));
565 break;
566 default:
567 ASSERT(0);
568 }
569 }
570
571 fields &= ~(1LL << f);
572 }
573}
574
575static int
576xfs_sb_verify(
577 struct xfs_buf *bp,
578 bool check_version)
579{
580 struct xfs_mount *mp = bp->b_target->bt_mount;
581 struct xfs_sb sb;
582
583 xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
584
585 /*
586 * Only check the in progress field for the primary superblock as
587 * mkfs.xfs doesn't clear it from secondary superblocks.
588 */
589 return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
590 check_version);
591}
592
593/*
594 * If the superblock has the CRC feature bit set or the CRC field is non-null,
595 * check that the CRC is valid. We check the CRC field is non-null because a
596 * single bit error could clear the feature bit and unused parts of the
597 * superblock are supposed to be zero. Hence a non-null crc field indicates that
598 * we've potentially lost a feature bit and we should check it anyway.
599 */
600static void
601xfs_sb_read_verify(
602 struct xfs_buf *bp)
603{
604 struct xfs_mount *mp = bp->b_target->bt_mount;
605 struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
606 int error;
607
608 /*
609 * open code the version check to avoid needing to convert the entire
610 * superblock from disk order just to check the version number
611 */
612 if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) &&
613 (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) ==
614 XFS_SB_VERSION_5) ||
615 dsb->sb_crc != 0)) {
616
617 if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize),
618 offsetof(struct xfs_sb, sb_crc))) {
619 error = EFSCORRUPTED;
620 goto out_error;
621 }
622 }
623 error = xfs_sb_verify(bp, true);
624
625out_error:
626 if (error) {
627 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
628 mp, bp->b_addr);
629 xfs_buf_ioerror(bp, error);
630 }
631}
632
633/*
634 * We may be probed for a filesystem match, so we may not want to emit
635 * messages when the superblock buffer is not actually an XFS superblock.
636 * If we find an XFS superblock, then run a normal, noisy mount because we are
637 * really going to mount it and want to know about errors.
638 */
639static void
640xfs_sb_quiet_read_verify(
641 struct xfs_buf *bp)
642{
643 struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
644
645
646 if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
647 /* XFS filesystem, verify noisily! */
648 xfs_sb_read_verify(bp);
649 return;
650 }
651 /* quietly fail */
652 xfs_buf_ioerror(bp, EWRONGFS);
653}
654
655static void
656xfs_sb_write_verify(
657 struct xfs_buf *bp)
658{
659 struct xfs_mount *mp = bp->b_target->bt_mount;
660 struct xfs_buf_log_item *bip = bp->b_fspriv;
661 int error;
662
663 error = xfs_sb_verify(bp, false);
664 if (error) {
665 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
666 mp, bp->b_addr);
667 xfs_buf_ioerror(bp, error);
668 return;
669 }
670
671 if (!xfs_sb_version_hascrc(&mp->m_sb))
672 return;
673
674 if (bip)
675 XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
676
677 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
678 offsetof(struct xfs_sb, sb_crc));
679}
680
/* buffer ops for the primary mount path: verify noisily on read and write */
const struct xfs_buf_ops xfs_sb_buf_ops = {
	.verify_read = xfs_sb_read_verify,
	.verify_write = xfs_sb_write_verify,
};

/* buffer ops for mount-time probing: stay quiet when the magic isn't XFS */
const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
	.verify_read = xfs_sb_quiet_read_verify,
	.verify_write = xfs_sb_write_verify,
};
690
/*
 * xfs_sb_mount_common
 *
 * Mount initialization code establishing various mount
 * fields from the superblock associated with the given
 * mount structure: geometry shift values, block masks,
 * and the min/max record counts for the allocation,
 * inode and bmap btrees.
 */
void
xfs_sb_mount_common(
	struct xfs_mount	*mp,
	struct xfs_sb		*sbp)
{
	mp->m_agfrotor = mp->m_agirotor = 0;
	spin_lock_init(&mp->m_agirotor_lock);
	/*
	 * NOTE(review): every other field here reads from sbp, but this
	 * one reads mp->m_sb.sb_agcount directly - presumably sbp is
	 * always &mp->m_sb here; confirm against callers.
	 */
	mp->m_maxagi = mp->m_sb.sb_agcount;
	mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG;
	mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
	mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
	mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
	mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
	mp->m_blockmask = sbp->sb_blocksize - 1;
	mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
	mp->m_blockwmask = mp->m_blockwsize - 1;

	/* max/min records per block for leaf (1) and node (0) btree blocks */
	mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
	mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
	mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
	mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;

	mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
	mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
	mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
	mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;

	mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
	mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
	mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
	mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;

	mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
	/* inode chunk size: at least XFS_INODES_PER_CHUNK inodes */
	mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
					sbp->sb_inopblock);
	mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
}
735
736/*
737 * xfs_initialize_perag_data
738 *
739 * Read in each per-ag structure so we can count up the number of
740 * allocated inodes, free inodes and used filesystem blocks as this
741 * information is no longer persistent in the superblock. Once we have
742 * this information, write it into the in-core superblock structure.
743 */
744int
745xfs_initialize_perag_data(
746 struct xfs_mount *mp,
747 xfs_agnumber_t agcount)
748{
749 xfs_agnumber_t index;
750 xfs_perag_t *pag;
751 xfs_sb_t *sbp = &mp->m_sb;
752 uint64_t ifree = 0;
753 uint64_t ialloc = 0;
754 uint64_t bfree = 0;
755 uint64_t bfreelst = 0;
756 uint64_t btree = 0;
757 int error;
758
759 for (index = 0; index < agcount; index++) {
760 /*
761 * read the agf, then the agi. This gets us
762 * all the information we need and populates the
763 * per-ag structures for us.
764 */
765 error = xfs_alloc_pagf_init(mp, NULL, index, 0);
766 if (error)
767 return error;
768
769 error = xfs_ialloc_pagi_init(mp, NULL, index);
770 if (error)
771 return error;
772 pag = xfs_perag_get(mp, index);
773 ifree += pag->pagi_freecount;
774 ialloc += pag->pagi_count;
775 bfree += pag->pagf_freeblks;
776 bfreelst += pag->pagf_flcount;
777 btree += pag->pagf_btreeblks;
778 xfs_perag_put(pag);
779 }
780 /*
781 * Overwrite incore superblock counters with just-read data
782 */
783 spin_lock(&mp->m_sb_lock);
784 sbp->sb_ifree = ifree;
785 sbp->sb_icount = ialloc;
786 sbp->sb_fdblocks = bfree + bfreelst + btree;
787 spin_unlock(&mp->m_sb_lock);
788
789 /* Fixup the per-cpu counters as well. */
790 xfs_icsb_reinit_counters(mp);
791
792 return 0;
793}
794
795/*
796 * xfs_mod_sb() can be used to copy arbitrary changes to the
797 * in-core superblock into the superblock buffer to be logged.
798 * It does not provide the higher level of locking that is
799 * needed to protect the in-core superblock from concurrent
800 * access.
801 */
802void
803xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
804{
805 xfs_buf_t *bp;
806 int first;
807 int last;
808 xfs_mount_t *mp;
809 xfs_sb_field_t f;
810
811 ASSERT(fields);
812 if (!fields)
813 return;
814 mp = tp->t_mountp;
815 bp = xfs_trans_getsb(tp, mp, 0);
816 first = sizeof(xfs_sb_t);
817 last = 0;
818
819 /* translate/copy */
820
821 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
822
823 /* find modified range */
824 f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
825 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
826 last = xfs_sb_info[f + 1].offset - 1;
827
828 f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
829 ASSERT((1LL << f) & XFS_SB_MOD_BITS);
830 first = xfs_sb_info[f].offset;
831
832 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
833 xfs_trans_log_buf(tp, bp, first, last);
834}
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 78f9e70b80c7..6835b44f850e 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -26,6 +26,7 @@
26 26
27struct xfs_buf; 27struct xfs_buf;
28struct xfs_mount; 28struct xfs_mount;
29struct xfs_trans;
29 30
30#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */ 31#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */
31#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */ 32#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */
@@ -83,11 +84,13 @@ struct xfs_mount;
83#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ 84#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
84#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ 85#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
85#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ 86#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
87#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */
86 88
87#define XFS_SB_VERSION2_OKREALFBITS \ 89#define XFS_SB_VERSION2_OKREALFBITS \
88 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ 90 (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
89 XFS_SB_VERSION2_ATTR2BIT | \ 91 XFS_SB_VERSION2_ATTR2BIT | \
90 XFS_SB_VERSION2_PROJID32BIT) 92 XFS_SB_VERSION2_PROJID32BIT | \
93 XFS_SB_VERSION2_FTYPE)
91#define XFS_SB_VERSION2_OKSASHFBITS \ 94#define XFS_SB_VERSION2_OKSASHFBITS \
92 (0) 95 (0)
93#define XFS_SB_VERSION2_OKREALBITS \ 96#define XFS_SB_VERSION2_OKREALBITS \
@@ -354,15 +357,8 @@ static inline int xfs_sb_good_version(xfs_sb_t *sbp)
354 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) 357 (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
355 return 0; 358 return 0;
356 359
357#ifdef __KERNEL__
358 if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN) 360 if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
359 return 0; 361 return 0;
360#else
361 if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) &&
362 sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
363 return 0;
364#endif
365
366 return 1; 362 return 1;
367 } 363 }
368 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) 364 if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
@@ -554,12 +550,13 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
554 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); 550 (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
555} 551}
556 552
557static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) 553static inline void xfs_sb_version_addprojid32bit(xfs_sb_t *sbp)
558{ 554{
559 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; 555 sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
556 sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
557 sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
560} 558}
561 559
562
563/* 560/*
564 * Extended v5 superblock feature masks. These are to be used for new v5 561 * Extended v5 superblock feature masks. These are to be used for new v5
565 * superblock features only. 562 * superblock features only.
@@ -598,7 +595,10 @@ xfs_sb_has_ro_compat_feature(
598 return (sbp->sb_features_ro_compat & feature) != 0; 595 return (sbp->sb_features_ro_compat & feature) != 0;
599} 596}
600 597
601#define XFS_SB_FEAT_INCOMPAT_ALL 0 598#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */
599#define XFS_SB_FEAT_INCOMPAT_ALL \
600 (XFS_SB_FEAT_INCOMPAT_FTYPE)
601
602#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL 602#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
603static inline bool 603static inline bool
604xfs_sb_has_incompat_feature( 604xfs_sb_has_incompat_feature(
@@ -618,16 +618,39 @@ xfs_sb_has_incompat_log_feature(
618 return (sbp->sb_features_log_incompat & feature) != 0; 618 return (sbp->sb_features_log_incompat & feature) != 0;
619} 619}
620 620
621static inline bool 621/*
622xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) 622 * V5 superblock specific feature checks
623 */
624static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
623{ 625{
624 return (ino == sbp->sb_uquotino || ino == sbp->sb_gquotino); 626 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
627}
628
629static inline int xfs_sb_version_has_pquotino(xfs_sb_t *sbp)
630{
631 return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
632}
633
634static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
635{
636 return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
637 xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) ||
638 (xfs_sb_version_hasmorebits(sbp) &&
639 (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
625} 640}
626 641
627/* 642/*
628 * end of superblock version macros 643 * end of superblock version macros
629 */ 644 */
630 645
646static inline bool
647xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
648{
649 return (ino == sbp->sb_uquotino ||
650 ino == sbp->sb_gquotino ||
651 ino == sbp->sb_pquotino);
652}
653
631#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ 654#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */
632#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) 655#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR)
633#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) 656#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr))
@@ -660,4 +683,23 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
660#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) 683#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog)
661#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) 684#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask)
662 685
686/*
687 * perag get/put wrappers for ref counting
688 */
689extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t);
690extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
691 int tag);
692extern void xfs_perag_put(struct xfs_perag *pag);
693extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
694
695extern void xfs_sb_calc_crc(struct xfs_buf *);
696extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
697extern void xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *);
698extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
699extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
700extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
701
702extern const struct xfs_buf_ops xfs_sb_buf_ops;
703extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
704
663#endif /* __XFS_SB_H__ */ 705#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 1d68ffcdeaa7..979a77d4b87d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -17,12 +17,12 @@
17 */ 17 */
18 18
19#include "xfs.h" 19#include "xfs.h"
20#include "xfs_format.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_inum.h" 22#include "xfs_inum.h"
22#include "xfs_trans.h" 23#include "xfs_trans.h"
23#include "xfs_sb.h" 24#include "xfs_sb.h"
24#include "xfs_ag.h" 25#include "xfs_ag.h"
25#include "xfs_dir2.h"
26#include "xfs_alloc.h" 26#include "xfs_alloc.h"
27#include "xfs_quota.h" 27#include "xfs_quota.h"
28#include "xfs_mount.h" 28#include "xfs_mount.h"
@@ -40,12 +40,12 @@
40#include "xfs_fsops.h" 40#include "xfs_fsops.h"
41#include "xfs_attr.h" 41#include "xfs_attr.h"
42#include "xfs_buf_item.h" 42#include "xfs_buf_item.h"
43#include "xfs_utils.h"
44#include "xfs_vnodeops.h"
45#include "xfs_log_priv.h" 43#include "xfs_log_priv.h"
46#include "xfs_trans_priv.h" 44#include "xfs_trans_priv.h"
47#include "xfs_filestream.h" 45#include "xfs_filestream.h"
48#include "xfs_da_btree.h" 46#include "xfs_da_btree.h"
47#include "xfs_dir2_format.h"
48#include "xfs_dir2.h"
49#include "xfs_extfree_item.h" 49#include "xfs_extfree_item.h"
50#include "xfs_mru_cache.h" 50#include "xfs_mru_cache.h"
51#include "xfs_inode_item.h" 51#include "xfs_inode_item.h"
@@ -421,12 +421,6 @@ xfs_parseargs(
421 } 421 }
422#endif 422#endif
423 423
424 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
425 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
426 xfs_warn(mp, "cannot mount with both project and group quota");
427 return EINVAL;
428 }
429
430 if ((dsunit && !dswidth) || (!dsunit && dswidth)) { 424 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
431 xfs_warn(mp, "sunit and swidth must be specified together"); 425 xfs_warn(mp, "sunit and swidth must be specified together");
432 return EINVAL; 426 return EINVAL;
@@ -556,14 +550,13 @@ xfs_showargs(
556 else if (mp->m_qflags & XFS_UQUOTA_ACCT) 550 else if (mp->m_qflags & XFS_UQUOTA_ACCT)
557 seq_puts(m, "," MNTOPT_UQUOTANOENF); 551 seq_puts(m, "," MNTOPT_UQUOTANOENF);
558 552
559 /* Either project or group quotas can be active, not both */
560
561 if (mp->m_qflags & XFS_PQUOTA_ACCT) { 553 if (mp->m_qflags & XFS_PQUOTA_ACCT) {
562 if (mp->m_qflags & XFS_PQUOTA_ENFD) 554 if (mp->m_qflags & XFS_PQUOTA_ENFD)
563 seq_puts(m, "," MNTOPT_PRJQUOTA); 555 seq_puts(m, "," MNTOPT_PRJQUOTA);
564 else 556 else
565 seq_puts(m, "," MNTOPT_PQUOTANOENF); 557 seq_puts(m, "," MNTOPT_PQUOTANOENF);
566 } else if (mp->m_qflags & XFS_GQUOTA_ACCT) { 558 }
559 if (mp->m_qflags & XFS_GQUOTA_ACCT) {
567 if (mp->m_qflags & XFS_GQUOTA_ENFD) 560 if (mp->m_qflags & XFS_GQUOTA_ENFD)
568 seq_puts(m, "," MNTOPT_GRPQUOTA); 561 seq_puts(m, "," MNTOPT_GRPQUOTA);
569 else 562 else
@@ -870,17 +863,17 @@ xfs_init_mount_workqueues(
870 goto out_destroy_unwritten; 863 goto out_destroy_unwritten;
871 864
872 mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", 865 mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
873 WQ_NON_REENTRANT, 0, mp->m_fsname); 866 0, 0, mp->m_fsname);
874 if (!mp->m_reclaim_workqueue) 867 if (!mp->m_reclaim_workqueue)
875 goto out_destroy_cil; 868 goto out_destroy_cil;
876 869
877 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", 870 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
878 WQ_NON_REENTRANT, 0, mp->m_fsname); 871 0, 0, mp->m_fsname);
879 if (!mp->m_log_workqueue) 872 if (!mp->m_log_workqueue)
880 goto out_destroy_reclaim; 873 goto out_destroy_reclaim;
881 874
882 mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", 875 mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
883 WQ_NON_REENTRANT, 0, mp->m_fsname); 876 0, 0, mp->m_fsname);
884 if (!mp->m_eofblocks_workqueue) 877 if (!mp->m_eofblocks_workqueue)
885 goto out_destroy_log; 878 goto out_destroy_log;
886 879
@@ -1396,6 +1389,14 @@ xfs_finish_flags(
1396 return XFS_ERROR(EROFS); 1389 return XFS_ERROR(EROFS);
1397 } 1390 }
1398 1391
1392 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
1393 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) &&
1394 !xfs_sb_version_has_pquotino(&mp->m_sb)) {
1395 xfs_warn(mp,
1396 "Super block does not support project and group quota together");
1397 return XFS_ERROR(EINVAL);
1398 }
1399
1399 return 0; 1400 return 0;
1400} 1401}
1401 1402
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index f4895b662fcb..2f2a7c005be2 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -18,200 +18,29 @@
18 */ 18 */
19#include "xfs.h" 19#include "xfs.h"
20#include "xfs_fs.h" 20#include "xfs_fs.h"
21#include "xfs_types.h" 21#include "xfs_format.h"
22#include "xfs_bit.h" 22#include "xfs_bit.h"
23#include "xfs_log.h" 23#include "xfs_log.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_sb.h" 25#include "xfs_sb.h"
26#include "xfs_ag.h" 26#include "xfs_ag.h"
27#include "xfs_dir2.h"
28#include "xfs_mount.h" 27#include "xfs_mount.h"
29#include "xfs_da_btree.h" 28#include "xfs_da_btree.h"
29#include "xfs_dir2_format.h"
30#include "xfs_dir2.h"
30#include "xfs_bmap_btree.h" 31#include "xfs_bmap_btree.h"
31#include "xfs_ialloc_btree.h" 32#include "xfs_ialloc_btree.h"
32#include "xfs_dinode.h" 33#include "xfs_dinode.h"
33#include "xfs_inode.h" 34#include "xfs_inode.h"
34#include "xfs_inode_item.h"
35#include "xfs_itable.h"
36#include "xfs_ialloc.h" 35#include "xfs_ialloc.h"
37#include "xfs_alloc.h" 36#include "xfs_alloc.h"
38#include "xfs_bmap.h" 37#include "xfs_bmap.h"
38#include "xfs_bmap_util.h"
39#include "xfs_error.h" 39#include "xfs_error.h"
40#include "xfs_quota.h" 40#include "xfs_quota.h"
41#include "xfs_utils.h"
42#include "xfs_trans_space.h" 41#include "xfs_trans_space.h"
43#include "xfs_log_priv.h"
44#include "xfs_trace.h" 42#include "xfs_trace.h"
45#include "xfs_symlink.h" 43#include "xfs_symlink.h"
46#include "xfs_cksum.h"
47#include "xfs_buf_item.h"
48
49
50/*
51 * Each contiguous block has a header, so it is not just a simple pathlen
52 * to FSB conversion.
53 */
54int
55xfs_symlink_blocks(
56 struct xfs_mount *mp,
57 int pathlen)
58{
59 int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
60
61 return (pathlen + buflen - 1) / buflen;
62}
63
64static int
65xfs_symlink_hdr_set(
66 struct xfs_mount *mp,
67 xfs_ino_t ino,
68 uint32_t offset,
69 uint32_t size,
70 struct xfs_buf *bp)
71{
72 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
73
74 if (!xfs_sb_version_hascrc(&mp->m_sb))
75 return 0;
76
77 dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
78 dsl->sl_offset = cpu_to_be32(offset);
79 dsl->sl_bytes = cpu_to_be32(size);
80 uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
81 dsl->sl_owner = cpu_to_be64(ino);
82 dsl->sl_blkno = cpu_to_be64(bp->b_bn);
83 bp->b_ops = &xfs_symlink_buf_ops;
84
85 return sizeof(struct xfs_dsymlink_hdr);
86}
87
88/*
89 * Checking of the symlink header is split into two parts. the verifier does
90 * CRC, location and bounds checking, the unpacking function checks the path
91 * parameters and owner.
92 */
93bool
94xfs_symlink_hdr_ok(
95 struct xfs_mount *mp,
96 xfs_ino_t ino,
97 uint32_t offset,
98 uint32_t size,
99 struct xfs_buf *bp)
100{
101 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
102
103 if (offset != be32_to_cpu(dsl->sl_offset))
104 return false;
105 if (size != be32_to_cpu(dsl->sl_bytes))
106 return false;
107 if (ino != be64_to_cpu(dsl->sl_owner))
108 return false;
109
110 /* ok */
111 return true;
112}
113
114static bool
115xfs_symlink_verify(
116 struct xfs_buf *bp)
117{
118 struct xfs_mount *mp = bp->b_target->bt_mount;
119 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
120
121 if (!xfs_sb_version_hascrc(&mp->m_sb))
122 return false;
123 if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
124 return false;
125 if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
126 return false;
127 if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
128 return false;
129 if (be32_to_cpu(dsl->sl_offset) +
130 be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
131 return false;
132 if (dsl->sl_owner == 0)
133 return false;
134
135 return true;
136}
137
138static void
139xfs_symlink_read_verify(
140 struct xfs_buf *bp)
141{
142 struct xfs_mount *mp = bp->b_target->bt_mount;
143
144 /* no verification of non-crc buffers */
145 if (!xfs_sb_version_hascrc(&mp->m_sb))
146 return;
147
148 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
149 offsetof(struct xfs_dsymlink_hdr, sl_crc)) ||
150 !xfs_symlink_verify(bp)) {
151 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
152 xfs_buf_ioerror(bp, EFSCORRUPTED);
153 }
154}
155
156static void
157xfs_symlink_write_verify(
158 struct xfs_buf *bp)
159{
160 struct xfs_mount *mp = bp->b_target->bt_mount;
161 struct xfs_buf_log_item *bip = bp->b_fspriv;
162
163 /* no verification of non-crc buffers */
164 if (!xfs_sb_version_hascrc(&mp->m_sb))
165 return;
166
167 if (!xfs_symlink_verify(bp)) {
168 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
169 xfs_buf_ioerror(bp, EFSCORRUPTED);
170 return;
171 }
172
173 if (bip) {
174 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
175 dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
176 }
177 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
178 offsetof(struct xfs_dsymlink_hdr, sl_crc));
179}
180
181const struct xfs_buf_ops xfs_symlink_buf_ops = {
182 .verify_read = xfs_symlink_read_verify,
183 .verify_write = xfs_symlink_write_verify,
184};
185
186void
187xfs_symlink_local_to_remote(
188 struct xfs_trans *tp,
189 struct xfs_buf *bp,
190 struct xfs_inode *ip,
191 struct xfs_ifork *ifp)
192{
193 struct xfs_mount *mp = ip->i_mount;
194 char *buf;
195
196 if (!xfs_sb_version_hascrc(&mp->m_sb)) {
197 bp->b_ops = NULL;
198 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
199 return;
200 }
201
202 /*
203 * As this symlink fits in an inode literal area, it must also fit in
204 * the smallest buffer the filesystem supports.
205 */
206 ASSERT(BBTOB(bp->b_length) >=
207 ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
208
209 bp->b_ops = &xfs_symlink_buf_ops;
210
211 buf = bp->b_addr;
212 buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
213 memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
214}
215 44
216/* ----- Kernel only functions below ----- */ 45/* ----- Kernel only functions below ----- */
217STATIC int 46STATIC int
@@ -386,8 +215,11 @@ xfs_symlink(
386 /* 215 /*
387 * Make sure that we have allocated dquot(s) on disk. 216 * Make sure that we have allocated dquot(s) on disk.
388 */ 217 */
389 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, 218 error = xfs_qm_vop_dqalloc(dp,
390 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); 219 xfs_kuid_to_uid(current_fsuid()),
220 xfs_kgid_to_gid(current_fsgid()), prid,
221 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
222 &udqp, &gdqp, &pdqp);
391 if (error) 223 if (error)
392 goto std_return; 224 goto std_return;
393 225
@@ -402,12 +234,10 @@ xfs_symlink(
402 else 234 else
403 fs_blocks = xfs_symlink_blocks(mp, pathlen); 235 fs_blocks = xfs_symlink_blocks(mp, pathlen);
404 resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); 236 resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
405 error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, 237 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
406 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
407 if (error == ENOSPC && fs_blocks == 0) { 238 if (error == ENOSPC && fs_blocks == 0) {
408 resblks = 0; 239 resblks = 0;
409 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0, 240 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
410 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
411 } 241 }
412 if (error) { 242 if (error) {
413 cancel_flags = 0; 243 cancel_flags = 0;
@@ -710,8 +540,8 @@ xfs_inactive_symlink_rmt(
710 * Put an itruncate log reservation in the new transaction 540 * Put an itruncate log reservation in the new transaction
711 * for our caller. 541 * for our caller.
712 */ 542 */
713 if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 543 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
714 XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { 544 if (error) {
715 ASSERT(XFS_FORCED_SHUTDOWN(mp)); 545 ASSERT(XFS_FORCED_SHUTDOWN(mp));
716 goto error0; 546 goto error0;
717 } 547 }
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index 374394880c01..99338ba666ac 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -17,50 +17,11 @@
17#ifndef __XFS_SYMLINK_H 17#ifndef __XFS_SYMLINK_H
18#define __XFS_SYMLINK_H 1 18#define __XFS_SYMLINK_H 1
19 19
20struct xfs_mount; 20/* Kernel only symlink defintions */
21struct xfs_trans;
22struct xfs_inode;
23struct xfs_buf;
24struct xfs_ifork;
25struct xfs_name;
26
27#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */
28
29struct xfs_dsymlink_hdr {
30 __be32 sl_magic;
31 __be32 sl_offset;
32 __be32 sl_bytes;
33 __be32 sl_crc;
34 uuid_t sl_uuid;
35 __be64 sl_owner;
36 __be64 sl_blkno;
37 __be64 sl_lsn;
38};
39
40/*
41 * The maximum pathlen is 1024 bytes. Since the minimum file system
42 * blocksize is 512 bytes, we can get a max of 3 extents back from
43 * bmapi when crc headers are taken into account.
44 */
45#define XFS_SYMLINK_MAPS 3
46
47#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \
48 ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
49 sizeof(struct xfs_dsymlink_hdr) : 0))
50
51int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
52
53void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
54 struct xfs_inode *ip, struct xfs_ifork *ifp);
55
56extern const struct xfs_buf_ops xfs_symlink_buf_ops;
57
58#ifdef __KERNEL__
59 21
60int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, 22int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
61 const char *target_path, umode_t mode, struct xfs_inode **ipp); 23 const char *target_path, umode_t mode, struct xfs_inode **ipp);
62int xfs_readlink(struct xfs_inode *ip, char *link); 24int xfs_readlink(struct xfs_inode *ip, char *link);
63int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp); 25int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
64 26
65#endif /* __KERNEL__ */
66#endif /* __XFS_SYMLINK_H */ 27#endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
new file mode 100644
index 000000000000..01c85e3f6470
--- /dev/null
+++ b/fs/xfs/xfs_symlink_remote.c
@@ -0,0 +1,200 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * Copyright (c) 2012-2013 Red Hat, Inc.
4 * All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_format.h"
22#include "xfs_log.h"
23#include "xfs_trans.h"
24#include "xfs_ag.h"
25#include "xfs_sb.h"
26#include "xfs_mount.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_inode.h"
29#include "xfs_error.h"
30#include "xfs_trace.h"
31#include "xfs_symlink.h"
32#include "xfs_cksum.h"
33#include "xfs_buf_item.h"
34
35
36/*
37 * Each contiguous block has a header, so it is not just a simple pathlen
38 * to FSB conversion.
39 */
40int
41xfs_symlink_blocks(
42 struct xfs_mount *mp,
43 int pathlen)
44{
45 int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize);
46
47 return (pathlen + buflen - 1) / buflen;
48}
49
50int
51xfs_symlink_hdr_set(
52 struct xfs_mount *mp,
53 xfs_ino_t ino,
54 uint32_t offset,
55 uint32_t size,
56 struct xfs_buf *bp)
57{
58 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
59
60 if (!xfs_sb_version_hascrc(&mp->m_sb))
61 return 0;
62
63 dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
64 dsl->sl_offset = cpu_to_be32(offset);
65 dsl->sl_bytes = cpu_to_be32(size);
66 uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid);
67 dsl->sl_owner = cpu_to_be64(ino);
68 dsl->sl_blkno = cpu_to_be64(bp->b_bn);
69 bp->b_ops = &xfs_symlink_buf_ops;
70
71 return sizeof(struct xfs_dsymlink_hdr);
72}
73
74/*
75 * Checking of the symlink header is split into two parts. the verifier does
76 * CRC, location and bounds checking, the unpacking function checks the path
77 * parameters and owner.
78 */
79bool
80xfs_symlink_hdr_ok(
81 struct xfs_mount *mp,
82 xfs_ino_t ino,
83 uint32_t offset,
84 uint32_t size,
85 struct xfs_buf *bp)
86{
87 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
88
89 if (offset != be32_to_cpu(dsl->sl_offset))
90 return false;
91 if (size != be32_to_cpu(dsl->sl_bytes))
92 return false;
93 if (ino != be64_to_cpu(dsl->sl_owner))
94 return false;
95
96 /* ok */
97 return true;
98}
99
100static bool
101xfs_symlink_verify(
102 struct xfs_buf *bp)
103{
104 struct xfs_mount *mp = bp->b_target->bt_mount;
105 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
106
107 if (!xfs_sb_version_hascrc(&mp->m_sb))
108 return false;
109 if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
110 return false;
111 if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid))
112 return false;
113 if (bp->b_bn != be64_to_cpu(dsl->sl_blkno))
114 return false;
115 if (be32_to_cpu(dsl->sl_offset) +
116 be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN)
117 return false;
118 if (dsl->sl_owner == 0)
119 return false;
120
121 return true;
122}
123
124static void
125xfs_symlink_read_verify(
126 struct xfs_buf *bp)
127{
128 struct xfs_mount *mp = bp->b_target->bt_mount;
129
130 /* no verification of non-crc buffers */
131 if (!xfs_sb_version_hascrc(&mp->m_sb))
132 return;
133
134 if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
135 offsetof(struct xfs_dsymlink_hdr, sl_crc)) ||
136 !xfs_symlink_verify(bp)) {
137 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
138 xfs_buf_ioerror(bp, EFSCORRUPTED);
139 }
140}
141
142static void
143xfs_symlink_write_verify(
144 struct xfs_buf *bp)
145{
146 struct xfs_mount *mp = bp->b_target->bt_mount;
147 struct xfs_buf_log_item *bip = bp->b_fspriv;
148
149 /* no verification of non-crc buffers */
150 if (!xfs_sb_version_hascrc(&mp->m_sb))
151 return;
152
153 if (!xfs_symlink_verify(bp)) {
154 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
155 xfs_buf_ioerror(bp, EFSCORRUPTED);
156 return;
157 }
158
159 if (bip) {
160 struct xfs_dsymlink_hdr *dsl = bp->b_addr;
161 dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
162 }
163 xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
164 offsetof(struct xfs_dsymlink_hdr, sl_crc));
165}
166
167const struct xfs_buf_ops xfs_symlink_buf_ops = {
168 .verify_read = xfs_symlink_read_verify,
169 .verify_write = xfs_symlink_write_verify,
170};
171
172void
173xfs_symlink_local_to_remote(
174 struct xfs_trans *tp,
175 struct xfs_buf *bp,
176 struct xfs_inode *ip,
177 struct xfs_ifork *ifp)
178{
179 struct xfs_mount *mp = ip->i_mount;
180 char *buf;
181
182 if (!xfs_sb_version_hascrc(&mp->m_sb)) {
183 bp->b_ops = NULL;
184 memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
185 return;
186 }
187
188 /*
189 * As this symlink fits in an inode literal area, it must also fit in
190 * the smallest buffer the filesystem supports.
191 */
192 ASSERT(BBTOB(bp->b_length) >=
193 ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr));
194
195 bp->b_ops = &xfs_symlink_buf_ops;
196
197 buf = bp->b_addr;
198 buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
199 memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
200}
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index b6e3897c1d9f..5d7b3e40705f 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -18,6 +18,7 @@
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_types.h" 20#include "xfs_types.h"
21#include "xfs_format.h"
21#include "xfs_log.h" 22#include "xfs_log.h"
22#include "xfs_trans.h" 23#include "xfs_trans.h"
23#include "xfs_sb.h" 24#include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 35a229981354..5411e01ab452 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -18,7 +18,7 @@
18 */ 18 */
19#include "xfs.h" 19#include "xfs.h"
20#include "xfs_fs.h" 20#include "xfs_fs.h"
21#include "xfs_types.h" 21#include "xfs_format.h"
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_trans.h" 23#include "xfs_trans.h"
24#include "xfs_sb.h" 24#include "xfs_sb.h"
@@ -49,629 +49,6 @@ kmem_zone_t *xfs_trans_zone;
49kmem_zone_t *xfs_log_item_desc_zone; 49kmem_zone_t *xfs_log_item_desc_zone;
50 50
51/* 51/*
52 * A buffer has a format structure overhead in the log in addition
53 * to the data, so we need to take this into account when reserving
54 * space in a transaction for a buffer. Round the space required up
55 * to a multiple of 128 bytes so that we don't change the historical
56 * reservation that has been used for this overhead.
57 */
58STATIC uint
59xfs_buf_log_overhead(void)
60{
61 return round_up(sizeof(struct xlog_op_header) +
62 sizeof(struct xfs_buf_log_format), 128);
63}
64
65/*
66 * Calculate out transaction log reservation per item in bytes.
67 *
68 * The nbufs argument is used to indicate the number of items that
69 * will be changed in a transaction. size is used to tell how many
70 * bytes should be reserved per item.
71 */
72STATIC uint
73xfs_calc_buf_res(
74 uint nbufs,
75 uint size)
76{
77 return nbufs * (size + xfs_buf_log_overhead());
78}
79
80/*
81 * Various log reservation values.
82 *
83 * These are based on the size of the file system block because that is what
84 * most transactions manipulate. Each adds in an additional 128 bytes per
85 * item logged to try to account for the overhead of the transaction mechanism.
86 *
87 * Note: Most of the reservations underestimate the number of allocation
88 * groups into which they could free extents in the xfs_bmap_finish() call.
89 * This is because the number in the worst case is quite high and quite
90 * unusual. In order to fix this we need to change xfs_bmap_finish() to free
91 * extents in only a single AG at a time. This will require changes to the
92 * EFI code as well, however, so that the EFI for the extents not freed is
93 * logged again in each transaction. See SGI PV #261917.
94 *
95 * Reservation functions here avoid a huge stack in xfs_trans_init due to
96 * register overflow from temporaries in the calculations.
97 */
98
99
100/*
101 * In a write transaction we can allocate a maximum of 2
102 * extents. This gives:
103 * the inode getting the new extents: inode size
104 * the inode's bmap btree: max depth * block size
105 * the agfs of the ags from which the extents are allocated: 2 * sector
106 * the superblock free block counter: sector size
107 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
108 * And the bmap_finish transaction can free bmap blocks in a join:
109 * the agfs of the ags containing the blocks: 2 * sector size
110 * the agfls of the ags containing the blocks: 2 * sector size
111 * the super block free block counter: sector size
112 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
113 */
114STATIC uint
115xfs_calc_write_reservation(
116 struct xfs_mount *mp)
117{
118 return XFS_DQUOT_LOGRES(mp) +
119 MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
120 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
121 XFS_FSB_TO_B(mp, 1)) +
122 xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
123 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
124 XFS_FSB_TO_B(mp, 1))),
125 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
126 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
127 XFS_FSB_TO_B(mp, 1))));
128}
129
130/*
131 * In truncating a file we free up to two extents at once. We can modify:
132 * the inode being truncated: inode size
133 * the inode's bmap btree: (max depth + 1) * block size
134 * And the bmap_finish transaction can free the blocks and bmap blocks:
135 * the agf for each of the ags: 4 * sector size
136 * the agfl for each of the ags: 4 * sector size
137 * the super block to reflect the freed blocks: sector size
138 * worst case split in allocation btrees per extent assuming 4 extents:
139 * 4 exts * 2 trees * (2 * max depth - 1) * block size
140 * the inode btree: max depth * blocksize
141 * the allocation btrees: 2 trees * (max depth - 1) * block size
142 */
143STATIC uint
144xfs_calc_itruncate_reservation(
145 struct xfs_mount *mp)
146{
147 return XFS_DQUOT_LOGRES(mp) +
148 MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
149 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
150 XFS_FSB_TO_B(mp, 1))),
151 (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
152 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
153 XFS_FSB_TO_B(mp, 1)) +
154 xfs_calc_buf_res(5, 0) +
155 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
156 XFS_FSB_TO_B(mp, 1)) +
157 xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
158 mp->m_in_maxlevels, 0)));
159}
160
161/*
162 * In renaming a files we can modify:
163 * the four inodes involved: 4 * inode size
164 * the two directory btrees: 2 * (max depth + v2) * dir block size
165 * the two directory bmap btrees: 2 * max depth * block size
166 * And the bmap_finish transaction can free dir and bmap blocks (two sets
167 * of bmap blocks) giving:
168 * the agf for the ags in which the blocks live: 3 * sector size
169 * the agfl for the ags in which the blocks live: 3 * sector size
170 * the superblock for the free block count: sector size
171 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
172 */
173STATIC uint
174xfs_calc_rename_reservation(
175 struct xfs_mount *mp)
176{
177 return XFS_DQUOT_LOGRES(mp) +
178 MAX((xfs_calc_buf_res(4, mp->m_sb.sb_inodesize) +
179 xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
180 XFS_FSB_TO_B(mp, 1))),
181 (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
182 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
183 XFS_FSB_TO_B(mp, 1))));
184}
185
186/*
187 * For creating a link to an inode:
188 * the parent directory inode: inode size
189 * the linked inode: inode size
190 * the directory btree could split: (max depth + v2) * dir block size
191 * the directory bmap btree could join or split: (max depth + v2) * blocksize
192 * And the bmap_finish transaction can free some bmap blocks giving:
193 * the agf for the ag in which the blocks live: sector size
194 * the agfl for the ag in which the blocks live: sector size
195 * the superblock for the free block count: sector size
196 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
197 */
198STATIC uint
199xfs_calc_link_reservation(
200 struct xfs_mount *mp)
201{
202 return XFS_DQUOT_LOGRES(mp) +
203 MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
204 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
205 XFS_FSB_TO_B(mp, 1))),
206 (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
207 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
208 XFS_FSB_TO_B(mp, 1))));
209}
210
211/*
212 * For removing a directory entry we can modify:
213 * the parent directory inode: inode size
214 * the removed inode: inode size
215 * the directory btree could join: (max depth + v2) * dir block size
216 * the directory bmap btree could join or split: (max depth + v2) * blocksize
217 * And the bmap_finish transaction can free the dir and bmap blocks giving:
218 * the agf for the ag in which the blocks live: 2 * sector size
219 * the agfl for the ag in which the blocks live: 2 * sector size
220 * the superblock for the free block count: sector size
221 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
222 */
223STATIC uint
224xfs_calc_remove_reservation(
225 struct xfs_mount *mp)
226{
227 return XFS_DQUOT_LOGRES(mp) +
228 MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
229 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
230 XFS_FSB_TO_B(mp, 1))),
231 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
232 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
233 XFS_FSB_TO_B(mp, 1))));
234}
235
236/*
237 * For create, break it in to the two cases that the transaction
238 * covers. We start with the modify case - allocation done by modification
239 * of the state of existing inodes - and the allocation case.
240 */
241
242/*
243 * For create we can modify:
244 * the parent directory inode: inode size
245 * the new inode: inode size
246 * the inode btree entry: block size
247 * the superblock for the nlink flag: sector size
248 * the directory btree: (max depth + v2) * dir block size
249 * the directory inode's bmap btree: (max depth + v2) * block size
250 */
251STATIC uint
252xfs_calc_create_resv_modify(
253 struct xfs_mount *mp)
254{
255 return xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
256 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
257 (uint)XFS_FSB_TO_B(mp, 1) +
258 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
259}
260
261/*
262 * For create we can allocate some inodes giving:
263 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
264 * the superblock for the nlink flag: sector size
265 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
266 * the inode btree: max depth * blocksize
267 * the allocation btrees: 2 trees * (max depth - 1) * block size
268 */
269STATIC uint
270xfs_calc_create_resv_alloc(
271 struct xfs_mount *mp)
272{
273 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
274 mp->m_sb.sb_sectsize +
275 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
276 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
277 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
278 XFS_FSB_TO_B(mp, 1));
279}
280
281STATIC uint
282__xfs_calc_create_reservation(
283 struct xfs_mount *mp)
284{
285 return XFS_DQUOT_LOGRES(mp) +
286 MAX(xfs_calc_create_resv_alloc(mp),
287 xfs_calc_create_resv_modify(mp));
288}
289
290/*
291 * For icreate we can allocate some inodes giving:
292 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
293 * the superblock for the nlink flag: sector size
294 * the inode btree: max depth * blocksize
295 * the allocation btrees: 2 trees * (max depth - 1) * block size
296 */
297STATIC uint
298xfs_calc_icreate_resv_alloc(
299 struct xfs_mount *mp)
300{
301 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
302 mp->m_sb.sb_sectsize +
303 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
304 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
305 XFS_FSB_TO_B(mp, 1));
306}
307
308STATIC uint
309xfs_calc_icreate_reservation(xfs_mount_t *mp)
310{
311 return XFS_DQUOT_LOGRES(mp) +
312 MAX(xfs_calc_icreate_resv_alloc(mp),
313 xfs_calc_create_resv_modify(mp));
314}
315
316STATIC uint
317xfs_calc_create_reservation(
318 struct xfs_mount *mp)
319{
320 if (xfs_sb_version_hascrc(&mp->m_sb))
321 return xfs_calc_icreate_reservation(mp);
322 return __xfs_calc_create_reservation(mp);
323
324}
325
326/*
327 * Making a new directory is the same as creating a new file.
328 */
329STATIC uint
330xfs_calc_mkdir_reservation(
331 struct xfs_mount *mp)
332{
333 return xfs_calc_create_reservation(mp);
334}
335
336
337/*
338 * Making a new symplink is the same as creating a new file, but
339 * with the added blocks for remote symlink data which can be up to 1kB in
340 * length (MAXPATHLEN).
341 */
342STATIC uint
343xfs_calc_symlink_reservation(
344 struct xfs_mount *mp)
345{
346 return xfs_calc_create_reservation(mp) +
347 xfs_calc_buf_res(1, MAXPATHLEN);
348}
349
350/*
351 * In freeing an inode we can modify:
352 * the inode being freed: inode size
353 * the super block free inode counter: sector size
354 * the agi hash list and counters: sector size
355 * the inode btree entry: block size
356 * the on disk inode before ours in the agi hash list: inode cluster size
357 * the inode btree: max depth * blocksize
358 * the allocation btrees: 2 trees * (max depth - 1) * block size
359 */
360STATIC uint
361xfs_calc_ifree_reservation(
362 struct xfs_mount *mp)
363{
364 return XFS_DQUOT_LOGRES(mp) +
365 xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
366 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
367 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
368 MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
369 XFS_INODE_CLUSTER_SIZE(mp)) +
370 xfs_calc_buf_res(1, 0) +
371 xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
372 mp->m_in_maxlevels, 0) +
373 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
374 XFS_FSB_TO_B(mp, 1));
375}
376
377/*
378 * When only changing the inode we log the inode and possibly the superblock
379 * We also add a bit of slop for the transaction stuff.
380 */
381STATIC uint
382xfs_calc_ichange_reservation(
383 struct xfs_mount *mp)
384{
385 return XFS_DQUOT_LOGRES(mp) +
386 mp->m_sb.sb_inodesize +
387 mp->m_sb.sb_sectsize +
388 512;
389
390}
391
392/*
393 * Growing the data section of the filesystem.
394 * superblock
395 * agi and agf
396 * allocation btrees
397 */
398STATIC uint
399xfs_calc_growdata_reservation(
400 struct xfs_mount *mp)
401{
402 return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
403 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
404 XFS_FSB_TO_B(mp, 1));
405}
406
407/*
408 * Growing the rt section of the filesystem.
409 * In the first set of transactions (ALLOC) we allocate space to the
410 * bitmap or summary files.
411 * superblock: sector size
412 * agf of the ag from which the extent is allocated: sector size
413 * bmap btree for bitmap/summary inode: max depth * blocksize
414 * bitmap/summary inode: inode size
415 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
416 */
417STATIC uint
418xfs_calc_growrtalloc_reservation(
419 struct xfs_mount *mp)
420{
421 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
422 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
423 XFS_FSB_TO_B(mp, 1)) +
424 xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
425 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
426 XFS_FSB_TO_B(mp, 1));
427}
428
429/*
430 * Growing the rt section of the filesystem.
431 * In the second set of transactions (ZERO) we zero the new metadata blocks.
432 * one bitmap/summary block: blocksize
433 */
434STATIC uint
435xfs_calc_growrtzero_reservation(
436 struct xfs_mount *mp)
437{
438 return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
439}
440
441/*
442 * Growing the rt section of the filesystem.
443 * In the third set of transactions (FREE) we update metadata without
444 * allocating any new blocks.
445 * superblock: sector size
446 * bitmap inode: inode size
447 * summary inode: inode size
448 * one bitmap block: blocksize
449 * summary blocks: new summary size
450 */
451STATIC uint
452xfs_calc_growrtfree_reservation(
453 struct xfs_mount *mp)
454{
455 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
456 xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) +
457 xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
458 xfs_calc_buf_res(1, mp->m_rsumsize);
459}
460
461/*
462 * Logging the inode modification timestamp on a synchronous write.
463 * inode
464 */
465STATIC uint
466xfs_calc_swrite_reservation(
467 struct xfs_mount *mp)
468{
469 return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize);
470}
471
472/*
473 * Logging the inode mode bits when writing a setuid/setgid file
474 * inode
475 */
476STATIC uint
477xfs_calc_writeid_reservation(xfs_mount_t *mp)
478{
479 return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize);
480}
481
482/*
483 * Converting the inode from non-attributed to attributed.
484 * the inode being converted: inode size
485 * agf block and superblock (for block allocation)
486 * the new block (directory sized)
487 * bmap blocks for the new directory block
488 * allocation btrees
489 */
490STATIC uint
491xfs_calc_addafork_reservation(
492 struct xfs_mount *mp)
493{
494 return XFS_DQUOT_LOGRES(mp) +
495 xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
496 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
497 xfs_calc_buf_res(1, mp->m_dirblksize) +
498 xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
499 XFS_FSB_TO_B(mp, 1)) +
500 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
501 XFS_FSB_TO_B(mp, 1));
502}
503
504/*
505 * Removing the attribute fork of a file
506 * the inode being truncated: inode size
507 * the inode's bmap btree: max depth * block size
508 * And the bmap_finish transaction can free the blocks and bmap blocks:
509 * the agf for each of the ags: 4 * sector size
510 * the agfl for each of the ags: 4 * sector size
511 * the super block to reflect the freed blocks: sector size
512 * worst case split in allocation btrees per extent assuming 4 extents:
513 * 4 exts * 2 trees * (2 * max depth - 1) * block size
514 */
515STATIC uint
516xfs_calc_attrinval_reservation(
517 struct xfs_mount *mp)
518{
519 return MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
520 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
521 XFS_FSB_TO_B(mp, 1))),
522 (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
523 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
524 XFS_FSB_TO_B(mp, 1))));
525}
526
527/*
528 * Setting an attribute at mount time.
529 * the inode getting the attribute
530 * the superblock for allocations
531 * the agfs extents are allocated from
532 * the attribute btree * max depth
533 * the inode allocation btree
534 * Since attribute transaction space is dependent on the size of the attribute,
535 * the calculation is done partially at mount time and partially at runtime(see
536 * below).
537 */
538STATIC uint
539xfs_calc_attrsetm_reservation(
540 struct xfs_mount *mp)
541{
542 return XFS_DQUOT_LOGRES(mp) +
543 xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
544 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
545 xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
546}
547
548/*
549 * Setting an attribute at runtime, transaction space unit per block.
550 * the superblock for allocations: sector size
551 * the inode bmap btree could join or split: max depth * block size
552 * Since the runtime attribute transaction space is dependent on the total
553 * blocks needed for the 1st bmap, here we calculate out the space unit for
554 * one block so that the caller could figure out the total space according
555 * to the attibute extent length in blocks by: ext * XFS_ATTRSETRT_LOG_RES(mp).
556 */
557STATIC uint
558xfs_calc_attrsetrt_reservation(
559 struct xfs_mount *mp)
560{
561 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
562 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
563 XFS_FSB_TO_B(mp, 1));
564}
565
566/*
567 * Removing an attribute.
568 * the inode: inode size
569 * the attribute btree could join: max depth * block size
570 * the inode bmap btree could join or split: max depth * block size
571 * And the bmap_finish transaction can free the attr blocks freed giving:
572 * the agf for the ag in which the blocks live: 2 * sector size
573 * the agfl for the ag in which the blocks live: 2 * sector size
574 * the superblock for the free block count: sector size
575 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
576 */
577STATIC uint
578xfs_calc_attrrm_reservation(
579 struct xfs_mount *mp)
580{
581 return XFS_DQUOT_LOGRES(mp) +
582 MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) +
583 xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
584 XFS_FSB_TO_B(mp, 1)) +
585 (uint)XFS_FSB_TO_B(mp,
586 XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
587 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
588 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
589 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
590 XFS_FSB_TO_B(mp, 1))));
591}
592
593/*
594 * Clearing a bad agino number in an agi hash bucket.
595 */
596STATIC uint
597xfs_calc_clear_agi_bucket_reservation(
598 struct xfs_mount *mp)
599{
600 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
601}
602
603/*
604 * Clearing the quotaflags in the superblock.
605 * the super block for changing quota flags: sector size
606 */
607STATIC uint
608xfs_calc_qm_sbchange_reservation(
609 struct xfs_mount *mp)
610{
611 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
612}
613
614/*
615 * Adjusting quota limits.
616 * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
617 */
618STATIC uint
619xfs_calc_qm_setqlim_reservation(
620 struct xfs_mount *mp)
621{
622 return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
623}
624
625/*
626 * Allocating quota on disk if needed.
627 * the write transaction log space: XFS_WRITE_LOG_RES(mp)
628 * the unit of quota allocation: one system block size
629 */
630STATIC uint
631xfs_calc_qm_dqalloc_reservation(
632 struct xfs_mount *mp)
633{
634 return XFS_WRITE_LOG_RES(mp) +
635 xfs_calc_buf_res(1,
636 XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
637}
638
639/*
640 * Turning off quotas.
641 * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
642 * the superblock for the quota flags: sector size
643 */
644STATIC uint
645xfs_calc_qm_quotaoff_reservation(
646 struct xfs_mount *mp)
647{
648 return sizeof(struct xfs_qoff_logitem) * 2 +
649 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
650}
651
652/*
653 * End of turning off quotas.
654 * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
655 */
656STATIC uint
657xfs_calc_qm_quotaoff_end_reservation(
658 struct xfs_mount *mp)
659{
660 return sizeof(struct xfs_qoff_logitem) * 2;
661}
662
663/*
664 * Syncing the incore super block changes to disk.
665 * the super block to reflect the changes: sector size
666 */
667STATIC uint
668xfs_calc_sb_reservation(
669 struct xfs_mount *mp)
670{
671 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
672}
673
674/*
675 * Initialize the precomputed transaction reservation values 52 * Initialize the precomputed transaction reservation values
676 * in the mount structure. 53 * in the mount structure.
677 */ 54 */
@@ -679,36 +56,7 @@ void
679xfs_trans_init( 56xfs_trans_init(
680 struct xfs_mount *mp) 57 struct xfs_mount *mp)
681{ 58{
682 struct xfs_trans_reservations *resp = &mp->m_reservations; 59 xfs_trans_resv_calc(mp, M_RES(mp));
683
684 resp->tr_write = xfs_calc_write_reservation(mp);
685 resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
686 resp->tr_rename = xfs_calc_rename_reservation(mp);
687 resp->tr_link = xfs_calc_link_reservation(mp);
688 resp->tr_remove = xfs_calc_remove_reservation(mp);
689 resp->tr_symlink = xfs_calc_symlink_reservation(mp);
690 resp->tr_create = xfs_calc_create_reservation(mp);
691 resp->tr_mkdir = xfs_calc_mkdir_reservation(mp);
692 resp->tr_ifree = xfs_calc_ifree_reservation(mp);
693 resp->tr_ichange = xfs_calc_ichange_reservation(mp);
694 resp->tr_growdata = xfs_calc_growdata_reservation(mp);
695 resp->tr_swrite = xfs_calc_swrite_reservation(mp);
696 resp->tr_writeid = xfs_calc_writeid_reservation(mp);
697 resp->tr_addafork = xfs_calc_addafork_reservation(mp);
698 resp->tr_attrinval = xfs_calc_attrinval_reservation(mp);
699 resp->tr_attrsetm = xfs_calc_attrsetm_reservation(mp);
700 resp->tr_attrsetrt = xfs_calc_attrsetrt_reservation(mp);
701 resp->tr_attrrm = xfs_calc_attrrm_reservation(mp);
702 resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp);
703 resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp);
704 resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp);
705 resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp);
706 resp->tr_qm_sbchange = xfs_calc_qm_sbchange_reservation(mp);
707 resp->tr_qm_setqlim = xfs_calc_qm_setqlim_reservation(mp);
708 resp->tr_qm_dqalloc = xfs_calc_qm_dqalloc_reservation(mp);
709 resp->tr_qm_quotaoff = xfs_calc_qm_quotaoff_reservation(mp);
710 resp->tr_qm_equotaoff = xfs_calc_qm_quotaoff_end_reservation(mp);
711 resp->tr_sb = xfs_calc_sb_reservation(mp);
712} 60}
713 61
714/* 62/*
@@ -744,7 +92,7 @@ _xfs_trans_alloc(
744 atomic_inc(&mp->m_active_trans); 92 atomic_inc(&mp->m_active_trans);
745 93
746 tp = kmem_zone_zalloc(xfs_trans_zone, memflags); 94 tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
747 tp->t_magic = XFS_TRANS_MAGIC; 95 tp->t_magic = XFS_TRANS_HEADER_MAGIC;
748 tp->t_type = type; 96 tp->t_type = type;
749 tp->t_mountp = mp; 97 tp->t_mountp = mp;
750 INIT_LIST_HEAD(&tp->t_items); 98 INIT_LIST_HEAD(&tp->t_items);
@@ -789,7 +137,7 @@ xfs_trans_dup(
789 /* 137 /*
790 * Initialize the new transaction structure. 138 * Initialize the new transaction structure.
791 */ 139 */
792 ntp->t_magic = XFS_TRANS_MAGIC; 140 ntp->t_magic = XFS_TRANS_HEADER_MAGIC;
793 ntp->t_type = tp->t_type; 141 ntp->t_type = tp->t_type;
794 ntp->t_mountp = tp->t_mountp; 142 ntp->t_mountp = tp->t_mountp;
795 INIT_LIST_HEAD(&ntp->t_items); 143 INIT_LIST_HEAD(&ntp->t_items);
@@ -832,12 +180,10 @@ xfs_trans_dup(
832 */ 180 */
833int 181int
834xfs_trans_reserve( 182xfs_trans_reserve(
835 xfs_trans_t *tp, 183 struct xfs_trans *tp,
836 uint blocks, 184 struct xfs_trans_res *resp,
837 uint logspace, 185 uint blocks,
838 uint rtextents, 186 uint rtextents)
839 uint flags,
840 uint logcount)
841{ 187{
842 int error = 0; 188 int error = 0;
843 int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; 189 int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
@@ -863,13 +209,15 @@ xfs_trans_reserve(
863 /* 209 /*
864 * Reserve the log space needed for this transaction. 210 * Reserve the log space needed for this transaction.
865 */ 211 */
866 if (logspace > 0) { 212 if (resp->tr_logres > 0) {
867 bool permanent = false; 213 bool permanent = false;
868 214
869 ASSERT(tp->t_log_res == 0 || tp->t_log_res == logspace); 215 ASSERT(tp->t_log_res == 0 ||
870 ASSERT(tp->t_log_count == 0 || tp->t_log_count == logcount); 216 tp->t_log_res == resp->tr_logres);
217 ASSERT(tp->t_log_count == 0 ||
218 tp->t_log_count == resp->tr_logcount);
871 219
872 if (flags & XFS_TRANS_PERM_LOG_RES) { 220 if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
873 tp->t_flags |= XFS_TRANS_PERM_LOG_RES; 221 tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
874 permanent = true; 222 permanent = true;
875 } else { 223 } else {
@@ -878,20 +226,21 @@ xfs_trans_reserve(
878 } 226 }
879 227
880 if (tp->t_ticket != NULL) { 228 if (tp->t_ticket != NULL) {
881 ASSERT(flags & XFS_TRANS_PERM_LOG_RES); 229 ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
882 error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); 230 error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
883 } else { 231 } else {
884 error = xfs_log_reserve(tp->t_mountp, logspace, 232 error = xfs_log_reserve(tp->t_mountp,
885 logcount, &tp->t_ticket, 233 resp->tr_logres,
886 XFS_TRANSACTION, permanent, 234 resp->tr_logcount,
887 tp->t_type); 235 &tp->t_ticket, XFS_TRANSACTION,
236 permanent, tp->t_type);
888 } 237 }
889 238
890 if (error) 239 if (error)
891 goto undo_blocks; 240 goto undo_blocks;
892 241
893 tp->t_log_res = logspace; 242 tp->t_log_res = resp->tr_logres;
894 tp->t_log_count = logcount; 243 tp->t_log_count = resp->tr_logcount;
895 } 244 }
896 245
897 /* 246 /*
@@ -916,10 +265,10 @@ xfs_trans_reserve(
916 * reservations which have already been performed. 265 * reservations which have already been performed.
917 */ 266 */
918undo_log: 267undo_log:
919 if (logspace > 0) { 268 if (resp->tr_logres > 0) {
920 int log_flags; 269 int log_flags;
921 270
922 if (flags & XFS_TRANS_PERM_LOG_RES) { 271 if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
923 log_flags = XFS_LOG_REL_PERM_RESERV; 272 log_flags = XFS_LOG_REL_PERM_RESERV;
924 } else { 273 } else {
925 log_flags = 0; 274 log_flags = 0;
@@ -1367,10 +716,10 @@ xfs_trans_free_items(
1367 lip->li_desc = NULL; 716 lip->li_desc = NULL;
1368 717
1369 if (commit_lsn != NULLCOMMITLSN) 718 if (commit_lsn != NULLCOMMITLSN)
1370 IOP_COMMITTING(lip, commit_lsn); 719 lip->li_ops->iop_committing(lip, commit_lsn);
1371 if (flags & XFS_TRANS_ABORT) 720 if (flags & XFS_TRANS_ABORT)
1372 lip->li_flags |= XFS_LI_ABORTED; 721 lip->li_flags |= XFS_LI_ABORTED;
1373 IOP_UNLOCK(lip); 722 lip->li_ops->iop_unlock(lip);
1374 723
1375 xfs_trans_free_item_desc(lidp); 724 xfs_trans_free_item_desc(lidp);
1376 } 725 }
@@ -1390,8 +739,11 @@ xfs_log_item_batch_insert(
1390 /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ 739 /* xfs_trans_ail_update_bulk drops ailp->xa_lock */
1391 xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); 740 xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn);
1392 741
1393 for (i = 0; i < nr_items; i++) 742 for (i = 0; i < nr_items; i++) {
1394 IOP_UNPIN(log_items[i], 0); 743 struct xfs_log_item *lip = log_items[i];
744
745 lip->li_ops->iop_unpin(lip, 0);
746 }
1395} 747}
1396 748
1397/* 749/*
@@ -1401,11 +753,11 @@ xfs_log_item_batch_insert(
1401 * 753 *
1402 * If we are called with the aborted flag set, it is because a log write during 754 * If we are called with the aborted flag set, it is because a log write during
1403 * a CIL checkpoint commit has failed. In this case, all the items in the 755 * a CIL checkpoint commit has failed. In this case, all the items in the
1404 * checkpoint have already gone through IOP_COMMITED and IOP_UNLOCK, which 756 * checkpoint have already gone through iop_commited and iop_unlock, which
1405 * means that checkpoint commit abort handling is treated exactly the same 757 * means that checkpoint commit abort handling is treated exactly the same
1406 * as an iclog write error even though we haven't started any IO yet. Hence in 758 * as an iclog write error even though we haven't started any IO yet. Hence in
1407 * this case all we need to do is IOP_COMMITTED processing, followed by an 759 * this case all we need to do is iop_committed processing, followed by an
1408 * IOP_UNPIN(aborted) call. 760 * iop_unpin(aborted) call.
1409 * 761 *
1410 * The AIL cursor is used to optimise the insert process. If commit_lsn is not 762 * The AIL cursor is used to optimise the insert process. If commit_lsn is not
1411 * at the end of the AIL, the insert cursor avoids the need to walk 763 * at the end of the AIL, the insert cursor avoids the need to walk
@@ -1438,7 +790,7 @@ xfs_trans_committed_bulk(
1438 790
1439 if (aborted) 791 if (aborted)
1440 lip->li_flags |= XFS_LI_ABORTED; 792 lip->li_flags |= XFS_LI_ABORTED;
1441 item_lsn = IOP_COMMITTED(lip, commit_lsn); 793 item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
1442 794
1443 /* item_lsn of -1 means the item needs no further processing */ 795 /* item_lsn of -1 means the item needs no further processing */
1444 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) 796 if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
@@ -1450,7 +802,7 @@ xfs_trans_committed_bulk(
1450 */ 802 */
1451 if (aborted) { 803 if (aborted) {
1452 ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount)); 804 ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount));
1453 IOP_UNPIN(lip, 1); 805 lip->li_ops->iop_unpin(lip, 1);
1454 continue; 806 continue;
1455 } 807 }
1456 808
@@ -1468,7 +820,7 @@ xfs_trans_committed_bulk(
1468 xfs_trans_ail_update(ailp, lip, item_lsn); 820 xfs_trans_ail_update(ailp, lip, item_lsn);
1469 else 821 else
1470 spin_unlock(&ailp->xa_lock); 822 spin_unlock(&ailp->xa_lock);
1471 IOP_UNPIN(lip, 0); 823 lip->li_ops->iop_unpin(lip, 0);
1472 continue; 824 continue;
1473 } 825 }
1474 826
@@ -1666,7 +1018,7 @@ xfs_trans_roll(
1666 struct xfs_inode *dp) 1018 struct xfs_inode *dp)
1667{ 1019{
1668 struct xfs_trans *trans; 1020 struct xfs_trans *trans;
1669 unsigned int logres, count; 1021 struct xfs_trans_res tres;
1670 int error; 1022 int error;
1671 1023
1672 /* 1024 /*
@@ -1678,8 +1030,8 @@ xfs_trans_roll(
1678 /* 1030 /*
1679 * Copy the critical parameters from one trans to the next. 1031 * Copy the critical parameters from one trans to the next.
1680 */ 1032 */
1681 logres = trans->t_log_res; 1033 tres.tr_logres = trans->t_log_res;
1682 count = trans->t_log_count; 1034 tres.tr_logcount = trans->t_log_count;
1683 *tpp = xfs_trans_dup(trans); 1035 *tpp = xfs_trans_dup(trans);
1684 1036
1685 /* 1037 /*
@@ -1710,8 +1062,8 @@ xfs_trans_roll(
1710 * across this call, or that anything that is locked be logged in 1062 * across this call, or that anything that is locked be logged in
1711 * the prior and the next transactions. 1063 * the prior and the next transactions.
1712 */ 1064 */
1713 error = xfs_trans_reserve(trans, 0, logres, 0, 1065 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
1714 XFS_TRANS_PERM_LOG_RES, count); 1066 error = xfs_trans_reserve(trans, &tres, 0, 0);
1715 /* 1067 /*
1716 * Ensure that the inode is in the new transaction and locked. 1068 * Ensure that the inode is in the new transaction and locked.
1717 */ 1069 */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 2b4946393e30..09cf40b89e8c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -20,285 +20,9 @@
20 20
21struct xfs_log_item; 21struct xfs_log_item;
22 22
23/* 23#include "xfs_trans_resv.h"
24 * This is the structure written in the log at the head of
25 * every transaction. It identifies the type and id of the
26 * transaction, and contains the number of items logged by
27 * the transaction so we know how many to expect during recovery.
28 *
29 * Do not change the below structure without redoing the code in
30 * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
31 */
32typedef struct xfs_trans_header {
33 uint th_magic; /* magic number */
34 uint th_type; /* transaction type */
35 __int32_t th_tid; /* transaction id (unused) */
36 uint th_num_items; /* num items logged by trans */
37} xfs_trans_header_t;
38
39#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
40
41/*
42 * Log item types.
43 */
44#define XFS_LI_EFI 0x1236
45#define XFS_LI_EFD 0x1237
46#define XFS_LI_IUNLINK 0x1238
47#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */
48#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */
49#define XFS_LI_DQUOT 0x123d
50#define XFS_LI_QUOTAOFF 0x123e
51#define XFS_LI_ICREATE 0x123f
52
53#define XFS_LI_TYPE_DESC \
54 { XFS_LI_EFI, "XFS_LI_EFI" }, \
55 { XFS_LI_EFD, "XFS_LI_EFD" }, \
56 { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \
57 { XFS_LI_INODE, "XFS_LI_INODE" }, \
58 { XFS_LI_BUF, "XFS_LI_BUF" }, \
59 { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \
60 { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }
61
62/*
63 * Transaction types. Used to distinguish types of buffers.
64 */
65#define XFS_TRANS_SETATTR_NOT_SIZE 1
66#define XFS_TRANS_SETATTR_SIZE 2
67#define XFS_TRANS_INACTIVE 3
68#define XFS_TRANS_CREATE 4
69#define XFS_TRANS_CREATE_TRUNC 5
70#define XFS_TRANS_TRUNCATE_FILE 6
71#define XFS_TRANS_REMOVE 7
72#define XFS_TRANS_LINK 8
73#define XFS_TRANS_RENAME 9
74#define XFS_TRANS_MKDIR 10
75#define XFS_TRANS_RMDIR 11
76#define XFS_TRANS_SYMLINK 12
77#define XFS_TRANS_SET_DMATTRS 13
78#define XFS_TRANS_GROWFS 14
79#define XFS_TRANS_STRAT_WRITE 15
80#define XFS_TRANS_DIOSTRAT 16
81/* 17 was XFS_TRANS_WRITE_SYNC */
82#define XFS_TRANS_WRITEID 18
83#define XFS_TRANS_ADDAFORK 19
84#define XFS_TRANS_ATTRINVAL 20
85#define XFS_TRANS_ATRUNCATE 21
86#define XFS_TRANS_ATTR_SET 22
87#define XFS_TRANS_ATTR_RM 23
88#define XFS_TRANS_ATTR_FLAG 24
89#define XFS_TRANS_CLEAR_AGI_BUCKET 25
90#define XFS_TRANS_QM_SBCHANGE 26
91/*
92 * Dummy entries since we use the transaction type to index into the
93 * trans_type[] in xlog_recover_print_trans_head()
94 */
95#define XFS_TRANS_DUMMY1 27
96#define XFS_TRANS_DUMMY2 28
97#define XFS_TRANS_QM_QUOTAOFF 29
98#define XFS_TRANS_QM_DQALLOC 30
99#define XFS_TRANS_QM_SETQLIM 31
100#define XFS_TRANS_QM_DQCLUSTER 32
101#define XFS_TRANS_QM_QINOCREATE 33
102#define XFS_TRANS_QM_QUOTAOFF_END 34
103#define XFS_TRANS_SB_UNIT 35
104#define XFS_TRANS_FSYNC_TS 36
105#define XFS_TRANS_GROWFSRT_ALLOC 37
106#define XFS_TRANS_GROWFSRT_ZERO 38
107#define XFS_TRANS_GROWFSRT_FREE 39
108#define XFS_TRANS_SWAPEXT 40
109#define XFS_TRANS_SB_COUNT 41
110#define XFS_TRANS_CHECKPOINT 42
111#define XFS_TRANS_ICREATE 43
112#define XFS_TRANS_TYPE_MAX 43
113/* new transaction types need to be reflected in xfs_logprint(8) */
114
115#define XFS_TRANS_TYPES \
116 { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \
117 { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
118 { XFS_TRANS_INACTIVE, "INACTIVE" }, \
119 { XFS_TRANS_CREATE, "CREATE" }, \
120 { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
121 { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
122 { XFS_TRANS_REMOVE, "REMOVE" }, \
123 { XFS_TRANS_LINK, "LINK" }, \
124 { XFS_TRANS_RENAME, "RENAME" }, \
125 { XFS_TRANS_MKDIR, "MKDIR" }, \
126 { XFS_TRANS_RMDIR, "RMDIR" }, \
127 { XFS_TRANS_SYMLINK, "SYMLINK" }, \
128 { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \
129 { XFS_TRANS_GROWFS, "GROWFS" }, \
130 { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \
131 { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \
132 { XFS_TRANS_WRITEID, "WRITEID" }, \
133 { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \
134 { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \
135 { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \
136 { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \
137 { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \
138 { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \
139 { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \
140 { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \
141 { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \
142 { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \
143 { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \
144 { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \
145 { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \
146 { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \
147 { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \
148 { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \
149 { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \
150 { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \
151 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
152 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
153 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
154 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
155 { XFS_TRANS_DUMMY1, "DUMMY1" }, \
156 { XFS_TRANS_DUMMY2, "DUMMY2" }, \
157 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
158
159/*
160 * This structure is used to track log items associated with
161 * a transaction. It points to the log item and keeps some
162 * flags to track the state of the log item. It also tracks
163 * the amount of space needed to log the item it describes
164 * once we get to commit processing (see xfs_trans_commit()).
165 */
166struct xfs_log_item_desc {
167 struct xfs_log_item *lid_item;
168 struct list_head lid_trans;
169 unsigned char lid_flags;
170};
171
172#define XFS_LID_DIRTY 0x1
173
174#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */
175/*
176 * Values for t_flags.
177 */
178#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
179#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
180#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
181#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
182#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
183#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
184#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
185 count in superblock */
186
187/*
188 * Values for call flags parameter.
189 */
190#define XFS_TRANS_RELEASE_LOG_RES 0x4
191#define XFS_TRANS_ABORT 0x8
192
193/*
194 * Field values for xfs_trans_mod_sb.
195 */
196#define XFS_TRANS_SB_ICOUNT 0x00000001
197#define XFS_TRANS_SB_IFREE 0x00000002
198#define XFS_TRANS_SB_FDBLOCKS 0x00000004
199#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008
200#define XFS_TRANS_SB_FREXTENTS 0x00000010
201#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020
202#define XFS_TRANS_SB_DBLOCKS 0x00000040
203#define XFS_TRANS_SB_AGCOUNT 0x00000080
204#define XFS_TRANS_SB_IMAXPCT 0x00000100
205#define XFS_TRANS_SB_REXTSIZE 0x00000200
206#define XFS_TRANS_SB_RBMBLOCKS 0x00000400
207#define XFS_TRANS_SB_RBLOCKS 0x00000800
208#define XFS_TRANS_SB_REXTENTS 0x00001000
209#define XFS_TRANS_SB_REXTSLOG 0x00002000
210
211
212/*
213 * Per-extent log reservation for the allocation btree changes
214 * involved in freeing or allocating an extent.
215 * 2 trees * (2 blocks/level * max depth - 1)
216 */
217#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
218 ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
219
220/*
221 * Per-directory log reservation for any directory change.
222 * dir blocks: (1 btree block per level + data block + free block)
223 * bmap btree: (levels + 2) * max depth
224 * v2 directory blocks can be fragmented below the dirblksize down to the fsb
225 * size, so account for that in the DAENTER macros.
226 */
227#define XFS_DIROP_LOG_COUNT(mp) \
228 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
229 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
230
231 24
232#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write) 25/* kernel only transaction subsystem defines */
233#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
234#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename)
235#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link)
236#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove)
237#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
238#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
239#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir)
240#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree)
241#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
242#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata)
243#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc)
244#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero)
245#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree)
246#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
247/*
248 * Logging the inode timestamps on an fsync -- same as SWRITE
249 * as long as SWRITE logs the entire inode core
250 */
251#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
252#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
253#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork)
254#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval)
255#define XFS_ATTRSETM_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetm)
256#define XFS_ATTRSETRT_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetrt)
257#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm)
258#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi)
259#define XFS_QM_SBCHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_qm_sbchange)
260#define XFS_QM_SETQLIM_LOG_RES(mp) ((mp)->m_reservations.tr_qm_setqlim)
261#define XFS_QM_DQALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_qm_dqalloc)
262#define XFS_QM_QUOTAOFF_LOG_RES(mp) ((mp)->m_reservations.tr_qm_quotaoff)
263#define XFS_QM_QUOTAOFF_END_LOG_RES(mp) ((mp)->m_reservations.tr_qm_equotaoff)
264#define XFS_SB_LOG_RES(mp) ((mp)->m_reservations.tr_sb)
265
266/*
267 * Various log count values.
268 */
269#define XFS_DEFAULT_LOG_COUNT 1
270#define XFS_DEFAULT_PERM_LOG_COUNT 2
271#define XFS_ITRUNCATE_LOG_COUNT 2
272#define XFS_INACTIVE_LOG_COUNT 2
273#define XFS_CREATE_LOG_COUNT 2
274#define XFS_MKDIR_LOG_COUNT 3
275#define XFS_SYMLINK_LOG_COUNT 3
276#define XFS_REMOVE_LOG_COUNT 2
277#define XFS_LINK_LOG_COUNT 2
278#define XFS_RENAME_LOG_COUNT 2
279#define XFS_WRITE_LOG_COUNT 2
280#define XFS_ADDAFORK_LOG_COUNT 2
281#define XFS_ATTRINVAL_LOG_COUNT 1
282#define XFS_ATTRSET_LOG_COUNT 3
283#define XFS_ATTRRM_LOG_COUNT 3
284
285/*
286 * Here we centralize the specification of XFS meta-data buffer
287 * reference count values. This determine how hard the buffer
288 * cache tries to hold onto the buffer.
289 */
290#define XFS_AGF_REF 4
291#define XFS_AGI_REF 4
292#define XFS_AGFL_REF 3
293#define XFS_INO_BTREE_REF 3
294#define XFS_ALLOC_BTREE_REF 2
295#define XFS_BMAP_BTREE_REF 2
296#define XFS_DIR_BTREE_REF 2
297#define XFS_INO_REF 2
298#define XFS_ATTR_BTREE_REF 1
299#define XFS_DQUOT_REF 1
300
301#ifdef __KERNEL__
302 26
303struct xfs_buf; 27struct xfs_buf;
304struct xfs_buftarg; 28struct xfs_buftarg;
@@ -310,6 +34,7 @@ struct xfs_log_iovec;
310struct xfs_log_item_desc; 34struct xfs_log_item_desc;
311struct xfs_mount; 35struct xfs_mount;
312struct xfs_trans; 36struct xfs_trans;
37struct xfs_trans_res;
313struct xfs_dquot_acct; 38struct xfs_dquot_acct;
314struct xfs_busy_extent; 39struct xfs_busy_extent;
315 40
@@ -342,7 +67,7 @@ typedef struct xfs_log_item {
342 { XFS_LI_ABORTED, "ABORTED" } 67 { XFS_LI_ABORTED, "ABORTED" }
343 68
344struct xfs_item_ops { 69struct xfs_item_ops {
345 uint (*iop_size)(xfs_log_item_t *); 70 void (*iop_size)(xfs_log_item_t *, int *, int *);
346 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); 71 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
347 void (*iop_pin)(xfs_log_item_t *); 72 void (*iop_pin)(xfs_log_item_t *);
348 void (*iop_unpin)(xfs_log_item_t *, int remove); 73 void (*iop_unpin)(xfs_log_item_t *, int remove);
@@ -352,17 +77,8 @@ struct xfs_item_ops {
352 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); 77 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
353}; 78};
354 79
355#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
356#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
357#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
358#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove)
359#define IOP_PUSH(ip, list) (*(ip)->li_ops->iop_push)(ip, list)
360#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
361#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
362#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
363
364/* 80/*
365 * Return values for the IOP_PUSH() routines. 81 * Return values for the iop_push() routines.
366 */ 82 */
367#define XFS_ITEM_SUCCESS 0 83#define XFS_ITEM_SUCCESS 0
368#define XFS_ITEM_PINNED 1 84#define XFS_ITEM_PINNED 1
@@ -446,7 +162,7 @@ typedef struct xfs_trans {
446xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); 162xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
447xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); 163xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
448xfs_trans_t *xfs_trans_dup(xfs_trans_t *); 164xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
449int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, 165int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *,
450 uint, uint); 166 uint, uint);
451void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); 167void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
452 168
@@ -528,9 +244,4 @@ void xfs_trans_ail_destroy(struct xfs_mount *);
528extern kmem_zone_t *xfs_trans_zone; 244extern kmem_zone_t *xfs_trans_zone;
529extern kmem_zone_t *xfs_log_item_desc_zone; 245extern kmem_zone_t *xfs_log_item_desc_zone;
530 246
531#endif /* __KERNEL__ */
532
533void xfs_trans_init(struct xfs_mount *);
534int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
535
536#endif /* __XFS_TRANS_H__ */ 247#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 0eda7254305f..21c6d7ddbc06 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -61,20 +61,6 @@ xfs_ail_check(
61#endif /* DEBUG */ 61#endif /* DEBUG */
62 62
63/* 63/*
64 * Return a pointer to the first item in the AIL. If the AIL is empty, then
65 * return NULL.
66 */
67xfs_log_item_t *
68xfs_ail_min(
69 struct xfs_ail *ailp)
70{
71 if (list_empty(&ailp->xa_ail))
72 return NULL;
73
74 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
75}
76
77 /*
78 * Return a pointer to the last item in the AIL. If the AIL is empty, then 64 * Return a pointer to the last item in the AIL. If the AIL is empty, then
79 * return NULL. 65 * return NULL.
80 */ 66 */
@@ -393,11 +379,11 @@ xfsaild_push(
393 int lock_result; 379 int lock_result;
394 380
395 /* 381 /*
396 * Note that IOP_PUSH may unlock and reacquire the AIL lock. We 382 * Note that iop_push may unlock and reacquire the AIL lock. We
397 * rely on the AIL cursor implementation to be able to deal with 383 * rely on the AIL cursor implementation to be able to deal with
398 * the dropped lock. 384 * the dropped lock.
399 */ 385 */
400 lock_result = IOP_PUSH(lip, &ailp->xa_buf_list); 386 lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
401 switch (lock_result) { 387 switch (lock_result) {
402 case XFS_ITEM_SUCCESS: 388 case XFS_ITEM_SUCCESS:
403 XFS_STATS_INC(xs_push_ail_success); 389 XFS_STATS_INC(xs_push_ail_success);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index aa5a04b844d6..8c75b8f67270 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -505,7 +505,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
505 505
506/* 506/*
507 * Mark the buffer as not needing to be unlocked when the buf item's 507 * Mark the buffer as not needing to be unlocked when the buf item's
508 * IOP_UNLOCK() routine is called. The buffer must already be locked 508 * iop_unlock() routine is called. The buffer must already be locked
509 * and associated with the given transaction. 509 * and associated with the given transaction.
510 */ 510 */
511/* ARGSUSED */ 511/* ARGSUSED */
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 61407a847b86..54ee3c5dee76 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -17,6 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h" 19#include "xfs_fs.h"
20#include "xfs_format.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_trans.h" 22#include "xfs_trans.h"
22#include "xfs_sb.h" 23#include "xfs_sb.h"
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 53b7c9b0f8f7..c52def0b441c 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -25,6 +25,9 @@ struct xfs_trans;
25struct xfs_ail; 25struct xfs_ail;
26struct xfs_log_vec; 26struct xfs_log_vec;
27 27
28
29void xfs_trans_init(struct xfs_mount *);
30int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
28void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); 31void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
29void xfs_trans_del_item(struct xfs_log_item *); 32void xfs_trans_del_item(struct xfs_log_item *);
30void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, 33void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
@@ -83,6 +86,18 @@ void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
83 struct xfs_ail_cursor *cur, 86 struct xfs_ail_cursor *cur,
84 struct xfs_log_item **log_items, int nr_items, 87 struct xfs_log_item **log_items, int nr_items,
85 xfs_lsn_t lsn) __releases(ailp->xa_lock); 88 xfs_lsn_t lsn) __releases(ailp->xa_lock);
89/*
90 * Return a pointer to the first item in the AIL. If the AIL is empty, then
91 * return NULL.
92 */
93static inline struct xfs_log_item *
94xfs_ail_min(
95 struct xfs_ail *ailp)
96{
97 return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item,
98 li_ail);
99}
100
86static inline void 101static inline void
87xfs_trans_ail_update( 102xfs_trans_ail_update(
88 struct xfs_ail *ailp, 103 struct xfs_ail *ailp,
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
new file mode 100644
index 000000000000..a65a3cc40610
--- /dev/null
+++ b/fs/xfs/xfs_trans_resv.c
@@ -0,0 +1,803 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * Copyright (C) 2010 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19#include "xfs.h"
20#include "xfs_fs.h"
21#include "xfs_format.h"
22#include "xfs_log.h"
23#include "xfs_trans_resv.h"
24#include "xfs_trans.h"
25#include "xfs_sb.h"
26#include "xfs_ag.h"
27#include "xfs_mount.h"
28#include "xfs_error.h"
29#include "xfs_da_btree.h"
30#include "xfs_bmap_btree.h"
31#include "xfs_alloc_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dinode.h"
34#include "xfs_inode.h"
35#include "xfs_btree.h"
36#include "xfs_ialloc.h"
37#include "xfs_alloc.h"
38#include "xfs_extent_busy.h"
39#include "xfs_bmap.h"
40#include "xfs_bmap_util.h"
41#include "xfs_quota.h"
42#include "xfs_qm.h"
43#include "xfs_trans_space.h"
44#include "xfs_trace.h"
45
46/*
47 * A buffer has a format structure overhead in the log in addition
48 * to the data, so we need to take this into account when reserving
49 * space in a transaction for a buffer. Round the space required up
50 * to a multiple of 128 bytes so that we don't change the historical
51 * reservation that has been used for this overhead.
52 */
53STATIC uint
54xfs_buf_log_overhead(void)
55{
56 return round_up(sizeof(struct xlog_op_header) +
57 sizeof(struct xfs_buf_log_format), 128);
58}
59
60/*
61 * Calculate the transaction log reservation per item in bytes.
62 *
63 * The nbufs argument is used to indicate the number of items that
64 * will be changed in a transaction. size is used to tell how many
65 * bytes should be reserved per item.
66 */
67STATIC uint
68xfs_calc_buf_res(
69 uint nbufs,
70 uint size)
71{
72 return nbufs * (size + xfs_buf_log_overhead());
73}
74
75/*
76 * Logging inodes is really tricksy. They are logged in memory format,
77 * which means that what we write into the log doesn't directly translate into
78 * the amount of space they use on disk.
79 *
80 * Case in point - btree format forks in memory format use more space than the
81 * on-disk format. In memory, the buffer contains a normal btree block header so
82 * the btree code can treat it as though it is just another generic buffer.
83 * However, when we write it to the inode fork, we don't write all of this
84 * header as it isn't needed. e.g. the root is only ever in the inode, so
85 * there's no need for sibling pointers which would waste 16 bytes of space.
86 *
87 * Hence when we have an inode with a maximally sized btree format fork, then
88 * amount of information we actually log is greater than the size of the inode
89 * on disk. Hence we need an inode reservation function that calculates all this
90 * correctly. So, we log:
91 *
92 * - log op headers for object
93 * - inode log format object
94 * - the entire inode contents (core + 2 forks)
95 * - two bmap btree block headers
96 */
97STATIC uint
98xfs_calc_inode_res(
99 struct xfs_mount *mp,
100 uint ninodes)
101{
102 return ninodes * (sizeof(struct xlog_op_header) +
103 sizeof(struct xfs_inode_log_format) +
104 mp->m_sb.sb_inodesize +
105 2 * XFS_BMBT_BLOCK_LEN(mp));
106}
107
108/*
109 * Various log reservation values.
110 *
111 * These are based on the size of the file system block because that is what
112 * most transactions manipulate. Each adds in an additional 128 bytes per
113 * item logged to try to account for the overhead of the transaction mechanism.
114 *
115 * Note: Most of the reservations underestimate the number of allocation
116 * groups into which they could free extents in the xfs_bmap_finish() call.
117 * This is because the number in the worst case is quite high and quite
118 * unusual. In order to fix this we need to change xfs_bmap_finish() to free
119 * extents in only a single AG at a time. This will require changes to the
120 * EFI code as well, however, so that the EFI for the extents not freed is
121 * logged again in each transaction. See SGI PV #261917.
122 *
123 * Reservation functions here avoid a huge stack in xfs_trans_init due to
124 * register overflow from temporaries in the calculations.
125 */
126
127
128/*
129 * In a write transaction we can allocate a maximum of 2
130 * extents. This gives:
131 * the inode getting the new extents: inode size
132 * the inode's bmap btree: max depth * block size
133 * the agfs of the ags from which the extents are allocated: 2 * sector
134 * the superblock free block counter: sector size
135 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
136 * And the bmap_finish transaction can free bmap blocks in a join:
137 * the agfs of the ags containing the blocks: 2 * sector size
138 * the agfls of the ags containing the blocks: 2 * sector size
139 * the super block free block counter: sector size
140 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
141 */
142STATIC uint
143xfs_calc_write_reservation(
144 struct xfs_mount *mp)
145{
146 return XFS_DQUOT_LOGRES(mp) +
147 MAX((xfs_calc_inode_res(mp, 1) +
148 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
149 XFS_FSB_TO_B(mp, 1)) +
150 xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
151 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
152 XFS_FSB_TO_B(mp, 1))),
153 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
154 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
155 XFS_FSB_TO_B(mp, 1))));
156}
157
158/*
159 * In truncating a file we free up to two extents at once. We can modify:
160 * the inode being truncated: inode size
161 * the inode's bmap btree: (max depth + 1) * block size
162 * And the bmap_finish transaction can free the blocks and bmap blocks:
163 * the agf for each of the ags: 4 * sector size
164 * the agfl for each of the ags: 4 * sector size
165 * the super block to reflect the freed blocks: sector size
166 * worst case split in allocation btrees per extent assuming 4 extents:
167 * 4 exts * 2 trees * (2 * max depth - 1) * block size
168 * the inode btree: max depth * blocksize
169 * the allocation btrees: 2 trees * (max depth - 1) * block size
170 */
171STATIC uint
172xfs_calc_itruncate_reservation(
173 struct xfs_mount *mp)
174{
175 return XFS_DQUOT_LOGRES(mp) +
176 MAX((xfs_calc_inode_res(mp, 1) +
177 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
178 XFS_FSB_TO_B(mp, 1))),
179 (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
180 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
181 XFS_FSB_TO_B(mp, 1)) +
182 xfs_calc_buf_res(5, 0) +
183 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
184 XFS_FSB_TO_B(mp, 1)) +
185 xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
186 mp->m_in_maxlevels, 0)));
187}
188
189/*
190 * In renaming a file we can modify:
191 * the four inodes involved: 4 * inode size
192 * the two directory btrees: 2 * (max depth + v2) * dir block size
193 * the two directory bmap btrees: 2 * max depth * block size
194 * And the bmap_finish transaction can free dir and bmap blocks (two sets
195 * of bmap blocks) giving:
196 * the agf for the ags in which the blocks live: 3 * sector size
197 * the agfl for the ags in which the blocks live: 3 * sector size
198 * the superblock for the free block count: sector size
199 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
200 */
201STATIC uint
202xfs_calc_rename_reservation(
203 struct xfs_mount *mp)
204{
205 return XFS_DQUOT_LOGRES(mp) +
206 MAX((xfs_calc_inode_res(mp, 4) +
207 xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
208 XFS_FSB_TO_B(mp, 1))),
209 (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
210 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3),
211 XFS_FSB_TO_B(mp, 1))));
212}
213
214/*
215 * For creating a link to an inode:
216 * the parent directory inode: inode size
217 * the linked inode: inode size
218 * the directory btree could split: (max depth + v2) * dir block size
219 * the directory bmap btree could join or split: (max depth + v2) * blocksize
220 * And the bmap_finish transaction can free some bmap blocks giving:
221 * the agf for the ag in which the blocks live: sector size
222 * the agfl for the ag in which the blocks live: sector size
223 * the superblock for the free block count: sector size
224 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
225 */
226STATIC uint
227xfs_calc_link_reservation(
228 struct xfs_mount *mp)
229{
230 return XFS_DQUOT_LOGRES(mp) +
231 MAX((xfs_calc_inode_res(mp, 2) +
232 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
233 XFS_FSB_TO_B(mp, 1))),
234 (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
235 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
236 XFS_FSB_TO_B(mp, 1))));
237}
238
239/*
240 * For removing a directory entry we can modify:
241 * the parent directory inode: inode size
242 * the removed inode: inode size
243 * the directory btree could join: (max depth + v2) * dir block size
244 * the directory bmap btree could join or split: (max depth + v2) * blocksize
245 * And the bmap_finish transaction can free the dir and bmap blocks giving:
246 * the agf for the ag in which the blocks live: 2 * sector size
247 * the agfl for the ag in which the blocks live: 2 * sector size
248 * the superblock for the free block count: sector size
249 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
250 */
251STATIC uint
252xfs_calc_remove_reservation(
253 struct xfs_mount *mp)
254{
255 return XFS_DQUOT_LOGRES(mp) +
256 MAX((xfs_calc_inode_res(mp, 2) +
257 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
258 XFS_FSB_TO_B(mp, 1))),
259 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
260 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
261 XFS_FSB_TO_B(mp, 1))));
262}
263
264/*
265 * For create, break it in to the two cases that the transaction
266 * covers. We start with the modify case - allocation done by modification
267 * of the state of existing inodes - and the allocation case.
268 */
269
270/*
271 * For create we can modify:
272 * the parent directory inode: inode size
273 * the new inode: inode size
274 * the inode btree entry: block size
275 * the superblock for the nlink flag: sector size
276 * the directory btree: (max depth + v2) * dir block size
277 * the directory inode's bmap btree: (max depth + v2) * block size
278 */
279STATIC uint
280xfs_calc_create_resv_modify(
281 struct xfs_mount *mp)
282{
283 return xfs_calc_inode_res(mp, 2) +
284 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
285 (uint)XFS_FSB_TO_B(mp, 1) +
286 xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
287}
288
289/*
290 * For create we can allocate some inodes giving:
291 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
292 * the superblock for the nlink flag: sector size
293 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
294 * the inode btree: max depth * blocksize
295 * the allocation btrees: 2 trees * (max depth - 1) * block size
296 */
297STATIC uint
298xfs_calc_create_resv_alloc(
299 struct xfs_mount *mp)
300{
301 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
302 mp->m_sb.sb_sectsize +
303 xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
304 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
305 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
306 XFS_FSB_TO_B(mp, 1));
307}
308
309STATIC uint
310__xfs_calc_create_reservation(
311 struct xfs_mount *mp)
312{
313 return XFS_DQUOT_LOGRES(mp) +
314 MAX(xfs_calc_create_resv_alloc(mp),
315 xfs_calc_create_resv_modify(mp));
316}
317
318/*
319 * For icreate we can allocate some inodes giving:
320 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
321 * the superblock for the nlink flag: sector size
322 * the inode btree: max depth * blocksize
323 * the allocation btrees: 2 trees * (max depth - 1) * block size
324 */
325STATIC uint
326xfs_calc_icreate_resv_alloc(
327 struct xfs_mount *mp)
328{
329 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
330 mp->m_sb.sb_sectsize +
331 xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
332 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
333 XFS_FSB_TO_B(mp, 1));
334}
335
336STATIC uint
337xfs_calc_icreate_reservation(xfs_mount_t *mp)
338{
339 return XFS_DQUOT_LOGRES(mp) +
340 MAX(xfs_calc_icreate_resv_alloc(mp),
341 xfs_calc_create_resv_modify(mp));
342}
343
344STATIC uint
345xfs_calc_create_reservation(
346 struct xfs_mount *mp)
347{
348 if (xfs_sb_version_hascrc(&mp->m_sb))
349 return xfs_calc_icreate_reservation(mp);
350 return __xfs_calc_create_reservation(mp);
351
352}
353
354/*
355 * Making a new directory is the same as creating a new file.
356 */
357STATIC uint
358xfs_calc_mkdir_reservation(
359 struct xfs_mount *mp)
360{
361 return xfs_calc_create_reservation(mp);
362}
363
364
365/*
366 * Making a new symlink is the same as creating a new file, but
367 * with the added blocks for remote symlink data which can be up to 1kB in
368 * length (MAXPATHLEN).
369 */
370STATIC uint
371xfs_calc_symlink_reservation(
372 struct xfs_mount *mp)
373{
374 return xfs_calc_create_reservation(mp) +
375 xfs_calc_buf_res(1, MAXPATHLEN);
376}
377
378/*
379 * In freeing an inode we can modify:
380 * the inode being freed: inode size
381 * the super block free inode counter: sector size
382 * the agi hash list and counters: sector size
383 * the inode btree entry: block size
384 * the on disk inode before ours in the agi hash list: inode cluster size
385 * the inode btree: max depth * blocksize
386 * the allocation btrees: 2 trees * (max depth - 1) * block size
387 */
388STATIC uint
389xfs_calc_ifree_reservation(
390 struct xfs_mount *mp)
391{
392 return XFS_DQUOT_LOGRES(mp) +
393 xfs_calc_inode_res(mp, 1) +
394 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
395 xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
396 MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
397 XFS_INODE_CLUSTER_SIZE(mp)) +
398 xfs_calc_buf_res(1, 0) +
399 xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
400 mp->m_in_maxlevels, 0) +
401 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
402 XFS_FSB_TO_B(mp, 1));
403}
404
405/*
406 * When only changing the inode we log the inode and possibly the superblock
407 * We also add a bit of slop for the transaction stuff.
408 */
409STATIC uint
410xfs_calc_ichange_reservation(
411 struct xfs_mount *mp)
412{
413 return XFS_DQUOT_LOGRES(mp) +
414 xfs_calc_inode_res(mp, 1) +
415 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
416
417}
418
419/*
420 * Growing the data section of the filesystem.
421 * superblock
422 * agi and agf
423 * allocation btrees
424 */
425STATIC uint
426xfs_calc_growdata_reservation(
427 struct xfs_mount *mp)
428{
429 return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
430 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
431 XFS_FSB_TO_B(mp, 1));
432}
433
434/*
435 * Growing the rt section of the filesystem.
436 * In the first set of transactions (ALLOC) we allocate space to the
437 * bitmap or summary files.
438 * superblock: sector size
439 * agf of the ag from which the extent is allocated: sector size
440 * bmap btree for bitmap/summary inode: max depth * blocksize
441 * bitmap/summary inode: inode size
442 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
443 */
444STATIC uint
445xfs_calc_growrtalloc_reservation(
446 struct xfs_mount *mp)
447{
448 return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
449 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
450 XFS_FSB_TO_B(mp, 1)) +
451 xfs_calc_inode_res(mp, 1) +
452 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
453 XFS_FSB_TO_B(mp, 1));
454}
455
456/*
457 * Growing the rt section of the filesystem.
458 * In the second set of transactions (ZERO) we zero the new metadata blocks.
459 * one bitmap/summary block: blocksize
460 */
461STATIC uint
462xfs_calc_growrtzero_reservation(
463 struct xfs_mount *mp)
464{
465 return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize);
466}
467
468/*
469 * Growing the rt section of the filesystem.
470 * In the third set of transactions (FREE) we update metadata without
471 * allocating any new blocks.
472 * superblock: sector size
473 * bitmap inode: inode size
474 * summary inode: inode size
475 * one bitmap block: blocksize
476 * summary blocks: new summary size
477 */
478STATIC uint
479xfs_calc_growrtfree_reservation(
480 struct xfs_mount *mp)
481{
482 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
483 xfs_calc_inode_res(mp, 2) +
484 xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) +
485 xfs_calc_buf_res(1, mp->m_rsumsize);
486}
487
488/*
489 * Logging the inode modification timestamp on a synchronous write.
490 * inode
491 */
492STATIC uint
493xfs_calc_swrite_reservation(
494 struct xfs_mount *mp)
495{
496 return xfs_calc_inode_res(mp, 1);
497}
498
499/*
500 * Logging the inode mode bits when writing a setuid/setgid file
501 * inode
502 */
503STATIC uint
504xfs_calc_writeid_reservation(
505 struct xfs_mount *mp)
506{
507 return xfs_calc_inode_res(mp, 1);
508}
509
510/*
511 * Converting the inode from non-attributed to attributed.
512 * the inode being converted: inode size
513 * agf block and superblock (for block allocation)
514 * the new block (directory sized)
515 * bmap blocks for the new directory block
516 * allocation btrees
517 */
518STATIC uint
519xfs_calc_addafork_reservation(
520 struct xfs_mount *mp)
521{
522 return XFS_DQUOT_LOGRES(mp) +
523 xfs_calc_inode_res(mp, 1) +
524 xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
525 xfs_calc_buf_res(1, mp->m_dirblksize) +
526 xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
527 XFS_FSB_TO_B(mp, 1)) +
528 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
529 XFS_FSB_TO_B(mp, 1));
530}
531
532/*
533 * Removing the attribute fork of a file
534 * the inode being truncated: inode size
535 * the inode's bmap btree: max depth * block size
536 * And the bmap_finish transaction can free the blocks and bmap blocks:
537 * the agf for each of the ags: 4 * sector size
538 * the agfl for each of the ags: 4 * sector size
539 * the super block to reflect the freed blocks: sector size
540 * worst case split in allocation btrees per extent assuming 4 extents:
541 * 4 exts * 2 trees * (2 * max depth - 1) * block size
542 */
543STATIC uint
544xfs_calc_attrinval_reservation(
545 struct xfs_mount *mp)
546{
547 return MAX((xfs_calc_inode_res(mp, 1) +
548 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
549 XFS_FSB_TO_B(mp, 1))),
550 (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
551 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4),
552 XFS_FSB_TO_B(mp, 1))));
553}
554
555/*
556 * Setting an attribute at mount time.
557 * the inode getting the attribute
558 * the superblock for allocations
559 * the agfs extents are allocated from
560 * the attribute btree * max depth
561 * the inode allocation btree
562 * Since attribute transaction space is dependent on the size of the attribute,
563 * the calculation is done partially at mount time and partially at runtime(see
564 * below).
565 */
566STATIC uint
567xfs_calc_attrsetm_reservation(
568 struct xfs_mount *mp)
569{
570 return XFS_DQUOT_LOGRES(mp) +
571 xfs_calc_inode_res(mp, 1) +
572 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
573 xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1));
574}
575
576/*
577 * Setting an attribute at runtime, transaction space unit per block.
578 * the superblock for allocations: sector size
579 * the inode bmap btree could join or split: max depth * block size
580 * Since the runtime attribute transaction space is dependent on the total
581 * blocks needed for the 1st bmap, here we calculate out the space unit for
582 * one block so that the caller could figure out the total space according
583 * to the attribute extent length in blocks by:
584 * ext * M_RES(mp)->tr_attrsetrt.tr_logres
585 */
586STATIC uint
587xfs_calc_attrsetrt_reservation(
588 struct xfs_mount *mp)
589{
590 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
591 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
592 XFS_FSB_TO_B(mp, 1));
593}
594
595/*
596 * Removing an attribute.
597 * the inode: inode size
598 * the attribute btree could join: max depth * block size
599 * the inode bmap btree could join or split: max depth * block size
600 * And the bmap_finish transaction can free the attr blocks freed giving:
601 * the agf for the ag in which the blocks live: 2 * sector size
602 * the agfl for the ag in which the blocks live: 2 * sector size
603 * the superblock for the free block count: sector size
604 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
605 */
606STATIC uint
607xfs_calc_attrrm_reservation(
608 struct xfs_mount *mp)
609{
610 return XFS_DQUOT_LOGRES(mp) +
611 MAX((xfs_calc_inode_res(mp, 1) +
612 xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH,
613 XFS_FSB_TO_B(mp, 1)) +
614 (uint)XFS_FSB_TO_B(mp,
615 XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
616 xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
617 (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
618 xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
619 XFS_FSB_TO_B(mp, 1))));
620}
621
622/*
623 * Clearing a bad agino number in an agi hash bucket.
624 */
625STATIC uint
626xfs_calc_clear_agi_bucket_reservation(
627 struct xfs_mount *mp)
628{
629 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
630}
631
632/*
633 * Clearing the quotaflags in the superblock.
634 * the super block for changing quota flags: sector size
635 */
636STATIC uint
637xfs_calc_qm_sbchange_reservation(
638 struct xfs_mount *mp)
639{
640 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
641}
642
643/*
644 * Adjusting quota limits.
645 * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
646 */
647STATIC uint
648xfs_calc_qm_setqlim_reservation(
649 struct xfs_mount *mp)
650{
651 return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot));
652}
653
654/*
655 * Allocating quota on disk if needed.
656 * the write transaction log space: M_RES(mp)->tr_write.tr_logres
657 * the unit of quota allocation: one system block size
658 */
659STATIC uint
660xfs_calc_qm_dqalloc_reservation(
661 struct xfs_mount *mp)
662{
663 ASSERT(M_RES(mp)->tr_write.tr_logres);
664 return M_RES(mp)->tr_write.tr_logres +
665 xfs_calc_buf_res(1,
666 XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
667}
668
669/*
670 * Turning off quotas.
671 * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
672 * the superblock for the quota flags: sector size
673 */
674STATIC uint
675xfs_calc_qm_quotaoff_reservation(
676 struct xfs_mount *mp)
677{
678 return sizeof(struct xfs_qoff_logitem) * 2 +
679 xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
680}
681
682/*
683 * End of turning off quotas.
684 * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
685 */
686STATIC uint
687xfs_calc_qm_quotaoff_end_reservation(
688 struct xfs_mount *mp)
689{
690 return sizeof(struct xfs_qoff_logitem) * 2;
691}
692
693/*
694 * Syncing the incore super block changes to disk.
695 * the super block to reflect the changes: sector size
696 */
697STATIC uint
698xfs_calc_sb_reservation(
699 struct xfs_mount *mp)
700{
701 return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
702}
703
704void
705xfs_trans_resv_calc(
706 struct xfs_mount *mp,
707 struct xfs_trans_resv *resp)
708{
709 /*
710 * The following transactions are logged in physical format and
711 * require a permanent reservation on space.
712 */
713 resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
714 resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
715 resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
716
717 resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
718 resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
719 resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
720
721 resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
722 resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT;
723 resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
724
725 resp->tr_link.tr_logres = xfs_calc_link_reservation(mp);
726 resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT;
727 resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
728
729 resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp);
730 resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT;
731 resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
732
733 resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp);
734 resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT;
735 resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
736
737 resp->tr_create.tr_logres = xfs_calc_create_reservation(mp);
738 resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
739 resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
740
741 resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
742 resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
743 resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
744
745 resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp);
746 resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT;
747 resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
748
749 resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp);
750 resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT;
751 resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
752
753 resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp);
754 resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT;
755 resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
756
757 resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp);
758 resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT;
759 resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
760
761 resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp);
762 resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT;
763 resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
764
765 resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp);
766 resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
767 resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
768
769 resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
770 resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
771 resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
772
773 /*
774 * The following transactions are logged in logical format with
775 * a default log count.
776 */
777 resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
778 resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
779
780 resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
781 resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
782
783 resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp);
784 resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
785
786 resp->tr_qm_equotaoff.tr_logres =
787 xfs_calc_qm_quotaoff_end_reservation(mp);
788 resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT;
789
790 resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
791 resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
792
793 /* The following transaction are logged in logical format */
794 resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
795 resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
796 resp->tr_swrite.tr_logres = xfs_calc_swrite_reservation(mp);
797 resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
798 resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
799 resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
800 resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
801 resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
802 resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
803}
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h
new file mode 100644
index 000000000000..de7de9aaad8a
--- /dev/null
+++ b/fs/xfs/xfs_trans_resv.h
@@ -0,0 +1,116 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_TRANS_RESV_H__
19#define __XFS_TRANS_RESV_H__
20
21struct xfs_mount;
22
23/*
24 * structure for maintaining pre-calculated transaction reservations.
25 */
26struct xfs_trans_res {
27 uint tr_logres; /* log space unit in bytes per log ticket */
28 int tr_logcount; /* number of log operations per log ticket */
29 int tr_logflags; /* log flags, currently only used for indicating
30 * a reservation request is permanent or not */
31};
32
33struct xfs_trans_resv {
34 struct xfs_trans_res tr_write; /* extent alloc trans */
35 struct xfs_trans_res tr_itruncate; /* truncate trans */
36 struct xfs_trans_res tr_rename; /* rename trans */
37 struct xfs_trans_res tr_link; /* link trans */
38 struct xfs_trans_res tr_remove; /* unlink trans */
39 struct xfs_trans_res tr_symlink; /* symlink trans */
40 struct xfs_trans_res tr_create; /* create trans */
41 struct xfs_trans_res tr_mkdir; /* mkdir trans */
42 struct xfs_trans_res tr_ifree; /* inode free trans */
43 struct xfs_trans_res tr_ichange; /* inode update trans */
44 struct xfs_trans_res tr_growdata; /* fs data section grow trans */
45 struct xfs_trans_res tr_swrite; /* sync write inode trans */
46 struct xfs_trans_res tr_addafork; /* add inode attr fork trans */
47 struct xfs_trans_res tr_writeid; /* write setuid/setgid file */
48 struct xfs_trans_res tr_attrinval; /* attr fork buffer
49 * invalidation */
50 struct xfs_trans_res tr_attrsetm; /* set/create an attribute at
51 * mount time */
52 struct xfs_trans_res tr_attrsetrt; /* set/create an attribute at
53 * runtime */
54 struct xfs_trans_res tr_attrrm; /* remove an attribute */
55 struct xfs_trans_res tr_clearagi; /* clear agi unlinked bucket */
56 struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */
57 struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */
58 struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */
59 struct xfs_trans_res tr_qm_sbchange; /* change quota flags */
60 struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */
61 struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
62 struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */
63 struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */
64 struct xfs_trans_res tr_sb; /* modify superblock */
65 struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */
66};
67
68/* shorthand way of accessing reservation structure */
69#define M_RES(mp) (&(mp)->m_resv)
70
71/*
72 * Per-extent log reservation for the allocation btree changes
73 * involved in freeing or allocating an extent.
74 * 2 trees * (2 blocks/level * max depth - 1) * block size
75 */
76#define XFS_ALLOCFREE_LOG_RES(mp,nx) \
77 ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1)))
78#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \
79 ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1)))
80
81/*
82 * Per-directory log reservation for any directory change.
83 * dir blocks: (1 btree block per level + data block + free block) * dblock size
84 * bmap btree: (levels + 2) * max depth * block size
85 * v2 directory blocks can be fragmented below the dirblksize down to the fsb
86 * size, so account for that in the DAENTER macros.
87 */
88#define XFS_DIROP_LOG_RES(mp) \
89 (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \
90 (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)))
91#define XFS_DIROP_LOG_COUNT(mp) \
92 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
93 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
94
95/*
96 * Various log count values.
97 */
98#define XFS_DEFAULT_LOG_COUNT 1
99#define XFS_DEFAULT_PERM_LOG_COUNT 2
100#define XFS_ITRUNCATE_LOG_COUNT 2
101#define XFS_INACTIVE_LOG_COUNT 2
102#define XFS_CREATE_LOG_COUNT 2
103#define XFS_MKDIR_LOG_COUNT 3
104#define XFS_SYMLINK_LOG_COUNT 3
105#define XFS_REMOVE_LOG_COUNT 2
106#define XFS_LINK_LOG_COUNT 2
107#define XFS_RENAME_LOG_COUNT 2
108#define XFS_WRITE_LOG_COUNT 2
109#define XFS_ADDAFORK_LOG_COUNT 2
110#define XFS_ATTRINVAL_LOG_COUNT 1
111#define XFS_ATTRSET_LOG_COUNT 3
112#define XFS_ATTRRM_LOG_COUNT 3
113
114void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
115
116#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 61ba1cfa974c..82bbc34d54a3 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -18,42 +18,7 @@
18#ifndef __XFS_TYPES_H__ 18#ifndef __XFS_TYPES_H__
19#define __XFS_TYPES_H__ 19#define __XFS_TYPES_H__
20 20
21#ifdef __KERNEL__ 21typedef __uint32_t prid_t; /* project ID */
22
23/*
24 * Additional type declarations for XFS
25 */
26typedef signed char __int8_t;
27typedef unsigned char __uint8_t;
28typedef signed short int __int16_t;
29typedef unsigned short int __uint16_t;
30typedef signed int __int32_t;
31typedef unsigned int __uint32_t;
32typedef signed long long int __int64_t;
33typedef unsigned long long int __uint64_t;
34
35typedef __uint32_t prid_t; /* project ID */
36typedef __uint32_t inst_t; /* an instruction */
37
38typedef __s64 xfs_off_t; /* <file offset> type */
39typedef unsigned long long xfs_ino_t; /* <inode> type */
40typedef __s64 xfs_daddr_t; /* <disk address> type */
41typedef char * xfs_caddr_t; /* <core address> type */
42typedef __u32 xfs_dev_t;
43typedef __u32 xfs_nlink_t;
44
45/* __psint_t is the same size as a pointer */
46#if (BITS_PER_LONG == 32)
47typedef __int32_t __psint_t;
48typedef __uint32_t __psunsigned_t;
49#elif (BITS_PER_LONG == 64)
50typedef __int64_t __psint_t;
51typedef __uint64_t __psunsigned_t;
52#else
53#error BITS_PER_LONG must be 32 or 64
54#endif
55
56#endif /* __KERNEL__ */
57 22
58typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ 23typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */
59typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */ 24typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */
@@ -146,6 +111,12 @@ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
146#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG) 111#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG)
147 112
148/* 113/*
114 * Inode fork identifiers.
115 */
116#define XFS_DATA_FORK 0
117#define XFS_ATTR_FORK 1
118
119/*
149 * Min numbers of data/attr fork btree root pointers. 120 * Min numbers of data/attr fork btree root pointers.
150 */ 121 */
151#define MINDBTPTRS 3 122#define MINDBTPTRS 3
@@ -169,6 +140,23 @@ typedef enum {
169struct xfs_name { 140struct xfs_name {
170 const unsigned char *name; 141 const unsigned char *name;
171 int len; 142 int len;
143 int type;
172}; 144};
173 145
146/*
147 * uid_t and gid_t are hard-coded to 32 bits in the inode.
148 * Hence, an 'id' in a dquot is 32 bits..
149 */
150typedef __uint32_t xfs_dqid_t;
151
152/*
153 * Constants for bit manipulations.
154 */
155#define XFS_NBBYLOG 3 /* log2(NBBY) */
156#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */
157#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG)
158#define XFS_NBWORD (1 << XFS_NBWORDLOG)
159#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1)
160
161
174#endif /* __XFS_TYPES_H__ */ 162#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
deleted file mode 100644
index 0025c78ac03c..000000000000
--- a/fs/xfs/xfs_utils.c
+++ /dev/null
@@ -1,314 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_trans.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_dir2.h"
26#include "xfs_mount.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_dinode.h"
29#include "xfs_inode.h"
30#include "xfs_inode_item.h"
31#include "xfs_bmap.h"
32#include "xfs_error.h"
33#include "xfs_quota.h"
34#include "xfs_itable.h"
35#include "xfs_utils.h"
36
37
38/*
39 * Allocates a new inode from disk and return a pointer to the
40 * incore copy. This routine will internally commit the current
41 * transaction and allocate a new one if the Space Manager needed
42 * to do an allocation to replenish the inode free-list.
43 *
44 * This routine is designed to be called from xfs_create and
45 * xfs_create_dir.
46 *
47 */
48int
49xfs_dir_ialloc(
50 xfs_trans_t **tpp, /* input: current transaction;
51 output: may be a new transaction. */
52 xfs_inode_t *dp, /* directory within whose allocate
53 the inode. */
54 umode_t mode,
55 xfs_nlink_t nlink,
56 xfs_dev_t rdev,
57 prid_t prid, /* project id */
58 int okalloc, /* ok to allocate new space */
59 xfs_inode_t **ipp, /* pointer to inode; it will be
60 locked. */
61 int *committed)
62
63{
64 xfs_trans_t *tp;
65 xfs_trans_t *ntp;
66 xfs_inode_t *ip;
67 xfs_buf_t *ialloc_context = NULL;
68 int code;
69 uint log_res;
70 uint log_count;
71 void *dqinfo;
72 uint tflags;
73
74 tp = *tpp;
75 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
76
77 /*
78 * xfs_ialloc will return a pointer to an incore inode if
79 * the Space Manager has an available inode on the free
80 * list. Otherwise, it will do an allocation and replenish
81 * the freelist. Since we can only do one allocation per
82 * transaction without deadlocks, we will need to commit the
83 * current transaction and start a new one. We will then
84 * need to call xfs_ialloc again to get the inode.
85 *
86 * If xfs_ialloc did an allocation to replenish the freelist,
87 * it returns the bp containing the head of the freelist as
88 * ialloc_context. We will hold a lock on it across the
89 * transaction commit so that no other process can steal
90 * the inode(s) that we've just allocated.
91 */
92 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
93 &ialloc_context, &ip);
94
95 /*
96 * Return an error if we were unable to allocate a new inode.
97 * This should only happen if we run out of space on disk or
98 * encounter a disk error.
99 */
100 if (code) {
101 *ipp = NULL;
102 return code;
103 }
104 if (!ialloc_context && !ip) {
105 *ipp = NULL;
106 return XFS_ERROR(ENOSPC);
107 }
108
109 /*
110 * If the AGI buffer is non-NULL, then we were unable to get an
111 * inode in one operation. We need to commit the current
112 * transaction and call xfs_ialloc() again. It is guaranteed
113 * to succeed the second time.
114 */
115 if (ialloc_context) {
116 /*
117 * Normally, xfs_trans_commit releases all the locks.
118 * We call bhold to hang on to the ialloc_context across
119 * the commit. Holding this buffer prevents any other
120 * processes from doing any allocations in this
121 * allocation group.
122 */
123 xfs_trans_bhold(tp, ialloc_context);
124 /*
125 * Save the log reservation so we can use
126 * them in the next transaction.
127 */
128 log_res = xfs_trans_get_log_res(tp);
129 log_count = xfs_trans_get_log_count(tp);
130
131 /*
132 * We want the quota changes to be associated with the next
133 * transaction, NOT this one. So, detach the dqinfo from this
134 * and attach it to the next transaction.
135 */
136 dqinfo = NULL;
137 tflags = 0;
138 if (tp->t_dqinfo) {
139 dqinfo = (void *)tp->t_dqinfo;
140 tp->t_dqinfo = NULL;
141 tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
142 tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
143 }
144
145 ntp = xfs_trans_dup(tp);
146 code = xfs_trans_commit(tp, 0);
147 tp = ntp;
148 if (committed != NULL) {
149 *committed = 1;
150 }
151 /*
152 * If we get an error during the commit processing,
153 * release the buffer that is still held and return
154 * to the caller.
155 */
156 if (code) {
157 xfs_buf_relse(ialloc_context);
158 if (dqinfo) {
159 tp->t_dqinfo = dqinfo;
160 xfs_trans_free_dqinfo(tp);
161 }
162 *tpp = ntp;
163 *ipp = NULL;
164 return code;
165 }
166
167 /*
168 * transaction commit worked ok so we can drop the extra ticket
169 * reference that we gained in xfs_trans_dup()
170 */
171 xfs_log_ticket_put(tp->t_ticket);
172 code = xfs_trans_reserve(tp, 0, log_res, 0,
173 XFS_TRANS_PERM_LOG_RES, log_count);
174 /*
175 * Re-attach the quota info that we detached from prev trx.
176 */
177 if (dqinfo) {
178 tp->t_dqinfo = dqinfo;
179 tp->t_flags |= tflags;
180 }
181
182 if (code) {
183 xfs_buf_relse(ialloc_context);
184 *tpp = ntp;
185 *ipp = NULL;
186 return code;
187 }
188 xfs_trans_bjoin(tp, ialloc_context);
189
190 /*
191 * Call ialloc again. Since we've locked out all
192 * other allocations in this allocation group,
193 * this call should always succeed.
194 */
195 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
196 okalloc, &ialloc_context, &ip);
197
198 /*
199 * If we get an error at this point, return to the caller
200 * so that the current transaction can be aborted.
201 */
202 if (code) {
203 *tpp = tp;
204 *ipp = NULL;
205 return code;
206 }
207 ASSERT(!ialloc_context && ip);
208
209 } else {
210 if (committed != NULL)
211 *committed = 0;
212 }
213
214 *ipp = ip;
215 *tpp = tp;
216
217 return 0;
218}
219
220/*
221 * Decrement the link count on an inode & log the change.
222 * If this causes the link count to go to zero, initiate the
223 * logging activity required to truncate a file.
224 */
225int /* error */
226xfs_droplink(
227 xfs_trans_t *tp,
228 xfs_inode_t *ip)
229{
230 int error;
231
232 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
233
234 ASSERT (ip->i_d.di_nlink > 0);
235 ip->i_d.di_nlink--;
236 drop_nlink(VFS_I(ip));
237 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
238
239 error = 0;
240 if (ip->i_d.di_nlink == 0) {
241 /*
242 * We're dropping the last link to this file.
243 * Move the on-disk inode to the AGI unlinked list.
244 * From xfs_inactive() we will pull the inode from
245 * the list and free it.
246 */
247 error = xfs_iunlink(tp, ip);
248 }
249 return error;
250}
251
252/*
253 * This gets called when the inode's version needs to be changed from 1 to 2.
254 * Currently this happens when the nlink field overflows the old 16-bit value
255 * or when chproj is called to change the project for the first time.
256 * As a side effect the superblock version will also get rev'd
257 * to contain the NLINK bit.
258 */
259void
260xfs_bump_ino_vers2(
261 xfs_trans_t *tp,
262 xfs_inode_t *ip)
263{
264 xfs_mount_t *mp;
265
266 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
267 ASSERT(ip->i_d.di_version == 1);
268
269 ip->i_d.di_version = 2;
270 ip->i_d.di_onlink = 0;
271 memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
272 mp = tp->t_mountp;
273 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
274 spin_lock(&mp->m_sb_lock);
275 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
276 xfs_sb_version_addnlink(&mp->m_sb);
277 spin_unlock(&mp->m_sb_lock);
278 xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
279 } else {
280 spin_unlock(&mp->m_sb_lock);
281 }
282 }
283 /* Caller must log the inode */
284}
285
286/*
287 * Increment the link count on an inode & log the change.
288 */
289int
290xfs_bumplink(
291 xfs_trans_t *tp,
292 xfs_inode_t *ip)
293{
294 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
295
296 ASSERT(ip->i_d.di_nlink > 0);
297 ip->i_d.di_nlink++;
298 inc_nlink(VFS_I(ip));
299 if ((ip->i_d.di_version == 1) &&
300 (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
301 /*
302 * The inode has increased its number of links beyond
303 * what can fit in an old format inode. It now needs
304 * to be converted to a version 2 inode with a 32 bit
305 * link count. If this is the first inode in the file
306 * system to do this, then we need to bump the superblock
307 * version number as well.
308 */
309 xfs_bump_ino_vers2(tp, ip);
310 }
311
312 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
313 return 0;
314}
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
deleted file mode 100644
index 5eeab4690cfe..000000000000
--- a/fs/xfs/xfs_utils.h
+++ /dev/null
@@ -1,27 +0,0 @@
1/*
2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_UTILS_H__
19#define __XFS_UTILS_H__
20
21extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, umode_t, xfs_nlink_t,
22 xfs_dev_t, prid_t, int, xfs_inode_t **, int *);
23extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
24extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
25extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
26
27#endif /* __XFS_UTILS_H__ */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
deleted file mode 100644
index dc730ac272be..000000000000
--- a/fs/xfs/xfs_vnodeops.c
+++ /dev/null
@@ -1,1870 +0,0 @@
1/*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * Copyright (c) 2012 Red Hat, Inc.
4 * All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 */
19
20#include "xfs.h"
21#include "xfs_fs.h"
22#include "xfs_types.h"
23#include "xfs_bit.h"
24#include "xfs_log.h"
25#include "xfs_trans.h"
26#include "xfs_sb.h"
27#include "xfs_ag.h"
28#include "xfs_dir2.h"
29#include "xfs_mount.h"
30#include "xfs_da_btree.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_ialloc_btree.h"
33#include "xfs_dinode.h"
34#include "xfs_inode.h"
35#include "xfs_inode_item.h"
36#include "xfs_itable.h"
37#include "xfs_ialloc.h"
38#include "xfs_alloc.h"
39#include "xfs_bmap.h"
40#include "xfs_acl.h"
41#include "xfs_attr.h"
42#include "xfs_error.h"
43#include "xfs_quota.h"
44#include "xfs_utils.h"
45#include "xfs_rtalloc.h"
46#include "xfs_trans_space.h"
47#include "xfs_log_priv.h"
48#include "xfs_filestream.h"
49#include "xfs_vnodeops.h"
50#include "xfs_trace.h"
51#include "xfs_icache.h"
52#include "xfs_symlink.h"
53
54
55/*
56 * This is called by xfs_inactive to free any blocks beyond eof
57 * when the link count isn't zero and by xfs_dm_punch_hole() when
58 * punching a hole to EOF.
59 */
60int
61xfs_free_eofblocks(
62 xfs_mount_t *mp,
63 xfs_inode_t *ip,
64 bool need_iolock)
65{
66 xfs_trans_t *tp;
67 int error;
68 xfs_fileoff_t end_fsb;
69 xfs_fileoff_t last_fsb;
70 xfs_filblks_t map_len;
71 int nimaps;
72 xfs_bmbt_irec_t imap;
73
74 /*
75 * Figure out if there are any blocks beyond the end
76 * of the file. If not, then there is nothing to do.
77 */
78 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
79 last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
80 if (last_fsb <= end_fsb)
81 return 0;
82 map_len = last_fsb - end_fsb;
83
84 nimaps = 1;
85 xfs_ilock(ip, XFS_ILOCK_SHARED);
86 error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
87 xfs_iunlock(ip, XFS_ILOCK_SHARED);
88
89 if (!error && (nimaps != 0) &&
90 (imap.br_startblock != HOLESTARTBLOCK ||
91 ip->i_delayed_blks)) {
92 /*
93 * Attach the dquots to the inode up front.
94 */
95 error = xfs_qm_dqattach(ip, 0);
96 if (error)
97 return error;
98
99 /*
100 * There are blocks after the end of file.
101 * Free them up now by truncating the file to
102 * its current size.
103 */
104 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
105
106 if (need_iolock) {
107 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
108 xfs_trans_cancel(tp, 0);
109 return EAGAIN;
110 }
111 }
112
113 error = xfs_trans_reserve(tp, 0,
114 XFS_ITRUNCATE_LOG_RES(mp),
115 0, XFS_TRANS_PERM_LOG_RES,
116 XFS_ITRUNCATE_LOG_COUNT);
117 if (error) {
118 ASSERT(XFS_FORCED_SHUTDOWN(mp));
119 xfs_trans_cancel(tp, 0);
120 if (need_iolock)
121 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
122 return error;
123 }
124
125 xfs_ilock(ip, XFS_ILOCK_EXCL);
126 xfs_trans_ijoin(tp, ip, 0);
127
128 /*
129 * Do not update the on-disk file size. If we update the
130 * on-disk file size and then the system crashes before the
131 * contents of the file are flushed to disk then the files
132 * may be full of holes (ie NULL files bug).
133 */
134 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
135 XFS_ISIZE(ip));
136 if (error) {
137 /*
138 * If we get an error at this point we simply don't
139 * bother truncating the file.
140 */
141 xfs_trans_cancel(tp,
142 (XFS_TRANS_RELEASE_LOG_RES |
143 XFS_TRANS_ABORT));
144 } else {
145 error = xfs_trans_commit(tp,
146 XFS_TRANS_RELEASE_LOG_RES);
147 if (!error)
148 xfs_inode_clear_eofblocks_tag(ip);
149 }
150
151 xfs_iunlock(ip, XFS_ILOCK_EXCL);
152 if (need_iolock)
153 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
154 }
155 return error;
156}
157
158int
159xfs_release(
160 xfs_inode_t *ip)
161{
162 xfs_mount_t *mp = ip->i_mount;
163 int error;
164
165 if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
166 return 0;
167
168 /* If this is a read-only mount, don't do this (would generate I/O) */
169 if (mp->m_flags & XFS_MOUNT_RDONLY)
170 return 0;
171
172 if (!XFS_FORCED_SHUTDOWN(mp)) {
173 int truncated;
174
175 /*
176 * If we are using filestreams, and we have an unlinked
177 * file that we are processing the last close on, then nothing
178 * will be able to reopen and write to this file. Purge this
179 * inode from the filestreams cache so that it doesn't delay
180 * teardown of the inode.
181 */
182 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
183 xfs_filestream_deassociate(ip);
184
185 /*
186 * If we previously truncated this file and removed old data
187 * in the process, we want to initiate "early" writeout on
188 * the last close. This is an attempt to combat the notorious
189 * NULL files problem which is particularly noticeable from a
190 * truncate down, buffered (re-)write (delalloc), followed by
191 * a crash. What we are effectively doing here is
192 * significantly reducing the time window where we'd otherwise
193 * be exposed to that problem.
194 */
195 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
196 if (truncated) {
197 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
198 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
199 error = -filemap_flush(VFS_I(ip)->i_mapping);
200 if (error)
201 return error;
202 }
203 }
204 }
205
206 if (ip->i_d.di_nlink == 0)
207 return 0;
208
209 if (xfs_can_free_eofblocks(ip, false)) {
210
211 /*
212 * If we can't get the iolock just skip truncating the blocks
213 * past EOF because we could deadlock with the mmap_sem
214 * otherwise. We'll get another chance to drop them once the
215 * last reference to the inode is dropped, so we'll never leak
216 * blocks permanently.
217 *
218 * Further, check if the inode is being opened, written and
219 * closed frequently and we have delayed allocation blocks
220 * outstanding (e.g. streaming writes from the NFS server),
221 * truncating the blocks past EOF will cause fragmentation to
222 * occur.
223 *
224 * In this case don't do the truncation, either, but we have to
225 * be careful how we detect this case. Blocks beyond EOF show
226 * up as i_delayed_blks even when the inode is clean, so we
227 * need to truncate them away first before checking for a dirty
228 * release. Hence on the first dirty close we will still remove
229 * the speculative allocation, but after that we will leave it
230 * in place.
231 */
232 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
233 return 0;
234
235 error = xfs_free_eofblocks(mp, ip, true);
236 if (error && error != EAGAIN)
237 return error;
238
239 /* delalloc blocks after truncation means it really is dirty */
240 if (ip->i_delayed_blks)
241 xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
242 }
243 return 0;
244}
245
246/*
247 * xfs_inactive
248 *
249 * This is called when the vnode reference count for the vnode
250 * goes to zero. If the file has been unlinked, then it must
251 * now be truncated. Also, we clear all of the read-ahead state
252 * kept for the inode here since the file is now closed.
253 */
254int
255xfs_inactive(
256 xfs_inode_t *ip)
257{
258 xfs_bmap_free_t free_list;
259 xfs_fsblock_t first_block;
260 int committed;
261 xfs_trans_t *tp;
262 xfs_mount_t *mp;
263 int error;
264 int truncate = 0;
265
266 /*
267 * If the inode is already free, then there can be nothing
268 * to clean up here.
269 */
270 if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
271 ASSERT(ip->i_df.if_real_bytes == 0);
272 ASSERT(ip->i_df.if_broot_bytes == 0);
273 return VN_INACTIVE_CACHE;
274 }
275
276 mp = ip->i_mount;
277
278 error = 0;
279
280 /* If this is a read-only mount, don't do this (would generate I/O) */
281 if (mp->m_flags & XFS_MOUNT_RDONLY)
282 goto out;
283
284 if (ip->i_d.di_nlink != 0) {
285 /*
286 * force is true because we are evicting an inode from the
287 * cache. Post-eof blocks must be freed, lest we end up with
288 * broken free space accounting.
289 */
290 if (xfs_can_free_eofblocks(ip, true)) {
291 error = xfs_free_eofblocks(mp, ip, false);
292 if (error)
293 return VN_INACTIVE_CACHE;
294 }
295 goto out;
296 }
297
298 if (S_ISREG(ip->i_d.di_mode) &&
299 (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
300 ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
301 truncate = 1;
302
303 error = xfs_qm_dqattach(ip, 0);
304 if (error)
305 return VN_INACTIVE_CACHE;
306
307 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
308 error = xfs_trans_reserve(tp, 0,
309 (truncate || S_ISLNK(ip->i_d.di_mode)) ?
310 XFS_ITRUNCATE_LOG_RES(mp) :
311 XFS_IFREE_LOG_RES(mp),
312 0,
313 XFS_TRANS_PERM_LOG_RES,
314 XFS_ITRUNCATE_LOG_COUNT);
315 if (error) {
316 ASSERT(XFS_FORCED_SHUTDOWN(mp));
317 xfs_trans_cancel(tp, 0);
318 return VN_INACTIVE_CACHE;
319 }
320
321 xfs_ilock(ip, XFS_ILOCK_EXCL);
322 xfs_trans_ijoin(tp, ip, 0);
323
324 if (S_ISLNK(ip->i_d.di_mode)) {
325 error = xfs_inactive_symlink(ip, &tp);
326 if (error)
327 goto out_cancel;
328 } else if (truncate) {
329 ip->i_d.di_size = 0;
330 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
331
332 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
333 if (error)
334 goto out_cancel;
335
336 ASSERT(ip->i_d.di_nextents == 0);
337 }
338
339 /*
340 * If there are attributes associated with the file then blow them away
341 * now. The code calls a routine that recursively deconstructs the
342 * attribute fork. We need to just commit the current transaction
343 * because we can't use it for xfs_attr_inactive().
344 */
345 if (ip->i_d.di_anextents > 0) {
346 ASSERT(ip->i_d.di_forkoff != 0);
347
348 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
349 if (error)
350 goto out_unlock;
351
352 xfs_iunlock(ip, XFS_ILOCK_EXCL);
353
354 error = xfs_attr_inactive(ip);
355 if (error)
356 goto out;
357
358 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
359 error = xfs_trans_reserve(tp, 0,
360 XFS_IFREE_LOG_RES(mp),
361 0, XFS_TRANS_PERM_LOG_RES,
362 XFS_INACTIVE_LOG_COUNT);
363 if (error) {
364 xfs_trans_cancel(tp, 0);
365 goto out;
366 }
367
368 xfs_ilock(ip, XFS_ILOCK_EXCL);
369 xfs_trans_ijoin(tp, ip, 0);
370 }
371
372 if (ip->i_afp)
373 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
374
375 ASSERT(ip->i_d.di_anextents == 0);
376
377 /*
378 * Free the inode.
379 */
380 xfs_bmap_init(&free_list, &first_block);
381 error = xfs_ifree(tp, ip, &free_list);
382 if (error) {
383 /*
384 * If we fail to free the inode, shut down. The cancel
385 * might do that, we need to make sure. Otherwise the
386 * inode might be lost for a long time or forever.
387 */
388 if (!XFS_FORCED_SHUTDOWN(mp)) {
389 xfs_notice(mp, "%s: xfs_ifree returned error %d",
390 __func__, error);
391 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
392 }
393 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
394 } else {
395 /*
396 * Credit the quota account(s). The inode is gone.
397 */
398 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
399
400 /*
401 * Just ignore errors at this point. There is nothing we can
402 * do except to try to keep going. Make sure it's not a silent
403 * error.
404 */
405 error = xfs_bmap_finish(&tp, &free_list, &committed);
406 if (error)
407 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
408 __func__, error);
409 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
410 if (error)
411 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
412 __func__, error);
413 }
414
415 /*
416 * Release the dquots held by inode, if any.
417 */
418 xfs_qm_dqdetach(ip);
419out_unlock:
420 xfs_iunlock(ip, XFS_ILOCK_EXCL);
421out:
422 return VN_INACTIVE_CACHE;
423out_cancel:
424 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
425 goto out_unlock;
426}
427
428/*
429 * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
430 * is allowed, otherwise it has to be an exact match. If a CI match is found,
431 * ci_name->name will point to a the actual name (caller must free) or
432 * will be set to NULL if an exact match is found.
433 */
434int
435xfs_lookup(
436 xfs_inode_t *dp,
437 struct xfs_name *name,
438 xfs_inode_t **ipp,
439 struct xfs_name *ci_name)
440{
441 xfs_ino_t inum;
442 int error;
443 uint lock_mode;
444
445 trace_xfs_lookup(dp, name);
446
447 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
448 return XFS_ERROR(EIO);
449
450 lock_mode = xfs_ilock_map_shared(dp);
451 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
452 xfs_iunlock_map_shared(dp, lock_mode);
453
454 if (error)
455 goto out;
456
457 error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
458 if (error)
459 goto out_free_name;
460
461 return 0;
462
463out_free_name:
464 if (ci_name)
465 kmem_free(ci_name->name);
466out:
467 *ipp = NULL;
468 return error;
469}
470
471int
472xfs_create(
473 xfs_inode_t *dp,
474 struct xfs_name *name,
475 umode_t mode,
476 xfs_dev_t rdev,
477 xfs_inode_t **ipp)
478{
479 int is_dir = S_ISDIR(mode);
480 struct xfs_mount *mp = dp->i_mount;
481 struct xfs_inode *ip = NULL;
482 struct xfs_trans *tp = NULL;
483 int error;
484 xfs_bmap_free_t free_list;
485 xfs_fsblock_t first_block;
486 bool unlock_dp_on_error = false;
487 uint cancel_flags;
488 int committed;
489 prid_t prid;
490 struct xfs_dquot *udqp = NULL;
491 struct xfs_dquot *gdqp = NULL;
492 struct xfs_dquot *pdqp = NULL;
493 uint resblks;
494 uint log_res;
495 uint log_count;
496
497 trace_xfs_create(dp, name);
498
499 if (XFS_FORCED_SHUTDOWN(mp))
500 return XFS_ERROR(EIO);
501
502 if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
503 prid = xfs_get_projid(dp);
504 else
505 prid = XFS_PROJID_DEFAULT;
506
507 /*
508 * Make sure that we have allocated dquot(s) on disk.
509 */
510 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
511 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
512 &udqp, &gdqp, &pdqp);
513 if (error)
514 return error;
515
516 if (is_dir) {
517 rdev = 0;
518 resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
519 log_res = XFS_MKDIR_LOG_RES(mp);
520 log_count = XFS_MKDIR_LOG_COUNT;
521 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
522 } else {
523 resblks = XFS_CREATE_SPACE_RES(mp, name->len);
524 log_res = XFS_CREATE_LOG_RES(mp);
525 log_count = XFS_CREATE_LOG_COUNT;
526 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
527 }
528
529 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
530
531 /*
532 * Initially assume that the file does not exist and
533 * reserve the resources for that case. If that is not
534 * the case we'll drop the one we have and get a more
535 * appropriate transaction later.
536 */
537 error = xfs_trans_reserve(tp, resblks, log_res, 0,
538 XFS_TRANS_PERM_LOG_RES, log_count);
539 if (error == ENOSPC) {
540 /* flush outstanding delalloc blocks and retry */
541 xfs_flush_inodes(mp);
542 error = xfs_trans_reserve(tp, resblks, log_res, 0,
543 XFS_TRANS_PERM_LOG_RES, log_count);
544 }
545 if (error == ENOSPC) {
546 /* No space at all so try a "no-allocation" reservation */
547 resblks = 0;
548 error = xfs_trans_reserve(tp, 0, log_res, 0,
549 XFS_TRANS_PERM_LOG_RES, log_count);
550 }
551 if (error) {
552 cancel_flags = 0;
553 goto out_trans_cancel;
554 }
555
556 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
557 unlock_dp_on_error = true;
558
559 xfs_bmap_init(&free_list, &first_block);
560
561 /*
562 * Reserve disk quota and the inode.
563 */
564 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
565 pdqp, resblks, 1, 0);
566 if (error)
567 goto out_trans_cancel;
568
569 error = xfs_dir_canenter(tp, dp, name, resblks);
570 if (error)
571 goto out_trans_cancel;
572
573 /*
574 * A newly created regular or special file just has one directory
575 * entry pointing to them, but a directory also the "." entry
576 * pointing to itself.
577 */
578 error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
579 prid, resblks > 0, &ip, &committed);
580 if (error) {
581 if (error == ENOSPC)
582 goto out_trans_cancel;
583 goto out_trans_abort;
584 }
585
586 /*
587 * Now we join the directory inode to the transaction. We do not do it
588 * earlier because xfs_dir_ialloc might commit the previous transaction
589 * (and release all the locks). An error from here on will result in
590 * the transaction cancel unlocking dp so don't do it explicitly in the
591 * error path.
592 */
593 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
594 unlock_dp_on_error = false;
595
596 error = xfs_dir_createname(tp, dp, name, ip->i_ino,
597 &first_block, &free_list, resblks ?
598 resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
599 if (error) {
600 ASSERT(error != ENOSPC);
601 goto out_trans_abort;
602 }
603 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
604 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
605
606 if (is_dir) {
607 error = xfs_dir_init(tp, ip, dp);
608 if (error)
609 goto out_bmap_cancel;
610
611 error = xfs_bumplink(tp, dp);
612 if (error)
613 goto out_bmap_cancel;
614 }
615
616 /*
617 * If this is a synchronous mount, make sure that the
618 * create transaction goes to disk before returning to
619 * the user.
620 */
621 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
622 xfs_trans_set_sync(tp);
623
624 /*
625 * Attach the dquot(s) to the inodes and modify them incore.
626 * These ids of the inode couldn't have changed since the new
627 * inode has been locked ever since it was created.
628 */
629 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
630
631 error = xfs_bmap_finish(&tp, &free_list, &committed);
632 if (error)
633 goto out_bmap_cancel;
634
635 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
636 if (error)
637 goto out_release_inode;
638
639 xfs_qm_dqrele(udqp);
640 xfs_qm_dqrele(gdqp);
641 xfs_qm_dqrele(pdqp);
642
643 *ipp = ip;
644 return 0;
645
646 out_bmap_cancel:
647 xfs_bmap_cancel(&free_list);
648 out_trans_abort:
649 cancel_flags |= XFS_TRANS_ABORT;
650 out_trans_cancel:
651 xfs_trans_cancel(tp, cancel_flags);
652 out_release_inode:
653 /*
654 * Wait until after the current transaction is aborted to
655 * release the inode. This prevents recursive transactions
656 * and deadlocks from xfs_inactive.
657 */
658 if (ip)
659 IRELE(ip);
660
661 xfs_qm_dqrele(udqp);
662 xfs_qm_dqrele(gdqp);
663 xfs_qm_dqrele(pdqp);
664
665 if (unlock_dp_on_error)
666 xfs_iunlock(dp, XFS_ILOCK_EXCL);
667 return error;
668}
669
670#ifdef DEBUG
671int xfs_locked_n;
672int xfs_small_retries;
673int xfs_middle_retries;
674int xfs_lots_retries;
675int xfs_lock_delays;
676#endif
677
678/*
679 * Bump the subclass so xfs_lock_inodes() acquires each lock with
680 * a different value
681 */
682static inline int
683xfs_lock_inumorder(int lock_mode, int subclass)
684{
685 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
686 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
687 if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
688 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
689
690 return lock_mode;
691}
692
693/*
694 * The following routine will lock n inodes in exclusive mode.
695 * We assume the caller calls us with the inodes in i_ino order.
696 *
697 * We need to detect deadlock where an inode that we lock
698 * is in the AIL and we start waiting for another inode that is locked
699 * by a thread in a long running transaction (such as truncate). This can
700 * result in deadlock since the long running trans might need to wait
701 * for the inode we just locked in order to push the tail and free space
702 * in the log.
703 */
704void
705xfs_lock_inodes(
706 xfs_inode_t **ips,
707 int inodes,
708 uint lock_mode)
709{
710 int attempts = 0, i, j, try_lock;
711 xfs_log_item_t *lp;
712
713 ASSERT(ips && (inodes >= 2)); /* we need at least two */
714
715 try_lock = 0;
716 i = 0;
717
718again:
719 for (; i < inodes; i++) {
720 ASSERT(ips[i]);
721
722 if (i && (ips[i] == ips[i-1])) /* Already locked */
723 continue;
724
725 /*
726 * If try_lock is not set yet, make sure all locked inodes
727 * are not in the AIL.
728 * If any are, set try_lock to be used later.
729 */
730
731 if (!try_lock) {
732 for (j = (i - 1); j >= 0 && !try_lock; j--) {
733 lp = (xfs_log_item_t *)ips[j]->i_itemp;
734 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
735 try_lock++;
736 }
737 }
738 }
739
740 /*
741 * If any of the previous locks we have locked is in the AIL,
742 * we must TRY to get the second and subsequent locks. If
743 * we can't get any, we must release all we have
744 * and try again.
745 */
746
747 if (try_lock) {
748 /* try_lock must be 0 if i is 0. */
749 /*
750 * try_lock means we have an inode locked
751 * that is in the AIL.
752 */
753 ASSERT(i != 0);
754 if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
755 attempts++;
756
757 /*
758 * Unlock all previous guys and try again.
759 * xfs_iunlock will try to push the tail
760 * if the inode is in the AIL.
761 */
762
763 for(j = i - 1; j >= 0; j--) {
764
765 /*
766 * Check to see if we've already
767 * unlocked this one.
768 * Not the first one going back,
769 * and the inode ptr is the same.
770 */
771 if ((j != (i - 1)) && ips[j] ==
772 ips[j+1])
773 continue;
774
775 xfs_iunlock(ips[j], lock_mode);
776 }
777
778 if ((attempts % 5) == 0) {
779 delay(1); /* Don't just spin the CPU */
780#ifdef DEBUG
781 xfs_lock_delays++;
782#endif
783 }
784 i = 0;
785 try_lock = 0;
786 goto again;
787 }
788 } else {
789 xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
790 }
791 }
792
793#ifdef DEBUG
794 if (attempts) {
795 if (attempts < 5) xfs_small_retries++;
796 else if (attempts < 100) xfs_middle_retries++;
797 else xfs_lots_retries++;
798 } else {
799 xfs_locked_n++;
800 }
801#endif
802}
803
804/*
805 * xfs_lock_two_inodes() can only be used to lock one type of lock
806 * at a time - the iolock or the ilock, but not both at once. If
807 * we lock both at once, lockdep will report false positives saying
808 * we have violated locking orders.
809 */
810void
811xfs_lock_two_inodes(
812 xfs_inode_t *ip0,
813 xfs_inode_t *ip1,
814 uint lock_mode)
815{
816 xfs_inode_t *temp;
817 int attempts = 0;
818 xfs_log_item_t *lp;
819
820 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
821 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
822 ASSERT(ip0->i_ino != ip1->i_ino);
823
824 if (ip0->i_ino > ip1->i_ino) {
825 temp = ip0;
826 ip0 = ip1;
827 ip1 = temp;
828 }
829
830 again:
831 xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
832
833 /*
834 * If the first lock we have locked is in the AIL, we must TRY to get
835 * the second lock. If we can't get it, we must release the first one
836 * and try again.
837 */
838 lp = (xfs_log_item_t *)ip0->i_itemp;
839 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
840 if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
841 xfs_iunlock(ip0, lock_mode);
842 if ((++attempts % 5) == 0)
843 delay(1); /* Don't just spin the CPU */
844 goto again;
845 }
846 } else {
847 xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
848 }
849}
850
/*
 * Remove the directory entry @name in @dp that refers to @ip and drop the
 * corresponding link counts.  Handles both regular removal and rmdir: for
 * a directory, @ip must have exactly the "." and ".." links left and be
 * otherwise empty.  Returns 0 or a positive XFS errno.
 */
int
xfs_remove(
	xfs_inode_t	*dp,
	struct xfs_name	*name,
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp = dp->i_mount;
	xfs_trans_t	*tp = NULL;
	int		is_dir = S_ISDIR(ip->i_d.di_mode);
	int		error = 0;
	xfs_bmap_free_t	free_list;
	xfs_fsblock_t	first_block;
	int		cancel_flags;
	int		committed;
	int		link_zero;
	uint		resblks;
	uint		log_count;

	trace_xfs_remove(dp, name);

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/* Attach dquots to both inodes before starting the transaction. */
	error = xfs_qm_dqattach(dp, 0);
	if (error)
		goto std_return;

	error = xfs_qm_dqattach(ip, 0);
	if (error)
		goto std_return;

	if (is_dir) {
		tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
		log_count = XFS_DEFAULT_LOG_COUNT;
	} else {
		tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
		log_count = XFS_REMOVE_LOG_COUNT;
	}
	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;

	/*
	 * We try to get the real space reservation first,
	 * allowing for directory btree deletion(s) implying
	 * possible bmap insert(s).  If we can't get the space
	 * reservation then we use 0 instead, and avoid the bmap
	 * btree insert(s) in the directory code by, if the bmap
	 * insert tries to happen, instead trimming the LAST
	 * block from the directory.
	 */
	resblks = XFS_REMOVE_SPACE_RES(mp);
	error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
				  XFS_TRANS_PERM_LOG_RES, log_count);
	if (error == ENOSPC) {
		resblks = 0;
		error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
					  XFS_TRANS_PERM_LOG_RES, log_count);
	}
	if (error) {
		ASSERT(error != ENOSPC);
		cancel_flags = 0;	/* nothing dirtied yet: plain cancel */
		goto out_trans_cancel;
	}

	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);

	/* Transaction now owns the ilocks; it unlocks them on commit/cancel. */
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	/*
	 * If we're removing a directory perform some additional validation.
	 */
	if (is_dir) {
		ASSERT(ip->i_d.di_nlink >= 2);
		/* nlink == 2 means only "." and ".." remain. */
		if (ip->i_d.di_nlink != 2) {
			error = XFS_ERROR(ENOTEMPTY);
			goto out_trans_cancel;
		}
		if (!xfs_dir_isempty(ip)) {
			error = XFS_ERROR(ENOTEMPTY);
			goto out_trans_cancel;
		}
	}

	xfs_bmap_init(&free_list, &first_block);
	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
					&first_block, &free_list, resblks);
	if (error) {
		ASSERT(error != ENOENT);
		goto out_bmap_cancel;
	}
	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	if (is_dir) {
		/*
		 * Drop the link from ip's "..".
		 */
		error = xfs_droplink(tp, dp);
		if (error)
			goto out_bmap_cancel;

		/*
		 * Drop the "." link from ip to self.
		 */
		error = xfs_droplink(tp, ip);
		if (error)
			goto out_bmap_cancel;
	} else {
		/*
		 * When removing a non-directory we need to log the parent
		 * inode here.  For a directory this is done implicitly
		 * by the xfs_droplink call for the ".." entry.
		 */
		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
	}

	/*
	 * Drop the link from dp to ip.
	 */
	error = xfs_droplink(tp, ip);
	if (error)
		goto out_bmap_cancel;

	/*
	 * Determine if this is the last link while
	 * we are in the transaction.
	 */
	link_zero = (ip->i_d.di_nlink == 0);

	/*
	 * If this is a synchronous mount, make sure that the
	 * remove transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
		xfs_trans_set_sync(tp);

	error = xfs_bmap_finish(&tp, &free_list, &committed);
	if (error)
		goto out_bmap_cancel;

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
	if (error)
		goto std_return;

	/*
	 * If we are using filestreams, kill the stream association.
	 * If the file is still open it may get a new one but that
	 * will get killed on last close in xfs_close() so we don't
	 * have to worry about that.
	 */
	if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
		xfs_filestream_deassociate(ip);

	return 0;

 out_bmap_cancel:
	xfs_bmap_cancel(&free_list);
	cancel_flags |= XFS_TRANS_ABORT;
 out_trans_cancel:
	xfs_trans_cancel(tp, cancel_flags);
 std_return:
	return error;
}
1014
/*
 * Create a new hard link @target_name in directory @tdp pointing at the
 * existing inode @sip.  Directories may not be hard-linked (asserted
 * below).  Returns 0 or a positive XFS errno.
 */
int
xfs_link(
	xfs_inode_t	*tdp,
	xfs_inode_t	*sip,
	struct xfs_name	*target_name)
{
	xfs_mount_t	*mp = tdp->i_mount;
	xfs_trans_t	*tp;
	int		error;
	xfs_bmap_free_t	free_list;
	xfs_fsblock_t	first_block;
	int		cancel_flags;
	int		committed;
	int		resblks;

	trace_xfs_link(tdp, target_name);

	ASSERT(!S_ISDIR(sip->i_d.di_mode));

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	/* Attach dquots to both inodes before starting the transaction. */
	error = xfs_qm_dqattach(sip, 0);
	if (error)
		goto std_return;

	error = xfs_qm_dqattach(tdp, 0);
	if (error)
		goto std_return;

	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
	/* Try a full reservation first, fall back to 0 blocks on ENOSPC. */
	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
	error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
			XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
	if (error == ENOSPC) {
		resblks = 0;
		error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
				XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
	}
	if (error) {
		cancel_flags = 0;	/* nothing dirtied yet: plain cancel */
		goto error_return;
	}

	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);

	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);

	/*
	 * If we are using project inheritance, we only allow hard link
	 * creation in our tree when the project IDs are the same; else
	 * the tree quota mechanism could be circumvented.
	 */
	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
		     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
		error = XFS_ERROR(EXDEV);
		goto error_return;
	}

	error = xfs_dir_canenter(tp, tdp, target_name, resblks);
	if (error)
		goto error_return;

	xfs_bmap_init(&free_list, &first_block);

	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
					&first_block, &free_list, resblks);
	if (error)
		goto abort_return;
	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);

	error = xfs_bumplink(tp, sip);
	if (error)
		goto abort_return;

	/*
	 * If this is a synchronous mount, make sure that the
	 * link transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
		xfs_trans_set_sync(tp);
	}

	error = xfs_bmap_finish (&tp, &free_list, &committed);
	if (error) {
		xfs_bmap_cancel(&free_list);
		goto abort_return;
	}

	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);

 abort_return:
	cancel_flags |= XFS_TRANS_ABORT;
 error_return:
	xfs_trans_cancel(tp, cancel_flags);
 std_return:
	return error;
}
1117
1118int
1119xfs_set_dmattrs(
1120 xfs_inode_t *ip,
1121 u_int evmask,
1122 u_int16_t state)
1123{
1124 xfs_mount_t *mp = ip->i_mount;
1125 xfs_trans_t *tp;
1126 int error;
1127
1128 if (!capable(CAP_SYS_ADMIN))
1129 return XFS_ERROR(EPERM);
1130
1131 if (XFS_FORCED_SHUTDOWN(mp))
1132 return XFS_ERROR(EIO);
1133
1134 tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
1135 error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
1136 if (error) {
1137 xfs_trans_cancel(tp, 0);
1138 return error;
1139 }
1140 xfs_ilock(ip, XFS_ILOCK_EXCL);
1141 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1142
1143 ip->i_d.di_dmevmask = evmask;
1144 ip->i_d.di_dmstate = state;
1145
1146 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1147 error = xfs_trans_commit(tp, 0);
1148
1149 return error;
1150}
1151
1152/*
1153 * xfs_alloc_file_space()
1154 * This routine allocates disk space for the given file.
1155 *
1156 * If alloc_type == 0, this request is for an ALLOCSP type
1157 * request which will change the file size. In this case, no
1158 * DMAPI event will be generated by the call. A TRUNCATE event
1159 * will be generated later by xfs_setattr.
1160 *
1161 * If alloc_type != 0, this request is for a RESVSP type
1162 * request, and a DMAPI DM_EVENT_WRITE will be generated if the
1163 * lower block boundary byte address is less than the file's
1164 * length.
1165 *
1166 * RETURNS:
1167 * 0 on success
1168 * errno on error
1169 *
1170 */
1171STATIC int
1172xfs_alloc_file_space(
1173 xfs_inode_t *ip,
1174 xfs_off_t offset,
1175 xfs_off_t len,
1176 int alloc_type,
1177 int attr_flags)
1178{
1179 xfs_mount_t *mp = ip->i_mount;
1180 xfs_off_t count;
1181 xfs_filblks_t allocated_fsb;
1182 xfs_filblks_t allocatesize_fsb;
1183 xfs_extlen_t extsz, temp;
1184 xfs_fileoff_t startoffset_fsb;
1185 xfs_fsblock_t firstfsb;
1186 int nimaps;
1187 int quota_flag;
1188 int rt;
1189 xfs_trans_t *tp;
1190 xfs_bmbt_irec_t imaps[1], *imapp;
1191 xfs_bmap_free_t free_list;
1192 uint qblocks, resblks, resrtextents;
1193 int committed;
1194 int error;
1195
1196 trace_xfs_alloc_file_space(ip);
1197
1198 if (XFS_FORCED_SHUTDOWN(mp))
1199 return XFS_ERROR(EIO);
1200
1201 error = xfs_qm_dqattach(ip, 0);
1202 if (error)
1203 return error;
1204
1205 if (len <= 0)
1206 return XFS_ERROR(EINVAL);
1207
1208 rt = XFS_IS_REALTIME_INODE(ip);
1209 extsz = xfs_get_extsz_hint(ip);
1210
1211 count = len;
1212 imapp = &imaps[0];
1213 nimaps = 1;
1214 startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
1215 allocatesize_fsb = XFS_B_TO_FSB(mp, count);
1216
1217 /*
1218 * Allocate file space until done or until there is an error
1219 */
1220 while (allocatesize_fsb && !error) {
1221 xfs_fileoff_t s, e;
1222
1223 /*
1224 * Determine space reservations for data/realtime.
1225 */
1226 if (unlikely(extsz)) {
1227 s = startoffset_fsb;
1228 do_div(s, extsz);
1229 s *= extsz;
1230 e = startoffset_fsb + allocatesize_fsb;
1231 if ((temp = do_mod(startoffset_fsb, extsz)))
1232 e += temp;
1233 if ((temp = do_mod(e, extsz)))
1234 e += extsz - temp;
1235 } else {
1236 s = 0;
1237 e = allocatesize_fsb;
1238 }
1239
1240 /*
1241 * The transaction reservation is limited to a 32-bit block
1242 * count, hence we need to limit the number of blocks we are
1243 * trying to reserve to avoid an overflow. We can't allocate
1244 * more than @nimaps extents, and an extent is limited on disk
1245 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
1246 */
1247 resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
1248 if (unlikely(rt)) {
1249 resrtextents = qblocks = resblks;
1250 resrtextents /= mp->m_sb.sb_rextsize;
1251 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1252 quota_flag = XFS_QMOPT_RES_RTBLKS;
1253 } else {
1254 resrtextents = 0;
1255 resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
1256 quota_flag = XFS_QMOPT_RES_REGBLKS;
1257 }
1258
1259 /*
1260 * Allocate and setup the transaction.
1261 */
1262 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1263 error = xfs_trans_reserve(tp, resblks,
1264 XFS_WRITE_LOG_RES(mp), resrtextents,
1265 XFS_TRANS_PERM_LOG_RES,
1266 XFS_WRITE_LOG_COUNT);
1267 /*
1268 * Check for running out of space
1269 */
1270 if (error) {
1271 /*
1272 * Free the transaction structure.
1273 */
1274 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1275 xfs_trans_cancel(tp, 0);
1276 break;
1277 }
1278 xfs_ilock(ip, XFS_ILOCK_EXCL);
1279 error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
1280 0, quota_flag);
1281 if (error)
1282 goto error1;
1283
1284 xfs_trans_ijoin(tp, ip, 0);
1285
1286 xfs_bmap_init(&free_list, &firstfsb);
1287 error = xfs_bmapi_write(tp, ip, startoffset_fsb,
1288 allocatesize_fsb, alloc_type, &firstfsb,
1289 0, imapp, &nimaps, &free_list);
1290 if (error) {
1291 goto error0;
1292 }
1293
1294 /*
1295 * Complete the transaction
1296 */
1297 error = xfs_bmap_finish(&tp, &free_list, &committed);
1298 if (error) {
1299 goto error0;
1300 }
1301
1302 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1303 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1304 if (error) {
1305 break;
1306 }
1307
1308 allocated_fsb = imapp->br_blockcount;
1309
1310 if (nimaps == 0) {
1311 error = XFS_ERROR(ENOSPC);
1312 break;
1313 }
1314
1315 startoffset_fsb += allocated_fsb;
1316 allocatesize_fsb -= allocated_fsb;
1317 }
1318
1319 return error;
1320
1321error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
1322 xfs_bmap_cancel(&free_list);
1323 xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
1324
1325error1: /* Just cancel transaction */
1326 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1327 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1328 return error;
1329}
1330
1331/*
1332 * Zero file bytes between startoff and endoff inclusive.
1333 * The iolock is held exclusive and no blocks are buffered.
1334 *
1335 * This function is used by xfs_free_file_space() to zero
1336 * partial blocks when the range to free is not block aligned.
1337 * When unreserving space with boundaries that are not block
1338 * aligned we round up the start and round down the end
1339 * boundaries and then use this function to zero the parts of
1340 * the blocks that got dropped during the rounding.
1341 */
1342STATIC int
1343xfs_zero_remaining_bytes(
1344 xfs_inode_t *ip,
1345 xfs_off_t startoff,
1346 xfs_off_t endoff)
1347{
1348 xfs_bmbt_irec_t imap;
1349 xfs_fileoff_t offset_fsb;
1350 xfs_off_t lastoffset;
1351 xfs_off_t offset;
1352 xfs_buf_t *bp;
1353 xfs_mount_t *mp = ip->i_mount;
1354 int nimap;
1355 int error = 0;
1356
1357 /*
1358 * Avoid doing I/O beyond eof - it's not necessary
1359 * since nothing can read beyond eof. The space will
1360 * be zeroed when the file is extended anyway.
1361 */
1362 if (startoff >= XFS_ISIZE(ip))
1363 return 0;
1364
1365 if (endoff > XFS_ISIZE(ip))
1366 endoff = XFS_ISIZE(ip);
1367
1368 bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
1369 mp->m_rtdev_targp : mp->m_ddev_targp,
1370 BTOBB(mp->m_sb.sb_blocksize), 0);
1371 if (!bp)
1372 return XFS_ERROR(ENOMEM);
1373
1374 xfs_buf_unlock(bp);
1375
1376 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
1377 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1378 nimap = 1;
1379 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
1380 if (error || nimap < 1)
1381 break;
1382 ASSERT(imap.br_blockcount >= 1);
1383 ASSERT(imap.br_startoff == offset_fsb);
1384 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
1385 if (lastoffset > endoff)
1386 lastoffset = endoff;
1387 if (imap.br_startblock == HOLESTARTBLOCK)
1388 continue;
1389 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1390 if (imap.br_state == XFS_EXT_UNWRITTEN)
1391 continue;
1392 XFS_BUF_UNDONE(bp);
1393 XFS_BUF_UNWRITE(bp);
1394 XFS_BUF_READ(bp);
1395 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
1396 xfsbdstrat(mp, bp);
1397 error = xfs_buf_iowait(bp);
1398 if (error) {
1399 xfs_buf_ioerror_alert(bp,
1400 "xfs_zero_remaining_bytes(read)");
1401 break;
1402 }
1403 memset(bp->b_addr +
1404 (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
1405 0, lastoffset - offset + 1);
1406 XFS_BUF_UNDONE(bp);
1407 XFS_BUF_UNREAD(bp);
1408 XFS_BUF_WRITE(bp);
1409 xfsbdstrat(mp, bp);
1410 error = xfs_buf_iowait(bp);
1411 if (error) {
1412 xfs_buf_ioerror_alert(bp,
1413 "xfs_zero_remaining_bytes(write)");
1414 break;
1415 }
1416 }
1417 xfs_buf_free(bp);
1418 return error;
1419}
1420
1421/*
1422 * xfs_free_file_space()
1423 * This routine frees disk space for the given file.
1424 *
1425 * This routine is only called by xfs_change_file_space
1426 * for an UNRESVSP type call.
1427 *
1428 * RETURNS:
1429 * 0 on success
1430 * errno on error
1431 *
1432 */
1433STATIC int
1434xfs_free_file_space(
1435 xfs_inode_t *ip,
1436 xfs_off_t offset,
1437 xfs_off_t len,
1438 int attr_flags)
1439{
1440 int committed;
1441 int done;
1442 xfs_fileoff_t endoffset_fsb;
1443 int error;
1444 xfs_fsblock_t firstfsb;
1445 xfs_bmap_free_t free_list;
1446 xfs_bmbt_irec_t imap;
1447 xfs_off_t ioffset;
1448 xfs_extlen_t mod=0;
1449 xfs_mount_t *mp;
1450 int nimap;
1451 uint resblks;
1452 xfs_off_t rounding;
1453 int rt;
1454 xfs_fileoff_t startoffset_fsb;
1455 xfs_trans_t *tp;
1456 int need_iolock = 1;
1457
1458 mp = ip->i_mount;
1459
1460 trace_xfs_free_file_space(ip);
1461
1462 error = xfs_qm_dqattach(ip, 0);
1463 if (error)
1464 return error;
1465
1466 error = 0;
1467 if (len <= 0) /* if nothing being freed */
1468 return error;
1469 rt = XFS_IS_REALTIME_INODE(ip);
1470 startoffset_fsb = XFS_B_TO_FSB(mp, offset);
1471 endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
1472
1473 if (attr_flags & XFS_ATTR_NOLOCK)
1474 need_iolock = 0;
1475 if (need_iolock) {
1476 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1477 /* wait for the completion of any pending DIOs */
1478 inode_dio_wait(VFS_I(ip));
1479 }
1480
1481 rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1482 ioffset = offset & ~(rounding - 1);
1483 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1484 ioffset, -1);
1485 if (error)
1486 goto out_unlock_iolock;
1487 truncate_pagecache_range(VFS_I(ip), ioffset, -1);
1488
1489 /*
1490 * Need to zero the stuff we're not freeing, on disk.
1491 * If it's a realtime file & can't use unwritten extents then we
1492 * actually need to zero the extent edges. Otherwise xfs_bunmapi
1493 * will take care of it for us.
1494 */
1495 if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
1496 nimap = 1;
1497 error = xfs_bmapi_read(ip, startoffset_fsb, 1,
1498 &imap, &nimap, 0);
1499 if (error)
1500 goto out_unlock_iolock;
1501 ASSERT(nimap == 0 || nimap == 1);
1502 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1503 xfs_daddr_t block;
1504
1505 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1506 block = imap.br_startblock;
1507 mod = do_div(block, mp->m_sb.sb_rextsize);
1508 if (mod)
1509 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
1510 }
1511 nimap = 1;
1512 error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
1513 &imap, &nimap, 0);
1514 if (error)
1515 goto out_unlock_iolock;
1516 ASSERT(nimap == 0 || nimap == 1);
1517 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1518 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1519 mod++;
1520 if (mod && (mod != mp->m_sb.sb_rextsize))
1521 endoffset_fsb -= mod;
1522 }
1523 }
1524 if ((done = (endoffset_fsb <= startoffset_fsb)))
1525 /*
1526 * One contiguous piece to clear
1527 */
1528 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
1529 else {
1530 /*
1531 * Some full blocks, possibly two pieces to clear
1532 */
1533 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
1534 error = xfs_zero_remaining_bytes(ip, offset,
1535 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
1536 if (!error &&
1537 XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
1538 error = xfs_zero_remaining_bytes(ip,
1539 XFS_FSB_TO_B(mp, endoffset_fsb),
1540 offset + len - 1);
1541 }
1542
1543 /*
1544 * free file space until done or until there is an error
1545 */
1546 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1547 while (!error && !done) {
1548
1549 /*
1550 * allocate and setup the transaction. Allow this
1551 * transaction to dip into the reserve blocks to ensure
1552 * the freeing of the space succeeds at ENOSPC.
1553 */
1554 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1555 tp->t_flags |= XFS_TRANS_RESERVE;
1556 error = xfs_trans_reserve(tp,
1557 resblks,
1558 XFS_WRITE_LOG_RES(mp),
1559 0,
1560 XFS_TRANS_PERM_LOG_RES,
1561 XFS_WRITE_LOG_COUNT);
1562
1563 /*
1564 * check for running out of space
1565 */
1566 if (error) {
1567 /*
1568 * Free the transaction structure.
1569 */
1570 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1571 xfs_trans_cancel(tp, 0);
1572 break;
1573 }
1574 xfs_ilock(ip, XFS_ILOCK_EXCL);
1575 error = xfs_trans_reserve_quota(tp, mp,
1576 ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
1577 resblks, 0, XFS_QMOPT_RES_REGBLKS);
1578 if (error)
1579 goto error1;
1580
1581 xfs_trans_ijoin(tp, ip, 0);
1582
1583 /*
1584 * issue the bunmapi() call to free the blocks
1585 */
1586 xfs_bmap_init(&free_list, &firstfsb);
1587 error = xfs_bunmapi(tp, ip, startoffset_fsb,
1588 endoffset_fsb - startoffset_fsb,
1589 0, 2, &firstfsb, &free_list, &done);
1590 if (error) {
1591 goto error0;
1592 }
1593
1594 /*
1595 * complete the transaction
1596 */
1597 error = xfs_bmap_finish(&tp, &free_list, &committed);
1598 if (error) {
1599 goto error0;
1600 }
1601
1602 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1603 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1604 }
1605
1606 out_unlock_iolock:
1607 if (need_iolock)
1608 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1609 return error;
1610
1611 error0:
1612 xfs_bmap_cancel(&free_list);
1613 error1:
1614 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1615 xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
1616 XFS_ILOCK_EXCL);
1617 return error;
1618}
1619
1620
/*
 * Zero a byte range of a file without changing its size.
 *
 * The block-aligned interior of the range is converted to unwritten
 * extents via xfs_alloc_file_space(); the unaligned edge bytes are
 * zeroed by writing zeros through xfs_iozero().  A range that doesn't
 * span a full granularity unit is handled entirely by xfs_iozero().
 * attr_flags with XFS_ATTR_NOLOCK means the caller already holds the
 * iolock.
 */
STATIC int
xfs_zero_file_space(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len,
	int			attr_flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	uint			granularity;
	xfs_off_t		start_boundary;
	xfs_off_t		end_boundary;
	int			error;

	granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);

	/*
	 * Round the range of extents we are going to convert inwards.  If the
	 * offset is aligned, then it doesn't get changed so we zero from the
	 * start of the block offset points to.
	 */
	start_boundary = round_up(offset, granularity);
	end_boundary = round_down(offset + len, granularity);

	ASSERT(start_boundary >= offset);
	ASSERT(end_boundary <= offset + len);

	if (!(attr_flags & XFS_ATTR_NOLOCK))
		xfs_ilock(ip, XFS_IOLOCK_EXCL);

	if (start_boundary < end_boundary - 1) {
		/* punch out the page cache over the conversion range */
		truncate_pagecache_range(VFS_I(ip), start_boundary,
					 end_boundary - 1);
		/* convert the blocks */
		error = xfs_alloc_file_space(ip, start_boundary,
					end_boundary - start_boundary - 1,
					XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
					attr_flags);
		if (error)
			goto out_unlock;

		/* We've handled the interior of the range, now for the edges */
		if (start_boundary != offset)
			error = xfs_iozero(ip, offset, start_boundary - offset);
		if (error)
			goto out_unlock;

		if (end_boundary != offset + len)
			error = xfs_iozero(ip, end_boundary,
					   offset + len - end_boundary);

	} else {
		/*
		 * It's either a sub-granularity range or the range spanned lies
		 * partially across two adjacent blocks.
		 */
		error = xfs_iozero(ip, offset, len);
	}

out_unlock:
	if (!(attr_flags & XFS_ATTR_NOLOCK))
		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	return error;

}
1686
1687/*
1688 * xfs_change_file_space()
1689 * This routine allocates or frees disk space for the given file.
1690 * The user specified parameters are checked for alignment and size
1691 * limitations.
1692 *
1693 * RETURNS:
1694 * 0 on success
1695 * errno on error
1696 *
1697 */
1698int
1699xfs_change_file_space(
1700 xfs_inode_t *ip,
1701 int cmd,
1702 xfs_flock64_t *bf,
1703 xfs_off_t offset,
1704 int attr_flags)
1705{
1706 xfs_mount_t *mp = ip->i_mount;
1707 int clrprealloc;
1708 int error;
1709 xfs_fsize_t fsize;
1710 int setprealloc;
1711 xfs_off_t startoffset;
1712 xfs_trans_t *tp;
1713 struct iattr iattr;
1714
1715 if (!S_ISREG(ip->i_d.di_mode))
1716 return XFS_ERROR(EINVAL);
1717
1718 switch (bf->l_whence) {
1719 case 0: /*SEEK_SET*/
1720 break;
1721 case 1: /*SEEK_CUR*/
1722 bf->l_start += offset;
1723 break;
1724 case 2: /*SEEK_END*/
1725 bf->l_start += XFS_ISIZE(ip);
1726 break;
1727 default:
1728 return XFS_ERROR(EINVAL);
1729 }
1730
1731 /*
1732 * length of <= 0 for resv/unresv/zero is invalid. length for
1733 * alloc/free is ignored completely and we have no idea what userspace
1734 * might have set it to, so set it to zero to allow range
1735 * checks to pass.
1736 */
1737 switch (cmd) {
1738 case XFS_IOC_ZERO_RANGE:
1739 case XFS_IOC_RESVSP:
1740 case XFS_IOC_RESVSP64:
1741 case XFS_IOC_UNRESVSP:
1742 case XFS_IOC_UNRESVSP64:
1743 if (bf->l_len <= 0)
1744 return XFS_ERROR(EINVAL);
1745 break;
1746 default:
1747 bf->l_len = 0;
1748 break;
1749 }
1750
1751 if (bf->l_start < 0 ||
1752 bf->l_start > mp->m_super->s_maxbytes ||
1753 bf->l_start + bf->l_len < 0 ||
1754 bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
1755 return XFS_ERROR(EINVAL);
1756
1757 bf->l_whence = 0;
1758
1759 startoffset = bf->l_start;
1760 fsize = XFS_ISIZE(ip);
1761
1762 setprealloc = clrprealloc = 0;
1763 switch (cmd) {
1764 case XFS_IOC_ZERO_RANGE:
1765 error = xfs_zero_file_space(ip, startoffset, bf->l_len,
1766 attr_flags);
1767 if (error)
1768 return error;
1769 setprealloc = 1;
1770 break;
1771
1772 case XFS_IOC_RESVSP:
1773 case XFS_IOC_RESVSP64:
1774 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
1775 XFS_BMAPI_PREALLOC, attr_flags);
1776 if (error)
1777 return error;
1778 setprealloc = 1;
1779 break;
1780
1781 case XFS_IOC_UNRESVSP:
1782 case XFS_IOC_UNRESVSP64:
1783 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
1784 attr_flags)))
1785 return error;
1786 break;
1787
1788 case XFS_IOC_ALLOCSP:
1789 case XFS_IOC_ALLOCSP64:
1790 case XFS_IOC_FREESP:
1791 case XFS_IOC_FREESP64:
1792 /*
1793 * These operations actually do IO when extending the file, but
1794 * the allocation is done seperately to the zeroing that is
1795 * done. This set of operations need to be serialised against
1796 * other IO operations, such as truncate and buffered IO. We
1797 * need to take the IOLOCK here to serialise the allocation and
1798 * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
1799 * truncate, direct IO) from racing against the transient
1800 * allocated but not written state we can have here.
1801 */
1802 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1803 if (startoffset > fsize) {
1804 error = xfs_alloc_file_space(ip, fsize,
1805 startoffset - fsize, 0,
1806 attr_flags | XFS_ATTR_NOLOCK);
1807 if (error) {
1808 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1809 break;
1810 }
1811 }
1812
1813 iattr.ia_valid = ATTR_SIZE;
1814 iattr.ia_size = startoffset;
1815
1816 error = xfs_setattr_size(ip, &iattr,
1817 attr_flags | XFS_ATTR_NOLOCK);
1818 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1819
1820 if (error)
1821 return error;
1822
1823 clrprealloc = 1;
1824 break;
1825
1826 default:
1827 ASSERT(0);
1828 return XFS_ERROR(EINVAL);
1829 }
1830
1831 /*
1832 * update the inode timestamp, mode, and prealloc flag bits
1833 */
1834 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
1835
1836 if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
1837 0, 0, 0))) {
1838 /* ASSERT(0); */
1839 xfs_trans_cancel(tp, 0);
1840 return error;
1841 }
1842
1843 xfs_ilock(ip, XFS_ILOCK_EXCL);
1844 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1845
1846 if ((attr_flags & XFS_ATTR_DMI) == 0) {
1847 ip->i_d.di_mode &= ~S_ISUID;
1848
1849 /*
1850 * Note that we don't have to worry about mandatory
1851 * file locking being disabled here because we only
1852 * clear the S_ISGID bit if the Group execute bit is
1853 * on, but if it was on then mandatory locking wouldn't
1854 * have been enabled.
1855 */
1856 if (ip->i_d.di_mode & S_IXGRP)
1857 ip->i_d.di_mode &= ~S_ISGID;
1858
1859 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1860 }
1861 if (setprealloc)
1862 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
1863 else if (clrprealloc)
1864 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
1865
1866 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1867 if (attr_flags & XFS_ATTR_SYNC)
1868 xfs_trans_set_sync(tp);
1869 return xfs_trans_commit(tp, 0);
1870}
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
deleted file mode 100644
index 38c67c34d73f..000000000000
--- a/fs/xfs/xfs_vnodeops.h
+++ /dev/null
@@ -1,55 +0,0 @@
/*
 * xfs_vnodeops.h -- prototypes for XFS's top-level inode operations
 * (lookup/create/remove/link/rename, readdir, symlink, attr get/set/list,
 * setattr) plus the XFS_ATTR_* flag bits passed to several of them.
 *
 * NOTE(review): this header is removed by this commit (the diff deletes the
 * whole file); its declarations were presumably redistributed to per-topic
 * headers as part of the kernel/userspace libxfs rework -- confirm against
 * the rest of the patch series.
 */
1#ifndef _XFS_VNODEOPS_H
2#define _XFS_VNODEOPS_H 1
3
/* Forward declarations: pointers only are used below, so the full
 * definitions need not be included here. */
4struct attrlist_cursor_kern;
5struct file;
6struct iattr;
7struct inode;
8struct iovec;
9struct kiocb;
10struct pipe_inode_info;
11struct uio;
12struct xfs_inode;
13
14
/* setattr entry points; the flags argument takes the XFS_ATTR_* bits below. */
15int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags);
16int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags);
17#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */
18#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
19#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
20#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
21#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */
22
/* Namespace and directory operations. */
23int xfs_readlink(struct xfs_inode *ip, char *link);
24int xfs_release(struct xfs_inode *ip);
25int xfs_inactive(struct xfs_inode *ip);
26int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
27 struct xfs_inode **ipp, struct xfs_name *ci_name);
28int xfs_create(struct xfs_inode *dp, struct xfs_name *name, umode_t mode,
29 xfs_dev_t rdev, struct xfs_inode **ipp);
30int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
31 struct xfs_inode *ip);
32int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
33 struct xfs_name *target_name);
34int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize);
35int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
36 const char *target_path, umode_t mode, struct xfs_inode **ipp);
37int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
38int xfs_change_file_space(struct xfs_inode *ip, int cmd,
39 xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
40int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
41 struct xfs_inode *src_ip, struct xfs_inode *target_dp,
42 struct xfs_name *target_name, struct xfs_inode *target_ip);
/* Extended attribute operations. */
43int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
44 unsigned char *value, int *valuelenp, int flags);
45int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
46 unsigned char *value, int valuelen, int flags);
47int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
48int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
49 int flags, struct attrlist_cursor_kern *cursor);
50
/* I/O-path helpers shared with the read/write code. */
51int xfs_iozero(struct xfs_inode *, loff_t, size_t);
52int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
53int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
54
55#endif /* _XFS_VNODEOPS_H */
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 87d3e03878c8..e01f35ea76ba 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -17,13 +17,13 @@
 */
 
 #include "xfs.h"
+#include "xfs_log_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_inode.h"
 #include "xfs_attr.h"
 #include "xfs_attr_leaf.h"
 #include "xfs_acl.h"
-#include "xfs_vnodeops.h"
 
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>